Fix project isolation: Make loadChatHistory respect active project sessions
- Modified loadChatHistory() to check for active project before fetching all sessions - When active project exists, use project.sessions instead of fetching from API - Added detailed console logging to debug session filtering - This prevents ALL sessions from appearing in every project's sidebar Co-Authored-By: Claude Opus 4.5 <noreply@anthropic.com>
This commit is contained in:
33
.venv/lib/python3.11/site-packages/rapidfuzz/__init__.py
Normal file
33
.venv/lib/python3.11/site-packages/rapidfuzz/__init__.py
Normal file
@@ -0,0 +1,33 @@
|
||||
"""
|
||||
rapid string matching library
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
__author__: str = "Max Bachmann"
|
||||
__license__: str = "MIT"
|
||||
__version__: str = "3.14.3"
|
||||
|
||||
from rapidfuzz import distance, fuzz, process, utils
|
||||
|
||||
__all__ = ["distance", "fuzz", "get_include", "process", "utils"]
|
||||
|
||||
|
||||
def get_include():
|
||||
"""
|
||||
Return the directory that contains the RapidFuzz \\*.h header files.
|
||||
Extension modules that need to compile against RapidFuzz should use this
|
||||
function to locate the appropriate include directory.
|
||||
Notes
|
||||
-----
|
||||
When using ``distutils``, for example in ``setup.py``.
|
||||
::
|
||||
import rapidfuzz_capi
|
||||
...
|
||||
Extension('extension_name', ...
|
||||
include_dirs=[rapidfuzz_capi.get_include()])
|
||||
...
|
||||
"""
|
||||
from pathlib import Path
|
||||
|
||||
return str(Path(__file__).parent)
|
||||
12
.venv/lib/python3.11/site-packages/rapidfuzz/__init__.pyi
Normal file
12
.venv/lib/python3.11/site-packages/rapidfuzz/__init__.pyi
Normal file
@@ -0,0 +1,12 @@
|
||||
from __future__ import annotations
|
||||
|
||||
__author__: str
|
||||
__license__: str
|
||||
__version__: str
|
||||
|
||||
from rapidfuzz import (
|
||||
distance as distance,
|
||||
fuzz as fuzz,
|
||||
process as process,
|
||||
utils as utils,
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,7 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from pathlib import Path
|
||||
|
||||
|
||||
def get_PyInstaller_tests():
|
||||
return [str(Path(__file__).parent)]
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,37 @@
|
||||
from __future__ import annotations
|
||||
|
||||
import subprocess
|
||||
|
||||
from PyInstaller import __main__ as pyi_main
|
||||
|
||||
|
||||
# Test out the package by importing it, then running functions from it.
|
||||
def test_pyi_hooksample(tmp_path):
|
||||
app_name = "userapp"
|
||||
workpath = tmp_path / "build"
|
||||
distpath = tmp_path / "dist"
|
||||
app = tmp_path / (app_name + ".py")
|
||||
app.write_text(
|
||||
"\n".join(
|
||||
[
|
||||
"import rapidfuzz",
|
||||
"from rapidfuzz.distance import metrics_py",
|
||||
"from rapidfuzz.distance import metrics_cpp",
|
||||
"rapidfuzz.distance.Levenshtein.distance('test', 'teste')",
|
||||
"metrics_py.levenshtein_distance('test', 'teste')",
|
||||
"metrics_cpp.levenshtein_distance('test', 'teste')",
|
||||
]
|
||||
)
|
||||
)
|
||||
args = [
|
||||
# Place all generated files in ``tmp_path``.
|
||||
"--workpath",
|
||||
str(workpath),
|
||||
"--distpath",
|
||||
str(distpath),
|
||||
"--specpath",
|
||||
str(tmp_path),
|
||||
str(app),
|
||||
]
|
||||
pyi_main.run(args)
|
||||
subprocess.run([str(distpath / app_name / app_name)], check=True)
|
||||
73
.venv/lib/python3.11/site-packages/rapidfuzz/_common_py.py
Normal file
73
.venv/lib/python3.11/site-packages/rapidfuzz/_common_py.py
Normal file
@@ -0,0 +1,73 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2023 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from array import array
|
||||
from collections.abc import Hashable, Sequence
|
||||
|
||||
|
||||
def conv_sequence(s: Sequence[Hashable]) -> Sequence[Hashable]:
|
||||
if isinstance(s, str):
|
||||
return [ord(x) for x in s]
|
||||
|
||||
if isinstance(s, bytes):
|
||||
return s
|
||||
|
||||
if isinstance(s, array):
|
||||
if s.typecode in ("u", "w"):
|
||||
return [ord(x) for x in s]
|
||||
|
||||
return s
|
||||
|
||||
if s is None:
|
||||
return s
|
||||
|
||||
res = []
|
||||
for elem in s:
|
||||
if isinstance(elem, str) and len(elem) == 1:
|
||||
res.append(ord(elem))
|
||||
elif isinstance(elem, int) and elem == -1:
|
||||
res.append(-1)
|
||||
else:
|
||||
res.append(hash(elem))
|
||||
|
||||
return res
|
||||
|
||||
|
||||
def conv_sequences(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> tuple[Sequence[Hashable], Sequence[Hashable]]:
|
||||
if isinstance(s1, str) and isinstance(s2, str):
|
||||
return s1, s2
|
||||
|
||||
if isinstance(s1, bytes) and isinstance(s2, bytes):
|
||||
return s1, s2
|
||||
|
||||
return conv_sequence(s1), conv_sequence(s2)
|
||||
|
||||
|
||||
def common_prefix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
|
||||
prefix_len = 0
|
||||
for ch1, ch2 in zip(s1, s2):
|
||||
if ch1 != ch2:
|
||||
break
|
||||
|
||||
prefix_len += 1
|
||||
|
||||
return prefix_len
|
||||
|
||||
|
||||
def common_suffix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> int:
|
||||
suffix_len = 0
|
||||
for ch1, ch2 in zip(reversed(s1), reversed(s2)):
|
||||
if ch1 != ch2:
|
||||
break
|
||||
|
||||
suffix_len += 1
|
||||
|
||||
return suffix_len
|
||||
|
||||
|
||||
def common_affix(s1: Sequence[Hashable], s2: Sequence[Hashable]) -> tuple[int, int]:
|
||||
prefix_len = common_prefix(s1, s2)
|
||||
suffix_len = common_suffix(s1[prefix_len:], s2[prefix_len:])
|
||||
return (prefix_len, suffix_len)
|
||||
@@ -0,0 +1,15 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
__all__ = ["AVX2", "SSE2", "supports"]
|
||||
|
||||
try:
|
||||
from rapidfuzz._feature_detector_cpp import AVX2, SSE2, supports
|
||||
except ImportError:
|
||||
SSE2 = 1
|
||||
AVX2 = 2
|
||||
|
||||
def supports(features):
|
||||
_ = features
|
||||
return False
|
||||
Binary file not shown.
85
.venv/lib/python3.11/site-packages/rapidfuzz/_utils.py
Normal file
85
.venv/lib/python3.11/site-packages/rapidfuzz/_utils.py
Normal file
@@ -0,0 +1,85 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import sys
|
||||
from math import isnan
|
||||
from typing import Any, Callable
|
||||
|
||||
pandas_NA = None
|
||||
|
||||
|
||||
def setupPandas():
|
||||
global pandas_NA # noqa: PLW0603
|
||||
if pandas_NA is None:
|
||||
pandas = sys.modules.get("pandas")
|
||||
if hasattr(pandas, "NA"):
|
||||
pandas_NA = pandas.NA
|
||||
|
||||
|
||||
setupPandas()
|
||||
|
||||
|
||||
class ScorerFlag:
|
||||
RESULT_F64 = 1 << 5
|
||||
RESULT_I64 = 1 << 6
|
||||
RESULT_SIZE_T = 1 << 7
|
||||
SYMMETRIC = 1 << 11
|
||||
|
||||
|
||||
def _get_scorer_flags_distance(**_kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"optimal_score": 0,
|
||||
"worst_score": 2**63 - 1,
|
||||
"flags": ScorerFlag.RESULT_SIZE_T | ScorerFlag.SYMMETRIC,
|
||||
}
|
||||
|
||||
|
||||
def _get_scorer_flags_similarity(**_kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"optimal_score": 2**63 - 1,
|
||||
"worst_score": 0,
|
||||
"flags": ScorerFlag.RESULT_SIZE_T | ScorerFlag.SYMMETRIC,
|
||||
}
|
||||
|
||||
|
||||
def _get_scorer_flags_normalized_distance(**_kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"optimal_score": 0,
|
||||
"worst_score": 1,
|
||||
"flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC,
|
||||
}
|
||||
|
||||
|
||||
def _get_scorer_flags_normalized_similarity(**_kwargs: Any) -> dict[str, Any]:
|
||||
return {
|
||||
"optimal_score": 1,
|
||||
"worst_score": 0,
|
||||
"flags": ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC,
|
||||
}
|
||||
|
||||
|
||||
def is_none(s: Any) -> bool:
|
||||
if s is None or s is pandas_NA:
|
||||
return True
|
||||
|
||||
return isinstance(s, float) and isnan(s)
|
||||
|
||||
|
||||
def add_scorer_attrs(func: Any, cached_scorer_call: dict[str, Callable[..., dict[str, Any]]]):
|
||||
func._RF_ScorerPy = cached_scorer_call
|
||||
# used to detect the function hasn't been wrapped afterwards
|
||||
func._RF_OriginalScorer = func
|
||||
|
||||
|
||||
default_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {"get_scorer_flags": _get_scorer_flags_distance}
|
||||
default_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
|
||||
"get_scorer_flags": _get_scorer_flags_similarity
|
||||
}
|
||||
default_normalized_distance_attribute: dict[str, Callable[..., dict[str, Any]]] = {
|
||||
"get_scorer_flags": _get_scorer_flags_normalized_distance
|
||||
}
|
||||
default_normalized_similarity_attribute: dict[str, Callable[..., dict[str, Any]]] = {
|
||||
"get_scorer_flags": _get_scorer_flags_normalized_similarity
|
||||
}
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
damerau_levenshtein_distance as distance,
|
||||
damerau_levenshtein_normalized_distance as normalized_distance,
|
||||
damerau_levenshtein_normalized_similarity as normalized_similarity,
|
||||
damerau_levenshtein_similarity as similarity,
|
||||
)
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
|
||||
@overload
|
||||
def distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@@ -0,0 +1,233 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def _damerau_levenshtein_distance_zhao(s1, s2):
|
||||
maxVal = max(len(s1), len(s2)) + 1
|
||||
last_row_id = {}
|
||||
last_row_id_get = last_row_id.get
|
||||
size = len(s2) + 2
|
||||
FR = [maxVal] * size
|
||||
R1 = [maxVal] * size
|
||||
R = list(range(size))
|
||||
R[-1] = maxVal
|
||||
|
||||
for i in range(1, len(s1) + 1):
|
||||
R, R1 = R1, R
|
||||
last_col_id = -1
|
||||
last_i2l1 = R[0]
|
||||
R[0] = i
|
||||
T = maxVal
|
||||
|
||||
for j in range(1, len(s2) + 1):
|
||||
diag = R1[j - 1] + (s1[i - 1] != s2[j - 1])
|
||||
left = R[j - 1] + 1
|
||||
up = R1[j] + 1
|
||||
temp = min(diag, left, up)
|
||||
|
||||
if s1[i - 1] == s2[j - 1]:
|
||||
last_col_id = j # last occurrence of s1_i
|
||||
FR[j] = R1[j - 2] # save H_k-1,j-2
|
||||
T = last_i2l1 # save H_i-2,l-1
|
||||
else:
|
||||
k = last_row_id_get(s2[j - 1], -1)
|
||||
l = last_col_id # noqa: E741
|
||||
|
||||
if (j - l) == 1:
|
||||
transpose = FR[j] + (i - k)
|
||||
temp = min(temp, transpose)
|
||||
elif (i - k) == 1:
|
||||
transpose = T + (j - l)
|
||||
temp = min(temp, transpose)
|
||||
|
||||
last_i2l1 = R[j]
|
||||
R[j] = temp
|
||||
|
||||
last_row_id[s1[i - 1]] = i
|
||||
|
||||
return R[len(s2)]
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the Damerau-Levenshtein distance.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
|
||||
Examples
|
||||
--------
|
||||
Find the Damerau-Levenshtein distance between two strings:
|
||||
|
||||
>>> from rapidfuzz.distance import DamerauLevenshtein
|
||||
>>> DamerauLevenshtein.distance("CA", "ABC")
|
||||
2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
dist = _damerau_levenshtein_distance_zhao(s1, s2)
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the Damerau-Levenshtein similarity in the range [max, 0].
|
||||
|
||||
This is calculated as ``max(len1, len2) - distance``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the similarity is smaller than score_cutoff,
|
||||
0 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
similarity : int
|
||||
similarity between s1 and s2
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
dist = distance(s1, s2)
|
||||
sim = maximum - dist
|
||||
return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized Damerau-Levenshtein distance in the range [1, 0].
|
||||
|
||||
This is calculated as ``distance / max(len1, len2)``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_dist : float
|
||||
normalized distance between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 1.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
dist = distance(s1, s2)
|
||||
norm_dist = dist / maximum if maximum else 0
|
||||
return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1
|
||||
|
||||
|
||||
def normalized_similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized Damerau-Levenshtein similarity in the range [0, 1].
|
||||
|
||||
This is calculated as ``1 - normalized_distance``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_sim < score_cutoff 0 is returned instead. Default is 0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_sim : float
|
||||
normalized similarity between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 0.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
norm_dist = normalized_distance(s1, s2)
|
||||
norm_sim = 1.0 - norm_dist
|
||||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0
|
||||
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Hamming.py
Normal file
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Hamming.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = [
|
||||
"distance",
|
||||
"editops",
|
||||
"normalized_distance",
|
||||
"normalized_similarity",
|
||||
"opcodes",
|
||||
"similarity",
|
||||
]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
hamming_distance as distance,
|
||||
hamming_editops as editops,
|
||||
hamming_normalized_distance as normalized_distance,
|
||||
hamming_normalized_similarity as normalized_similarity,
|
||||
hamming_opcodes as opcodes,
|
||||
hamming_similarity as similarity,
|
||||
)
|
||||
@@ -0,0 +1,113 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
from rapidfuzz.distance import Editops, Opcodes
|
||||
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
|
||||
@overload
|
||||
def distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
pad: bool = True,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def editops(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
) -> Editops: ...
|
||||
@overload
|
||||
def editops(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
) -> Editops: ...
|
||||
@overload
|
||||
def opcodes(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
) -> Opcodes: ...
|
||||
@overload
|
||||
def opcodes(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
) -> Opcodes: ...
|
||||
@@ -0,0 +1,322 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
pad=True,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the Hamming distance between two strings.
|
||||
The hamming distance is defined as the number of positions
|
||||
where the two strings differ. It describes the minimum
|
||||
amount of substitutions required to transform s1 into s2.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
pad : bool, optional
|
||||
should strings be padded if there is a length difference.
|
||||
If pad is False and strings have a different length
|
||||
a ValueError is thrown instead. Defaults is True.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int or None, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If s1 and s2 have a different length
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
|
||||
if not pad and len(s1) != len(s2):
|
||||
msg = "Sequences are not the same length."
|
||||
raise ValueError(msg)
|
||||
|
||||
min_len = min(len(s1), len(s2))
|
||||
dist = max(len(s1), len(s2))
|
||||
for i in range(min_len):
|
||||
dist -= s1[i] == s2[i]
|
||||
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Hamming similarity between two strings.

    This is calculated as ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2

    Raises
    ------
    ValueError
        If ``pad`` is False and s1 and s2 have a different length
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # similarity is the number of matching positions, i.e. the longer
    # length minus the Hamming distance
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2, pad=pad)
    sim = maximum - dist

    return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Hamming distance in the range [0, 1].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If ``pad`` is False and s1 and s2 have a different length
    """
    setupPandas()
    # missing values (None / pandas NA) are treated as maximally distant
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2, pad=pad)
    # two empty sequences are identical -> distance 0
    norm_dist = dist / maximum if maximum else 0

    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Hamming similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If ``pad`` is False and s1 and s2 have a different length
    """
    setupPandas()
    # missing values (None / pandas NA) have no similarity to anything
    if is_none(s1) or is_none(s2):
        return 0.0

    s1, s2 = conv_sequences(s1, s2)
    norm_sim = 1 - normalized_distance(s1, s2, pad=pad, processor=processor)

    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0.0
    return norm_sim
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    if not pad and len(s1) != len(s2):
        msg = "Sequences are not the same length."
        raise ValueError(msg)

    len1, len2 = len(s1), len(s2)
    overlap = min(len1, len2)

    # a replacement for every mismatch inside the common prefix length ...
    ops_list = [
        Editop("replace", pos, pos) for pos in range(overlap) if s1[pos] != s2[pos]
    ]
    # ... then deletions for the tail of s1 / insertions for the tail of s2
    ops_list.extend(Editop("delete", pos, len2) for pos in range(overlap, len1))
    ops_list.extend(Editop("insert", len1, pos) for pos in range(overlap, len2))

    # sidestep input validation of the Editops constructor
    result = Editops.__new__(Editops)
    result._src_len = len1
    result._dest_len = len2
    result._editops = ops_list
    return result
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    pad=True,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    pad : bool, optional
        should strings be padded if there is a length difference.
        If pad is False and strings have a different length
        a ValueError is thrown instead. Defaults is True.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2
    """
    # opcodes are just the grouped form of the editops
    ops = editops(s1, s2, pad=pad, processor=processor)
    return ops.as_opcodes()
|
||||
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.py
Normal file
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
"""Import-time backend dispatch for the Indel metric.

Selects an implementation once at import time, controlled by the
``RAPIDFUZZ_IMPLEMENTATION`` environment variable:

- ``"cpp"``: require a C++ backend (AVX2, then SSE2, then the generic
  C++ build; the final import is NOT wrapped in ``suppress``, so it
  raises if no C++ extension is available).
- ``"python"``: force the pure-Python backend.
- unset / anything else: best effort — try AVX2, SSE2, generic C++ in
  order, and fall back to pure Python if none import.
"""
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# NOTE: read once at import time; changing the env var later has no effect
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # prefer the AVX2 build when the CPU supports it
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    # deliberately unguarded: "cpp" was requested, so a missing C++
    # backend should raise ImportError instead of silently degrading
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            indel_distance as distance,
            indel_editops as editops,
            indel_normalized_distance as normalized_distance,
            indel_normalized_similarity as normalized_similarity,
            indel_opcodes as opcodes,
            indel_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        indel_distance as distance,
        indel_editops as editops,
        indel_normalized_distance as normalized_distance,
        indel_normalized_similarity as normalized_similarity,
        indel_opcodes as opcodes,
        indel_similarity as similarity,
    )
else:
    # default: best effort — fastest available backend, pure Python last
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                indel_distance as distance,
                indel_editops as editops,
                indel_normalized_distance as normalized_distance,
                indel_normalized_similarity as normalized_similarity,
                indel_opcodes as opcodes,
                indel_similarity as similarity,
            )

            imported = True

    # last resort: the pure-Python implementation always works
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            indel_distance as distance,
            indel_editops as editops,
            indel_normalized_distance as normalized_distance,
            indel_normalized_similarity as normalized_similarity,
            indel_opcodes as opcodes,
            indel_similarity as similarity,
        )
|
||||
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.pyi
Normal file
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Indel.pyi
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stub for the Indel metric module. Each public function has two
# overloads: one for pre-processed hashable sequences (processor=None) and
# one where a `processor` callable converts arbitrary inputs to sequences.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

from rapidfuzz.distance import Editops, Opcodes

# input types accepted when a processor converts them to sequences
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...
|
||||
@@ -0,0 +1,358 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance.LCSseq_py import (
|
||||
_block_similarity as lcs_seq_block_similarity,
|
||||
editops as lcs_seq_editops,
|
||||
opcodes as lcs_seq_opcodes,
|
||||
similarity as lcs_seq_similarity,
|
||||
)
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the minimum number of insertions and deletions
    required to change one sequence into the other. This is equivalent to the
    Levenshtein distance with a substitution weight of 2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the Indel distance between two strings:

    >>> from rapidfuzz.distance import Indel
    >>> Indel.distance("lewenstein", "levenshtein")
    3

    Setting a maximum distance allows the implementation to select
    a more efficient implementation:

    >>> Indel.distance("lewenstein", "levenshtein", score_cutoff=1)
    2

    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # every element outside the longest common subsequence must be
    # inserted or deleted exactly once
    total_len = len(s1) + len(s2)
    dist = total_len - 2 * lcs_seq_similarity(s1, s2)

    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def _block_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # Indel distance computed from a precomputed character-position block,
    # avoiding re-deriving it for repeated comparisons against s1.
    dist = (len(s1) + len(s2)) - 2 * lcs_seq_block_similarity(block, s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Indel similarity in the range [max, 0].

    This is calculated as ``(len1 + len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    total_len = len(s1) + len(s2)
    sim = total_len - distance(s1, s2)

    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized Indel distance in the range [0, 1].

    This is calculated as ``distance / (len1 + len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # missing values (None / pandas NA) are treated as maximally distant
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = len(s1) + len(s2)
    dist = distance(s1, s2)
    # two empty sequences are identical -> distance 0
    norm_dist = dist / maximum if maximum else 0
    # return a float in the cutoff branch for consistency with the other
    # normalized metrics (previously returned the int 1)
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def _block_normalized_distance(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # normalized variant of _block_distance; `block` is a precomputed
    # character-position map for s1
    total = len(s1) + len(s2)
    norm_dist = _block_distance(block, s1, s2) / total if total else 0
    if score_cutoff is not None and norm_dist > score_cutoff:
        return 1
    return norm_dist
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized indel similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Examples
    --------
    Find the normalized Indel similarity between two strings:

    >>> from rapidfuzz.distance import Indel
    >>> Indel.normalized_similarity("lewenstein", "levenshtein")
    0.8571428571428572

    Setting a score_cutoff allows the implementation to select
    a more efficient implementation:

    >>> Indel.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
    0.0

    When a different processor is used s1 and s2 do not have to be strings

    >>> Indel.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
    0.8571428571428572
    """
    setupPandas()
    # missing values (None / pandas NA) have no similarity to anything
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    norm_dist = normalized_distance(s1, s2)
    norm_sim = 1.0 - norm_dist
    # return a float in the cutoff branch for consistency with the other
    # normalized metrics (previously returned the int 0)
    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
|
||||
|
||||
def _block_normalized_similarity(
    block,
    s1,
    s2,
    score_cutoff=None,
):
    # normalized similarity from a precomputed block: complement of the
    # block-based normalized distance
    norm_sim = 1.0 - _block_normalized_distance(block, s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Indel
    >>> for tag, src_pos, dest_pos in Indel.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
     delete s1[0] s2[0]
     delete s1[3] s2[2]
     insert s1[4] s2[2]
     insert s1[6] s2[5]
    """
    # the Indel alignment is exactly the LCS-sequence alignment
    # (substitutions cost 2, so the optimal path never substitutes)
    return lcs_seq_editops(s1, s2, processor=processor)
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    processor=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Indel

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in Indel.opcodes(a, b):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
     delete a[0:1] (q) b[0:0] ()
      equal a[1:3] (ab) b[0:2] (ab)
     delete a[3:4] (x) b[2:2] ()
     insert a[4:4] () b[2:3] (y)
      equal a[4:6] (cd) b[3:5] (cd)
     insert a[6:6] () b[5:6] (f)
    """
    # the Indel alignment is exactly the LCS-sequence alignment
    # (substitutions cost 2, so the optimal path never substitutes)
    return lcs_seq_opcodes(s1, s2, processor=processor)
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
"""Import-time backend dispatch for the Jaro metric.

Selects an implementation once at import time, controlled by the
``RAPIDFUZZ_IMPLEMENTATION`` environment variable:

- ``"cpp"``: require a C++ backend (AVX2, then SSE2, then the generic
  C++ build; the final import is NOT wrapped in ``suppress``, so it
  raises if no C++ extension is available).
- ``"python"``: force the pure-Python backend.
- unset / anything else: best effort — try AVX2, SSE2, generic C++ in
  order, and fall back to pure Python if none import.
"""
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# NOTE: read once at import time; changing the env var later has no effect
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # prefer the AVX2 build when the CPU supports it
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    # deliberately unguarded: "cpp" was requested, so a missing C++
    # backend should raise ImportError instead of silently degrading
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            jaro_distance as distance,
            jaro_normalized_distance as normalized_distance,
            jaro_normalized_similarity as normalized_similarity,
            jaro_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        jaro_distance as distance,
        jaro_normalized_distance as normalized_distance,
        jaro_normalized_similarity as normalized_similarity,
        jaro_similarity as similarity,
    )
else:
    # default: best effort — fastest available backend, pure Python last
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                jaro_distance as distance,
                jaro_normalized_distance as normalized_distance,
                jaro_normalized_similarity as normalized_similarity,
                jaro_similarity as similarity,
            )

            imported = True

    # last resort: the pure-Python implementation always works
    if not imported:
        from rapidfuzz.distance.metrics_py import (
            jaro_distance as distance,
            jaro_normalized_distance as normalized_distance,
            jaro_normalized_similarity as normalized_similarity,
            jaro_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stub for the Jaro metric module. Each public function has two
# overloads: one for pre-processed hashable sequences (processor=None) and
# one where a `processor` callable converts arbitrary inputs to sequences.
# All Jaro scores are floats in [0, 1].

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

# input types accepted when a processor converts them to sequences
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Select the Jaro-Winkler implementation at import time, re-exporting the
# chosen backend's functions under the public names in __all__:
#   RAPIDFUZZ_IMPLEMENTATION=cpp    -> force the C++ extension (AVX2, then
#                                      SSE2, then generic); the final import
#                                      is NOT suppressed, so a missing
#                                      extension raises ImportError.
#   RAPIDFUZZ_IMPLEMENTATION=python -> force the pure-Python fallback.
#   unset                           -> best-effort C++ (all imports
#                                      suppressed), pure Python as fallback.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported:
        # Deliberately unsuppressed: in forced cpp mode a missing extension
        # is an error the caller should see.
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            jaro_winkler_distance as distance,
            jaro_winkler_normalized_distance as normalized_distance,
            jaro_winkler_normalized_similarity as normalized_similarity,
            jaro_winkler_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        jaro_winkler_distance as distance,
        jaro_winkler_normalized_distance as normalized_distance,
        jaro_winkler_normalized_similarity as normalized_similarity,
        jaro_winkler_similarity as similarity,
    )
else:
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                jaro_winkler_distance as distance,
                jaro_winkler_normalized_distance as normalized_distance,
                jaro_winkler_normalized_similarity as normalized_similarity,
                jaro_winkler_similarity as similarity,
            )

            imported = True

    if not imported:
        # Pure-Python fallback: always available, no native extension needed.
        from rapidfuzz.distance.metrics_py import (
            jaro_winkler_distance as distance,
            jaro_winkler_normalized_distance as normalized_distance,
            jaro_winkler_normalized_similarity as normalized_similarity,
            jaro_winkler_similarity as similarity,
        )
|
||||
@@ -0,0 +1,83 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Type stubs for the Jaro-Winkler metric. Every function has two overloads:
# one for inputs that are already hashable sequences (processor=None) and
# one where a processor callable converts each raw input into a sequence
# before comparison.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

# Placeholder types for raw inputs that still need preprocessing.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    prefix_weight: float = 0.1,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    prefix_weight: float = 0.1,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
@@ -0,0 +1,235 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance import Jaro_py as Jaro
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Has to be in the range 0.0 - 1.0 (values outside raise ValueError,
        see validation below). Default is 0.1.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    setupPandas()
    # Missing values (None / pandas NA) compare as completely dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    if prefix_weight > 1.0 or prefix_weight < 0.0:
        msg = "prefix_weight has to be in the range 0.0 - 1.0"
        raise ValueError(msg)

    s1, s2 = conv_sequences(s1, s2)
    P_len = len(s1)
    T_len = len(s2)
    min_len = min(P_len, T_len)
    prefix = 0
    # Winkler's boost considers at most the first 4 common characters.
    max_prefix = min(min_len, 4)

    for _ in range(max_prefix):
        if s1[prefix] != s2[prefix]:
            break
        prefix += 1

    # Translate score_cutoff into a cutoff for the underlying Jaro score:
    # the prefix boost is only applied above 0.7, so invert the boost
    # formula to find the smallest Jaro score that could still reach
    # score_cutoff after boosting.
    jaro_score_cutoff = score_cutoff
    if jaro_score_cutoff > 0.7:
        prefix_sim = prefix * prefix_weight

        if prefix_sim >= 1.0:
            jaro_score_cutoff = 0.7
        else:
            jaro_score_cutoff = max(0.7, (prefix_sim - jaro_score_cutoff) / (prefix_sim - 1.0))

    Sim = Jaro.similarity(s1, s2, score_cutoff=jaro_score_cutoff)
    # Apply the prefix boost only to sufficiently similar pairs (> 0.7),
    # clamping the boosted score to 1.0.
    if Sim > 0.7:
        Sim += prefix * prefix_weight * (1.0 - Sim)
        Sim = min(Sim, 1.0)

    return Sim if Sim >= score_cutoff else 0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler similarity.

    The Jaro-Winkler similarity is already normalized to the range
    [0.0, 1.0], so this simply delegates to :func:`similarity`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Values outside the range 0.0 - 1.0 raise a ValueError.
        Default is 0.1.
    processor: callable, optional
        Optional callable used to preprocess both strings before the
        comparison. Default is None (no preprocessing).
    score_cutoff : float, optional
        Score threshold between 0 and 1.0; results below it are reported
        as 0 instead. Default is None (no threshold).

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    return similarity(
        s1, s2, prefix_weight=prefix_weight, processor=processor, score_cutoff=score_cutoff
    )
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro winkler distance.

    Defined as ``1.0 - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Values outside the range 0.0 - 1.0 raise a ValueError.
        Default is 0.1.
    processor: callable, optional
        Optional callable used to preprocess both strings before the
        comparison. Default is None (no preprocessing).
    score_cutoff : float, optional
        Distance threshold between 0 and 1.0; results above it are
        reported as 1.0 instead. Default is None (no threshold).

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    setupPandas()
    # Missing values (None / pandas NA) are maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    # A distance cutoff of x corresponds to a similarity cutoff of 1 - x;
    # cutoffs above 1.0 can always be met, so no similarity cutoff is used.
    if score_cutoff is None or score_cutoff > 1.0:
        sim_cutoff = None
    else:
        sim_cutoff = 1.0 - score_cutoff

    result = 1.0 - similarity(s1, s2, prefix_weight=prefix_weight, score_cutoff=sim_cutoff)
    if score_cutoff is not None and result > score_cutoff:
        return 1.0
    return result
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    prefix_weight=0.1,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro winkler distance.

    The Jaro-Winkler distance is already normalized to the range
    [0.0, 1.0], so this simply delegates to :func:`distance`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    prefix_weight : float, optional
        Weight used for the common prefix of the two strings.
        Values outside the range 0.0 - 1.0 raise a ValueError.
        Default is 0.1.
    processor: callable, optional
        Optional callable used to preprocess both strings before the
        comparison. Default is None (no preprocessing).
    score_cutoff : float, optional
        Distance threshold between 0 and 1.0; results above it are
        reported as 1.0 instead. Default is None (no threshold).

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0

    Raises
    ------
    ValueError
        If prefix_weight is invalid
    """
    return distance(
        s1, s2, prefix_weight=prefix_weight, processor=processor, score_cutoff=score_cutoff
    )
|
||||
255
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Jaro_py.py
Normal file
255
.venv/lib/python3.11/site-packages/rapidfuzz/distance/Jaro_py.py
Normal file
@@ -0,0 +1,255 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def _jaro_calculate_similarity(pattern_len, text_len, common_chars, transpositions):
|
||||
transpositions //= 2
|
||||
sim = 0.0
|
||||
sim += common_chars / pattern_len
|
||||
sim += common_chars / text_len
|
||||
sim += (common_chars - transpositions) / common_chars
|
||||
return sim / 3.0
|
||||
|
||||
|
||||
def _jaro_length_filter(pattern_len, text_len, score_cutoff):
    """Return True when ``score_cutoff`` is still reachable given the lengths.

    The best case assumes every character of the shorter sequence matches
    in order, so the similarity computed from the lengths alone is an
    upper bound on the real score.
    """
    if pattern_len == 0 or text_len == 0:
        return False

    best_common = min(pattern_len, text_len)
    upper_bound = _jaro_calculate_similarity(pattern_len, text_len, best_common, 0)
    return upper_bound >= score_cutoff
|
||||
|
||||
|
||||
def _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
    """Return True when ``score_cutoff`` is reachable given the match count.

    Assumes the best case of zero transpositions, which yields an upper
    bound on the final similarity.
    """
    if common_chars == 0:
        return False

    upper_bound = _jaro_calculate_similarity(pattern_len, text_len, common_chars, 0)
    return upper_bound >= score_cutoff
|
||||
|
||||
|
||||
def _jaro_bounds(s1, s2):
|
||||
"""
|
||||
find bounds and skip out of bound parts of the sequences
|
||||
"""
|
||||
pattern_len = len(s1)
|
||||
text_len = len(s2)
|
||||
|
||||
# since jaro uses a sliding window some parts of T/P might never be in
|
||||
# range an can be removed ahead of time
|
||||
bound = 0
|
||||
if text_len > pattern_len:
|
||||
bound = text_len // 2 - 1
|
||||
if text_len > pattern_len + bound:
|
||||
s2 = s2[: pattern_len + bound]
|
||||
else:
|
||||
bound = pattern_len // 2 - 1
|
||||
if pattern_len > text_len + bound:
|
||||
s1 = s1[: text_len + bound]
|
||||
return s1, s2, bound
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro similarity

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For ratio < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # Missing values (None / pandas NA) compare as completely dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # Two empty sequences are defined as identical.
    if not s1 and not s2:
        return 1.0

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)
    pattern_len = len(s1)
    text_len = len(s2)

    # short circuit if score_cutoff can not be reached
    if not _jaro_length_filter(pattern_len, text_len, score_cutoff):
        return 0

    if pattern_len == 1 and text_len == 1:
        return float(s1[0] == s2[0])

    # Trim characters that can never fall inside the matching window.
    # NOTE: pattern_len / text_len keep the untrimmed lengths; the final
    # similarity below is computed against those.
    s1, s2, bound = _jaro_bounds(s1, s2)

    # Flags sized to the untrimmed lengths; positions beyond the trimmed
    # sequences simply stay False.
    s1_flags = [False] * pattern_len
    s2_flags = [False] * text_len

    # todo use bitparallel implementation
    # looking only within search range, count & flag matched pairs
    common_chars = 0
    for i, s1_ch in enumerate(s1):
        low = max(0, i - bound)
        hi = min(i + bound, text_len - 1)
        for j in range(low, hi + 1):
            if not s2_flags[j] and s2[j] == s1_ch:
                s1_flags[i] = s2_flags[j] = True
                common_chars += 1
                break

    # short circuit if score_cutoff can not be reached
    if not _jaro_common_char_filter(pattern_len, text_len, common_chars, score_cutoff):
        return 0

    # todo use bitparallel implementation
    # count transpositions
    # trans_count is the number of matched characters paired with a
    # different character; _jaro_calculate_similarity halves it to get
    # the number of transpositions.
    k = trans_count = 0
    for i, s1_f in enumerate(s1_flags):
        if s1_f:
            for j in range(k, text_len):
                if s2_flags[j]:
                    k = j + 1
                    break
            if s1[i] != s2[j]:
                trans_count += 1

    return _jaro_calculate_similarity(pattern_len, text_len, common_chars, trans_count)
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro similarity.

    The Jaro similarity is already normalized to the range [0.0, 1.0],
    so this simply delegates to :func:`similarity`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before the
        comparison. Default is None (no preprocessing).
    score_cutoff : float, optional
        Score threshold between 0 and 1.0; results below it are reported
        as 0 instead. Default is None (no threshold).

    Returns
    -------
    normalized similarity : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    return similarity(s1, s2, processor=processor, score_cutoff=score_cutoff)
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the jaro distance.

    Defined as ``1.0 - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before the
        comparison. Default is None (no preprocessing).
    score_cutoff : float, optional
        Distance threshold between 0 and 1.0; results above it are
        reported as 1.0 instead. Default is None (no threshold).

    Returns
    -------
    distance : float
        distance between s1 and s2 as a float between 1.0 and 0.0
    """
    setupPandas()
    # Missing values (None / pandas NA) are maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    # A distance cutoff of x corresponds to a similarity cutoff of 1 - x;
    # cutoffs above 1.0 can always be met, so no similarity cutoff is used.
    if score_cutoff is None or score_cutoff > 1.0:
        sim_cutoff = None
    else:
        sim_cutoff = 1.0 - score_cutoff

    result = 1.0 - similarity(s1, s2, score_cutoff=sim_cutoff)
    if score_cutoff is not None and result > score_cutoff:
        return 1.0
    return result
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized jaro distance.

    The Jaro distance is already normalized to the range [0.0, 1.0],
    so this simply delegates to :func:`distance`.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both strings before the
        comparison. Default is None (no preprocessing).
    score_cutoff : float, optional
        Distance threshold between 0 and 1.0; results above it are
        reported as 1.0 instead. Default is None (no threshold).

    Returns
    -------
    normalized distance : float
        normalized distance between s1 and s2 as a float between 1.0 and 0.0
    """
    return distance(s1, s2, processor=processor, score_cutoff=score_cutoff)
|
||||
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.py
Normal file
116
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.py
Normal file
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = [
    "distance",
    "editops",
    "normalized_distance",
    "normalized_similarity",
    "opcodes",
    "similarity",
]

# Select the LCSseq implementation at import time, re-exporting the chosen
# backend's functions under the public names in __all__:
#   RAPIDFUZZ_IMPLEMENTATION=cpp    -> force the C++ extension (AVX2, then
#                                      SSE2, then generic); the final import
#                                      is NOT suppressed, so a missing
#                                      extension raises ImportError.
#   RAPIDFUZZ_IMPLEMENTATION=python -> force the pure-Python fallback.
#   unset                           -> best-effort C++ (all imports
#                                      suppressed), pure Python as fallback.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported:
        # Deliberately unsuppressed: in forced cpp mode a missing extension
        # is an error the caller should see.
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            lcs_seq_distance as distance,
            lcs_seq_editops as editops,
            lcs_seq_normalized_distance as normalized_distance,
            lcs_seq_normalized_similarity as normalized_similarity,
            lcs_seq_opcodes as opcodes,
            lcs_seq_similarity as similarity,
        )
elif _impl == "python":
    from rapidfuzz.distance.metrics_py import (
        lcs_seq_distance as distance,
        lcs_seq_editops as editops,
        lcs_seq_normalized_distance as normalized_distance,
        lcs_seq_normalized_similarity as normalized_similarity,
        lcs_seq_opcodes as opcodes,
        lcs_seq_similarity as similarity,
    )
else:
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                lcs_seq_distance as distance,
                lcs_seq_editops as editops,
                lcs_seq_normalized_distance as normalized_distance,
                lcs_seq_normalized_similarity as normalized_similarity,
                lcs_seq_opcodes as opcodes,
                lcs_seq_similarity as similarity,
            )

            imported = True

    if not imported:
        # Pure-Python fallback: always available, no native extension needed.
        from rapidfuzz.distance.metrics_py import (
            lcs_seq_distance as distance,
            lcs_seq_editops as editops,
            lcs_seq_normalized_distance as normalized_distance,
            lcs_seq_normalized_similarity as normalized_similarity,
            lcs_seq_opcodes as opcodes,
            lcs_seq_similarity as similarity,
        )
||||
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.pyi
Normal file
105
.venv/lib/python3.11/site-packages/rapidfuzz/distance/LCSseq.pyi
Normal file
@@ -0,0 +1,105 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Type stubs for the LCSseq (longest common subsequence) metric. Every
# function has two overloads: one for inputs that are already hashable
# sequences (processor=None) and one where a processor callable converts
# each raw input into a sequence before comparison.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

from rapidfuzz.distance import Editops, Opcodes

# Placeholder types for raw inputs that still need preprocessing.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def editops(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Editops: ...
@overload
def editops(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Editops: ...
@overload
def opcodes(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
) -> Opcodes: ...
@overload
def opcodes(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
) -> Opcodes: ...
|
||||
@@ -0,0 +1,426 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import common_affix, conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the length of the longest common subsequence.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable used to preprocess both sequences before the
        comparison. Default is None (no preprocessing).
    score_cutoff : int, optional
        Minimum similarity that counts as a result; smaller values are
        reported as 0 instead. Default is None (no threshold).

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    # The bit trickery below breaks for an empty first sequence, whose LCS
    # length is 0 anyway.
    if not s1:
        return 0

    s1, s2 = conv_sequences(s1, s2)

    # Bit-parallel LCS: one bitmask per distinct symbol of s1 marking the
    # positions where it occurs.
    masks = {}
    for bit, ch in enumerate(s1):
        masks[ch] = masks.get(ch, 0) | (1 << bit)

    state = (1 << len(s1)) - 1
    lookup = masks.get
    for ch in s2:
        matched = state & lookup(ch, 0)
        state = (state + matched) | (state - matched)

    # Cleared bits of the final state correspond to matched positions,
    # i.e. this is popcount(~state) restricted to len(s1) bits.
    lcs_len = bin(state)[-len(s1) :].count("0")
    if score_cutoff is not None and lcs_len < score_cutoff:
        return 0
    return lcs_len
|
||||
|
||||
|
||||
def _block_similarity(
|
||||
block,
|
||||
s1,
|
||||
s2,
|
||||
score_cutoff=None,
|
||||
):
|
||||
if not s1:
|
||||
return 0
|
||||
|
||||
S = (1 << len(s1)) - 1
|
||||
block_get = block.get
|
||||
|
||||
for ch2 in s2:
|
||||
Matches = block_get(ch2, 0)
|
||||
u = S & Matches
|
||||
S = (S + u) | (S - u)
|
||||
|
||||
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
|
||||
res = bin(S)[-len(s1) :].count("0")
|
||||
return res if (score_cutoff is None or res >= score_cutoff) else 0
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates the LCS distance in the range [0, max].
|
||||
|
||||
This is calculated as ``max(len1, len2) - similarity``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
|
||||
Examples
|
||||
--------
|
||||
Find the LCS distance between two strings:
|
||||
|
||||
>>> from rapidfuzz.distance import LCSseq
|
||||
>>> LCSseq.distance("lewenstein", "levenshtein")
|
||||
2
|
||||
|
||||
Setting a maximum distance allows the implementation to select
|
||||
a more efficient implementation:
|
||||
|
||||
>>> LCSseq.distance("lewenstein", "levenshtein", score_cutoff=1)
|
||||
2
|
||||
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
sim = similarity(s1, s2)
|
||||
dist = maximum - sim
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def normalized_distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized LCS similarity in the range [1, 0].
|
||||
|
||||
This is calculated as ``distance / max(len1, len2)``.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_dist : float
|
||||
normalized distance between s1 and s2 as a float between 0 and 1.0
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 1.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
if not s1 or not s2:
|
||||
return 0
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
maximum = max(len(s1), len(s2))
|
||||
norm_sim = distance(s1, s2) / maximum
|
||||
return norm_sim if (score_cutoff is None or norm_sim <= score_cutoff) else 1
|
||||
|
||||
|
||||
def normalized_similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized LCS similarity in the range [0, 1].
|
||||
|
||||
This is calculated as ``1 - normalized_distance``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_sim < score_cutoff 0 is returned instead. Default is 0,
|
||||
which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_sim : float
|
||||
normalized similarity between s1 and s2 as a float between 0 and 1.0
|
||||
|
||||
Examples
|
||||
--------
|
||||
Find the normalized LCS similarity between two strings:
|
||||
|
||||
>>> from rapidfuzz.distance import LCSseq
|
||||
>>> LCSseq.normalized_similarity("lewenstein", "levenshtein")
|
||||
0.8181818181818181
|
||||
|
||||
Setting a score_cutoff allows the implementation to select
|
||||
a more efficient implementation:
|
||||
|
||||
>>> LCSseq.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.9)
|
||||
0.0
|
||||
|
||||
When a different processor is used s1 and s2 do not have to be strings
|
||||
|
||||
>>> LCSseq.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
|
||||
0.81818181818181
|
||||
"""
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 0.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
norm_sim = 1.0 - normalized_distance(s1, s2)
|
||||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def _matrix(s1, s2):
|
||||
if not s1:
|
||||
return (0, [])
|
||||
|
||||
S = (1 << len(s1)) - 1
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
matrix = []
|
||||
for ch2 in s2:
|
||||
Matches = block_get(ch2, 0)
|
||||
u = S & Matches
|
||||
S = (S + u) | (S - u)
|
||||
matrix.append(S)
|
||||
|
||||
# calculate the equivalent of popcount(~S) in C. This breaks for len(s1) == 0
|
||||
sim = bin(S)[-len(s1) :].count("0")
|
||||
return (sim, matrix)
|
||||
|
||||
|
||||
def editops(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
):
|
||||
"""
|
||||
Return Editops describing how to turn s1 into s2.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
editops : Editops
|
||||
edit operations required to turn s1 into s2
|
||||
|
||||
Notes
|
||||
-----
|
||||
The alignment is calculated using an algorithm of Heikki Hyyrö, which is
|
||||
described in [6]_. It has a time complexity and memory usage of ``O([N/64] * M)``.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [6] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
|
||||
Stringology (2004).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from rapidfuzz.distance import LCSseq
|
||||
>>> for tag, src_pos, dest_pos in LCSseq.editops("qabxcd", "abycdf"):
|
||||
... print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
|
||||
delete s1[0] s2[0]
|
||||
delete s1[3] s2[2]
|
||||
insert s1[4] s2[2]
|
||||
insert s1[6] s2[5]
|
||||
"""
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
prefix_len, suffix_len = common_affix(s1, s2)
|
||||
s1 = s1[prefix_len : len(s1) - suffix_len]
|
||||
s2 = s2[prefix_len : len(s2) - suffix_len]
|
||||
sim, matrix = _matrix(s1, s2)
|
||||
|
||||
editops = Editops([], 0, 0)
|
||||
editops._src_len = len(s1) + prefix_len + suffix_len
|
||||
editops._dest_len = len(s2) + prefix_len + suffix_len
|
||||
|
||||
dist = len(s1) + len(s2) - 2 * sim
|
||||
if dist == 0:
|
||||
return editops
|
||||
|
||||
editop_list = [None] * dist
|
||||
col = len(s1)
|
||||
row = len(s2)
|
||||
while row != 0 and col != 0:
|
||||
# deletion
|
||||
if matrix[row - 1] & (1 << (col - 1)):
|
||||
dist -= 1
|
||||
col -= 1
|
||||
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
|
||||
else:
|
||||
row -= 1
|
||||
|
||||
# insertion
|
||||
if row and not (matrix[row - 1] & (1 << (col - 1))):
|
||||
dist -= 1
|
||||
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
|
||||
# match
|
||||
else:
|
||||
col -= 1
|
||||
|
||||
while col != 0:
|
||||
dist -= 1
|
||||
col -= 1
|
||||
editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
|
||||
|
||||
while row != 0:
|
||||
dist -= 1
|
||||
row -= 1
|
||||
editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
|
||||
|
||||
editops._editops = editop_list
|
||||
return editops
|
||||
|
||||
|
||||
def opcodes(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
processor=None,
|
||||
):
|
||||
"""
|
||||
Return Opcodes describing how to turn s1 into s2.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
processor: callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
opcodes : Opcodes
|
||||
edit operations required to turn s1 into s2
|
||||
|
||||
Notes
|
||||
-----
|
||||
The alignment is calculated using an algorithm of Heikki Hyyrö, which is
|
||||
described in [7]_. It has a time complexity and memory usage of ``O([N/64] * M)``.
|
||||
|
||||
References
|
||||
----------
|
||||
.. [7] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
|
||||
Stringology (2004).
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from rapidfuzz.distance import LCSseq
|
||||
|
||||
>>> a = "qabxcd"
|
||||
>>> b = "abycdf"
|
||||
>>> for tag, i1, i2, j1, j2 in LCSseq.opcodes(a, b):
|
||||
... print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
|
||||
... (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
|
||||
delete a[0:1] (q) b[0:0] ()
|
||||
equal a[1:3] (ab) b[0:2] (ab)
|
||||
delete a[3:4] (x) b[2:2] ()
|
||||
insert a[4:4] () b[2:3] (y)
|
||||
equal a[4:6] (cd) b[3:5] (cd)
|
||||
insert a[6:6] () b[5:6] (f)
|
||||
"""
|
||||
return editops(s1, s2, processor=processor).as_opcodes()
|
||||
@@ -0,0 +1,116 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = [
|
||||
"distance",
|
||||
"editops",
|
||||
"normalized_distance",
|
||||
"normalized_similarity",
|
||||
"opcodes",
|
||||
"similarity",
|
||||
]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_avx2 import ( # pyright: ignore[reportMissingImports]
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp_sse2 import ( # pyright: ignore[reportMissingImports]
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.distance.metrics_cpp import ( # pyright: ignore[reportMissingImports]
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.distance.metrics_py import (
|
||||
levenshtein_distance as distance,
|
||||
levenshtein_editops as editops,
|
||||
levenshtein_normalized_distance as normalized_distance,
|
||||
levenshtein_normalized_similarity as normalized_similarity,
|
||||
levenshtein_opcodes as opcodes,
|
||||
levenshtein_similarity as similarity,
|
||||
)
|
||||
@@ -0,0 +1,131 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
"""
|
||||
The Levenshtein (edit) distance is a string metric to measure the
|
||||
difference between two strings/sequences s1 and s2.
|
||||
It's defined as the minimum number of insertions, deletions or
|
||||
substitutions required to transform s1 into s2.
|
||||
"""
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
from rapidfuzz.distance import Editops, Opcodes
|
||||
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
|
||||
@overload
|
||||
def distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
score_hint: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
score_hint: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
score_hint: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
score_hint: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
score_hint: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
score_hint: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
score_hint: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
weights: tuple[int, int, int] | None = (1, 1, 1),
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
score_hint: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def editops(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_hint: int | None = None,
|
||||
) -> Editops: ...
|
||||
@overload
|
||||
def editops(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_hint: int | None = None,
|
||||
) -> Editops: ...
|
||||
@overload
|
||||
def opcodes(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_hint: int | None = None,
|
||||
) -> Opcodes: ...
|
||||
@overload
|
||||
def opcodes(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_hint: int | None = None,
|
||||
) -> Opcodes: ...
|
||||
@@ -0,0 +1,571 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import common_affix, conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
from rapidfuzz.distance import Indel_py as Indel
|
||||
from rapidfuzz.distance._initialize_py import Editop, Editops
|
||||
|
||||
|
||||
def _levenshtein_maximum(s1, s2, weights):
|
||||
len1 = len(s1)
|
||||
len2 = len(s2)
|
||||
insert, delete, replace = weights
|
||||
|
||||
max_dist = len1 * delete + len2 * insert
|
||||
|
||||
if len1 >= len2:
|
||||
max_dist = min(max_dist, len2 * replace + (len1 - len2) * delete)
|
||||
else:
|
||||
max_dist = min(max_dist, len1 * replace + (len2 - len1) * insert)
|
||||
|
||||
return max_dist
|
||||
|
||||
|
||||
def _uniform_generic(s1, s2, weights):
|
||||
len1 = len(s1)
|
||||
insert, delete, replace = weights
|
||||
cache = list(range(0, (len1 + 1) * delete, delete))
|
||||
|
||||
for ch2 in s2:
|
||||
temp = cache[0]
|
||||
cache[0] += insert
|
||||
for i in range(len1):
|
||||
x = temp
|
||||
if s1[i] != ch2:
|
||||
x = min(cache[i] + delete, cache[i + 1] + insert, temp + replace)
|
||||
temp = cache[i + 1]
|
||||
cache[i + 1] = x
|
||||
|
||||
return cache[-1]
|
||||
|
||||
|
||||
def _uniform_distance(s1, s2):
|
||||
if not s1:
|
||||
return len(s2)
|
||||
|
||||
VP = (1 << len(s1)) - 1
|
||||
VN = 0
|
||||
currDist = len(s1)
|
||||
mask = 1 << (len(s1) - 1)
|
||||
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
for ch2 in s2:
|
||||
# Step 1: Computing D0
|
||||
PM_j = block_get(ch2, 0)
|
||||
X = PM_j
|
||||
D0 = (((X & VP) + VP) ^ VP) | X | VN
|
||||
# Step 2: Computing HP and HN
|
||||
HP = VN | ~(D0 | VP)
|
||||
HN = D0 & VP
|
||||
# Step 3: Computing the value D[m,j]
|
||||
currDist += (HP & mask) != 0
|
||||
currDist -= (HN & mask) != 0
|
||||
# Step 4: Computing Vp and VN
|
||||
HP = (HP << 1) | 1
|
||||
HN = HN << 1
|
||||
VP = HN | ~(D0 | HP)
|
||||
VN = HP & D0
|
||||
|
||||
return currDist
|
||||
|
||||
|
||||
def distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
weights=(1, 1, 1),
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
):
|
||||
"""
|
||||
Calculates the minimum number of insertions, deletions, and substitutions
|
||||
required to change one sequence into the other according to Levenshtein with custom
|
||||
costs for insertion, deletion and substitution
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
weights : tuple[int, int, int] or None, optional
|
||||
The weights for the three operations in the form
|
||||
(insertion, deletion, substitution). Default is (1, 1, 1),
|
||||
which gives all three operations a weight of 1.
|
||||
processor : callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the distance is bigger than score_cutoff,
|
||||
score_cutoff + 1 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
score_hint : int, optional
|
||||
Expected distance between s1 and s2. This is used to select a
|
||||
faster implementation. Default is None, which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
distance : int
|
||||
distance between s1 and s2
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If unsupported weights are provided a ValueError is thrown
|
||||
|
||||
Examples
|
||||
--------
|
||||
Find the Levenshtein distance between two strings:
|
||||
|
||||
>>> from rapidfuzz.distance import Levenshtein
|
||||
>>> Levenshtein.distance("lewenstein", "levenshtein")
|
||||
2
|
||||
|
||||
Setting a maximum distance allows the implementation to select
|
||||
a more efficient implementation:
|
||||
|
||||
>>> Levenshtein.distance("lewenstein", "levenshtein", score_cutoff=1)
|
||||
2
|
||||
|
||||
It is possible to select different weights by passing a `weight`
|
||||
tuple.
|
||||
|
||||
>>> Levenshtein.distance("lewenstein", "levenshtein", weights=(1,1,2))
|
||||
3
|
||||
"""
|
||||
_ = score_hint
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
if weights is None or weights == (1, 1, 1):
|
||||
dist = _uniform_distance(s1, s2)
|
||||
elif weights == (1, 1, 2):
|
||||
dist = Indel.distance(s1, s2)
|
||||
else:
|
||||
dist = _uniform_generic(s1, s2, weights)
|
||||
|
||||
return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
weights=(1, 1, 1),
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
):
|
||||
"""
|
||||
Calculates the levenshtein similarity in the range [max, 0] using custom
|
||||
costs for insertion, deletion and substitution.
|
||||
|
||||
This is calculated as ``max - distance``, where max is the maximal possible
|
||||
Levenshtein distance given the lengths of the sequences s1/s2 and the weights.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
weights : tuple[int, int, int] or None, optional
|
||||
The weights for the three operations in the form
|
||||
(insertion, deletion, substitution). Default is (1, 1, 1),
|
||||
which gives all three operations a weight of 1.
|
||||
processor : callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : int, optional
|
||||
Maximum distance between s1 and s2, that is
|
||||
considered as a result. If the similarity is smaller than score_cutoff,
|
||||
0 is returned instead. Default is None, which deactivates
|
||||
this behaviour.
|
||||
score_hint : int, optional
|
||||
Expected similarity between s1 and s2. This is used to select a
|
||||
faster implementation. Default is None, which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
similarity : int
|
||||
similarity between s1 and s2
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If unsupported weights are provided a ValueError is thrown
|
||||
"""
|
||||
_ = score_hint
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
weights = weights or (1, 1, 1)
|
||||
maximum = _levenshtein_maximum(s1, s2, weights)
|
||||
dist = distance(s1, s2, weights=weights)
|
||||
sim = maximum - dist
|
||||
return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
weights=(1, 1, 1),
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized levenshtein distance in the range [1, 0] using custom
|
||||
costs for insertion, deletion and substitution.
|
||||
|
||||
This is calculated as ``distance / max``, where max is the maximal possible
|
||||
Levenshtein distance given the lengths of the sequences s1/s2 and the weights.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
weights : tuple[int, int, int] or None, optional
|
||||
The weights for the three operations in the form
|
||||
(insertion, deletion, substitution). Default is (1, 1, 1),
|
||||
which gives all three operations a weight of 1.
|
||||
processor : callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
|
||||
which deactivates this behaviour.
|
||||
score_hint : float, optional
|
||||
Expected normalized distance between s1 and s2. This is used to select a
|
||||
faster implementation. Default is None, which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_dist : float
|
||||
normalized distance between s1 and s2 as a float between 1.0 and 0.0
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If unsupported weights are provided a ValueError is thrown
|
||||
"""
|
||||
_ = score_hint
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 1.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
weights = weights or (1, 1, 1)
|
||||
maximum = _levenshtein_maximum(s1, s2, weights)
|
||||
dist = distance(s1, s2, weights=weights)
|
||||
norm_dist = dist / maximum if maximum else 0
|
||||
return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1
|
||||
|
||||
|
||||
def normalized_similarity(
|
||||
s1,
|
||||
s2,
|
||||
*,
|
||||
weights=(1, 1, 1),
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
):
|
||||
"""
|
||||
Calculates a normalized levenshtein similarity in the range [0, 1] using custom
|
||||
costs for insertion, deletion and substitution.
|
||||
|
||||
This is calculated as ``1 - normalized_distance``
|
||||
|
||||
Parameters
|
||||
----------
|
||||
s1 : Sequence[Hashable]
|
||||
First string to compare.
|
||||
s2 : Sequence[Hashable]
|
||||
Second string to compare.
|
||||
weights : tuple[int, int, int] or None, optional
|
||||
The weights for the three operations in the form
|
||||
(insertion, deletion, substitution). Default is (1, 1, 1),
|
||||
which gives all three operations a weight of 1.
|
||||
processor : callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : float, optional
|
||||
Optional argument for a score threshold as a float between 0 and 1.0.
|
||||
For norm_sim < score_cutoff 0 is returned instead. Default is None,
|
||||
which deactivates this behaviour.
|
||||
score_hint : int, optional
|
||||
Expected normalized similarity between s1 and s2. This is used to select a
|
||||
faster implementation. Default is None, which deactivates this behaviour.
|
||||
|
||||
Returns
|
||||
-------
|
||||
norm_sim : float
|
||||
normalized similarity between s1 and s2 as a float between 0 and 1.0
|
||||
|
||||
Raises
|
||||
------
|
||||
ValueError
|
||||
If unsupported weights are provided a ValueError is thrown
|
||||
|
||||
Examples
|
||||
--------
|
||||
Find the normalized Levenshtein similarity between two strings:
|
||||
|
||||
>>> from rapidfuzz.distance import Levenshtein
|
||||
>>> Levenshtein.normalized_similarity("lewenstein", "levenshtein")
|
||||
0.81818181818181
|
||||
|
||||
Setting a score_cutoff allows the implementation to select
|
||||
a more efficient implementation:
|
||||
|
||||
>>> Levenshtein.normalized_similarity("lewenstein", "levenshtein", score_cutoff=0.85)
|
||||
0.0
|
||||
|
||||
It is possible to select different weights by passing a `weight`
|
||||
tuple.
|
||||
|
||||
>>> Levenshtein.normalized_similarity("lewenstein", "levenshtein", weights=(1,1,2))
|
||||
0.85714285714285
|
||||
|
||||
When a different processor is used s1 and s2 do not have to be strings
|
||||
|
||||
>>> Levenshtein.normalized_similarity(["lewenstein"], ["levenshtein"], processor=lambda s: s[0])
|
||||
0.81818181818181
|
||||
"""
|
||||
_ = score_hint
|
||||
setupPandas()
|
||||
if is_none(s1) or is_none(s2):
|
||||
return 0.0
|
||||
|
||||
if processor is not None:
|
||||
s1 = processor(s1)
|
||||
s2 = processor(s2)
|
||||
|
||||
s1, s2 = conv_sequences(s1, s2)
|
||||
weights = weights or (1, 1, 1)
|
||||
norm_dist = normalized_distance(s1, s2, weights=weights)
|
||||
norm_sim = 1.0 - norm_dist
|
||||
return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def _matrix(s1, s2):
|
||||
if not s1:
|
||||
return (len(s2), [], [])
|
||||
|
||||
VP = (1 << len(s1)) - 1
|
||||
VN = 0
|
||||
currDist = len(s1)
|
||||
mask = 1 << (len(s1) - 1)
|
||||
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
matrix_VP = []
|
||||
matrix_VN = []
|
||||
for ch2 in s2:
|
||||
# Step 1: Computing D0
|
||||
PM_j = block_get(ch2, 0)
|
||||
X = PM_j
|
||||
D0 = (((X & VP) + VP) ^ VP) | X | VN
|
||||
# Step 2: Computing HP and HN
|
||||
HP = VN | ~(D0 | VP)
|
||||
HN = D0 & VP
|
||||
# Step 3: Computing the value D[m,j]
|
||||
currDist += (HP & mask) != 0
|
||||
currDist -= (HN & mask) != 0
|
||||
# Step 4: Computing Vp and VN
|
||||
HP = (HP << 1) | 1
|
||||
HN = HN << 1
|
||||
VP = HN | ~(D0 | HP)
|
||||
VN = HP & D0
|
||||
|
||||
matrix_VP.append(VP)
|
||||
matrix_VN.append(VN)
|
||||
|
||||
return (currDist, matrix_VP, matrix_VN)
|
||||
|
||||
|
||||
def editops(
    s1,
    s2,
    *,
    processor=None,
    score_hint=None,
):
    """
    Return Editops describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    editops : Editops
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [8]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [8] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein
    >>> for tag, src_pos, dest_pos in Levenshtein.editops("qabxcd", "abycdf"):
    ...    print(("%7s s1[%d] s2[%d]" % (tag, src_pos, dest_pos)))
      delete s1[1] s2[0]
     replace s1[3] s2[2]
      insert s1[6] s2[5]
    """
    # score_hint is accepted for API compatibility with the C++ backend;
    # this pure-Python implementation has no faster path to select.
    _ = score_hint
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # A common prefix/suffix cannot contain edits; strip it here and shift
    # the reported positions by prefix_len when emitting operations.
    prefix_len, suffix_len = common_affix(s1, s2)
    s1 = s1[prefix_len : len(s1) - suffix_len]
    s2 = s2[prefix_len : len(s2) - suffix_len]
    # Bit-parallel Levenshtein keeping the full VP/VN column history so the
    # alignment can be backtraced below.
    dist, VP, VN = _matrix(s1, s2)

    editops = Editops([], 0, 0)
    editops._src_len = len(s1) + prefix_len + suffix_len
    editops._dest_len = len(s2) + prefix_len + suffix_len

    if dist == 0:
        return editops

    # Backtrace from the bottom-right corner of the implicit DP matrix,
    # filling editop_list from the back so it ends up in forward order.
    editop_list = [None] * dist
    col = len(s1)
    row = len(s2)
    while row != 0 and col != 0:
        # deletion
        if VP[row - 1] & (1 << (col - 1)):
            dist -= 1
            col -= 1
            editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)
        else:
            row -= 1

            # insertion
            if row and (VN[row - 1] & (1 << (col - 1))):
                dist -= 1
                editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)
            else:
                col -= 1

                # replace (Matches are not recorded)
                if s1[col] != s2[row]:
                    dist -= 1
                    editop_list[dist] = Editop("replace", col + prefix_len, row + prefix_len)

    # Whatever remains at the matrix border is a run of pure deletions
    # (leftover s1) or pure insertions (leftover s2).
    while col != 0:
        dist -= 1
        col -= 1
        editop_list[dist] = Editop("delete", col + prefix_len, row + prefix_len)

    while row != 0:
        dist -= 1
        row -= 1
        editop_list[dist] = Editop("insert", col + prefix_len, row + prefix_len)

    editops._editops = editop_list
    return editops
|
||||
|
||||
|
||||
def opcodes(
    s1,
    s2,
    *,
    processor=None,
    score_hint=None,
):
    """
    Return Opcodes describing how to turn s1 into s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_hint : int, optional
        Expected distance between s1 and s2. This is used to select a
        faster implementation. Default is None, which deactivates this behaviour.

    Returns
    -------
    opcodes : Opcodes
        edit operations required to turn s1 into s2

    Notes
    -----
    The alignment is calculated using an algorithm of Heikki Hyyrö, which is
    described [9]_. It has a time complexity and memory usage of ``O([N/64] * M)``.

    References
    ----------
    .. [9] Hyyrö, Heikki. "A Note on Bit-Parallel Alignment Computation."
           Stringology (2004).

    Examples
    --------
    >>> from rapidfuzz.distance import Levenshtein

    >>> a = "qabxcd"
    >>> b = "abycdf"
    >>> for tag, i1, i2, j1, j2 in Levenshtein.opcodes("qabxcd", "abycdf"):
    ...    print(("%7s a[%d:%d] (%s) b[%d:%d] (%s)" %
    ...           (tag, i1, i2, a[i1:i2], j1, j2, b[j1:j2])))
      delete a[0:1] (q) b[0:0] ()
       equal a[1:3] (ab) b[0:2] (ab)
     replace a[3:4] (x) b[2:3] (y)
       equal a[4:6] (cd) b[3:5] (cd)
      insert a[6:6] () b[5:6] (f)
    """
    # Opcodes are just the run-length view of the editops alignment.
    ops = editops(s1, s2, processor=processor, score_hint=score_hint)
    return ops.as_opcodes()
|
||||
93
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA.py
Normal file
93
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA.py
Normal file
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Select the OSA metric backend at import time.
# RAPIDFUZZ_IMPLEMENTATION forces "cpp" or "python"; otherwise the widest
# available SIMD C++ build (AVX2 > SSE2 > generic) is tried before falling
# back to the pure-Python implementation.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    # Not wrapped in suppress: "cpp" was explicitly requested, so failing
    # to import the generic C++ backend should raise.
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            osa_distance as distance,
            osa_normalized_distance as normalized_distance,
            osa_normalized_similarity as normalized_similarity,
            osa_similarity as similarity,
        )
elif _impl == "python":
    # Forced pure-Python backend.
    from rapidfuzz.distance.metrics_py import (
        osa_distance as distance,
        osa_normalized_distance as normalized_distance,
        osa_normalized_similarity as normalized_similarity,
        osa_similarity as similarity,
    )
else:
    # Default: best-effort C++ (AVX2 > SSE2 > generic), then pure Python.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                osa_distance as distance,
                osa_normalized_distance as normalized_distance,
                osa_normalized_similarity as normalized_similarity,
                osa_similarity as similarity,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance.metrics_py import (
            osa_distance as distance,
            osa_normalized_distance as normalized_distance,
            osa_normalized_similarity as normalized_similarity,
            osa_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stubs for the OSA metric. Every public function is declared
# twice: once for inputs that are already hashable sequences, and once
# for arbitrary inputs paired with a mandatory ``processor`` callable
# that converts them into hashable sequences.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
232
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA_py.py
Normal file
232
.venv/lib/python3.11/site-packages/rapidfuzz/distance/OSA_py.py
Normal file
@@ -0,0 +1,232 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def _osa_distance_hyrroe2003(s1, s2):
|
||||
if not s1:
|
||||
return len(s2)
|
||||
|
||||
VP = (1 << len(s1)) - 1
|
||||
VN = 0
|
||||
D0 = 0
|
||||
PM_j_old = 0
|
||||
currDist = len(s1)
|
||||
mask = 1 << (len(s1) - 1)
|
||||
|
||||
block = {}
|
||||
block_get = block.get
|
||||
x = 1
|
||||
for ch1 in s1:
|
||||
block[ch1] = block_get(ch1, 0) | x
|
||||
x <<= 1
|
||||
|
||||
for ch2 in s2:
|
||||
# Step 1: Computing D0
|
||||
PM_j = block_get(ch2, 0)
|
||||
TR = (((~D0) & PM_j) << 1) & PM_j_old
|
||||
D0 = (((PM_j & VP) + VP) ^ VP) | PM_j | VN
|
||||
D0 = D0 | TR
|
||||
|
||||
# Step 2: Computing HP and HN
|
||||
HP = VN | ~(D0 | VP)
|
||||
HN = D0 & VP
|
||||
|
||||
# Step 3: Computing the value D[m,j]
|
||||
currDist += (HP & mask) != 0
|
||||
currDist -= (HN & mask) != 0
|
||||
|
||||
# Step 4: Computing Vp and VN
|
||||
HP = (HP << 1) | 1
|
||||
HN = HN << 1
|
||||
VP = HN | ~(D0 | HP)
|
||||
VN = HP & D0
|
||||
PM_j_old = PM_j
|
||||
|
||||
return currDist
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the optimal string alignment (OSA) distance.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2

    Examples
    --------
    Find the OSA distance between two strings:

    >>> from rapidfuzz.distance import OSA
    >>> OSA.distance("CA", "AC")
    1
    >>> OSA.distance("CA", "ABC")
    3
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    dist = _osa_distance_hyrroe2003(s1, s2)
    # Cutoff contract: distances above score_cutoff collapse to
    # score_cutoff + 1 so callers can detect "too far apart".
    return dist if (score_cutoff is None or dist <= score_cutoff) else score_cutoff + 1
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the optimal string alignment (OSA) similarity in the range [max, 0].

    This is calculated as ``max(len1, len2) - distance``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is considered as a result. If the
        similarity is smaller than score_cutoff, 0 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    sim = longest - distance(s1, s2)
    if score_cutoff is not None and sim < score_cutoff:
        return 0
    return sim
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized optimal string alignment (OSA) distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    dist = distance(s1, s2)
    # Two empty sequences are identical -> distance 0.
    norm_dist = dist / maximum if maximum else 0
    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized optimal string alignment (OSA) similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as maximally dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    norm_sim = 1.0 - normalized_distance(s1, s2)
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0
    return norm_sim
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Select the Postfix metric backend at import time.
# RAPIDFUZZ_IMPLEMENTATION forces "cpp" or "python"; otherwise the widest
# available SIMD C++ build (AVX2 > SSE2 > generic) is tried before falling
# back to the pure-Python implementation.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )

            imported = True

    # Not wrapped in suppress: "cpp" was explicitly requested, so failing
    # to import the generic C++ backend should raise.
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            postfix_distance as distance,
            postfix_normalized_distance as normalized_distance,
            postfix_normalized_similarity as normalized_similarity,
            postfix_similarity as similarity,
        )
elif _impl == "python":
    # Forced pure-Python backend.
    from rapidfuzz.distance.metrics_py import (
        postfix_distance as distance,
        postfix_normalized_distance as normalized_distance,
        postfix_normalized_similarity as normalized_similarity,
        postfix_similarity as similarity,
    )
else:
    # Default: best-effort C++ (AVX2 > SSE2 > generic), then pure Python.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                postfix_distance as distance,
                postfix_normalized_distance as normalized_distance,
                postfix_normalized_similarity as normalized_similarity,
                postfix_similarity as similarity,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance.metrics_py import (
            postfix_distance as distance,
            postfix_normalized_distance as normalized_distance,
            postfix_normalized_similarity as normalized_similarity,
            postfix_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2022 Max Bachmann

# Typing stubs for the Postfix metric. Every public function is declared
# twice: once for inputs that are already hashable sequences, and once
# for arbitrary inputs paired with a mandatory ``processor`` callable
# that converts them into hashable sequences.

from __future__ import annotations

from collections.abc import Hashable, Sequence
from typing import Callable, TypeVar, overload

_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_distance(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_distance(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: int | None = None,
) -> int: ...
@overload
def similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: int | None = None,
) -> int: ...
@overload
def normalized_similarity(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def normalized_similarity(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the postfix distance between two strings.

    This is calculated as ``max(len1, len2) - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    dist = max(len(s1), len(s2)) - similarity(s1, s2)
    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the postfix similarity between two strings.

    The similarity is the length of the common suffix of the two sequences.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity that is considered as a result. If the
        similarity is smaller than score_cutoff, 0 is returned instead.
        Default is None, which deactivates this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # Count matching elements from the end until the first mismatch.
    sim = 0
    for ch1, ch2 in zip(reversed(s1), reversed(s2)):
        if ch1 != ch2:
            break
        sim += 1

    return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized postfix distance in the range [1, 0].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is 1.0,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as maximally distant.
    if is_none(s1) or is_none(s2):
        return 1.0

    # Delegates preprocessing and the empty-input case to
    # normalized_similarity and takes the complement.
    norm_sim = normalized_similarity(s1, s2, processor=processor)
    norm_dist = 1.0 - norm_sim

    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized postfix similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as maximally dissimilar.
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    longest = max(len(s1), len(s2))
    # Two empty sequences are considered identical.
    norm_sim = similarity(s1, s2) / longest if longest else 1.0
    if score_cutoff is not None and norm_sim < score_cutoff:
        return 0.0
    return norm_sim
|
||||
@@ -0,0 +1,93 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["distance", "normalized_distance", "normalized_similarity", "similarity"]

# Select the Prefix metric backend at import time.
# RAPIDFUZZ_IMPLEMENTATION forces "cpp" or "python"; otherwise the widest
# available SIMD C++ build (AVX2 > SSE2 > generic) is tried before falling
# back to the pure-Python implementation.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )

            imported = True

    # Not wrapped in suppress: "cpp" was explicitly requested, so failing
    # to import the generic C++ backend should raise.
    if not imported:
        from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
            prefix_distance as distance,
            prefix_normalized_distance as normalized_distance,
            prefix_normalized_similarity as normalized_similarity,
            prefix_similarity as similarity,
        )
elif _impl == "python":
    # Forced pure-Python backend.
    from rapidfuzz.distance.metrics_py import (
        prefix_distance as distance,
        prefix_normalized_distance as normalized_distance,
        prefix_normalized_similarity as normalized_similarity,
        prefix_similarity as similarity,
    )
else:
    # Default: best-effort C++ (AVX2 > SSE2 > generic), then pure Python.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance.metrics_cpp import (  # pyright: ignore[reportMissingImports]
                prefix_distance as distance,
                prefix_normalized_distance as normalized_distance,
                prefix_normalized_similarity as normalized_similarity,
                prefix_similarity as similarity,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance.metrics_py import (
            prefix_distance as distance,
            prefix_normalized_distance as normalized_distance,
            prefix_normalized_similarity as normalized_similarity,
            prefix_similarity as similarity,
        )
|
||||
@@ -0,0 +1,75 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
|
||||
@overload
|
||||
def distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_distance(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: int | None = None,
|
||||
) -> int: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: Sequence[Hashable],
|
||||
s2: Sequence[Hashable],
|
||||
*,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@overload
|
||||
def normalized_similarity(
|
||||
s1: _UnprocessedType1,
|
||||
s2: _UnprocessedType2,
|
||||
*,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = 0,
|
||||
) -> float: ...
|
||||
@@ -0,0 +1,182 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import is_none, setupPandas
|
||||
|
||||
|
||||
def distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the Prefix distance between two strings.

    This is calculated as ``max(len1, len2) - similarity``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int or None, optional
        Maximum distance between s1 and s2, that is
        considered as a result. If the distance is bigger than score_cutoff,
        score_cutoff + 1 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    distance : int
        distance between s1 and s2
    """
    if processor is not None:
        s1, s2 = processor(s1), processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # distance is the part of the longer string not covered by the common prefix
    dist = max(len(s1), len(s2)) - similarity(s1, s2)

    if score_cutoff is not None and dist > score_cutoff:
        return score_cutoff + 1
    return dist
|
||||
|
||||
|
||||
def similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the prefix similarity between two strings.

    This is the length of the common prefix of s1 and s2.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : int, optional
        Minimum similarity between s1 and s2, that is
        considered as a result. If the similarity is smaller than score_cutoff,
        0 is returned instead. Default is None, which deactivates
        this behaviour.

    Returns
    -------
    similarity : int
        similarity between s1 and s2
    """
    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # count matching elements from the start, stopping at the first mismatch
    sim = 0
    for ch1, ch2 in zip(s1, s2):
        if ch1 != ch2:
            break
        sim += 1

    return sim if (score_cutoff is None or sim >= score_cutoff) else 0
|
||||
|
||||
|
||||
def normalized_distance(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized prefix distance in the range [0, 1].

    This is calculated as ``distance / max(len1, len2)``.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_dist > score_cutoff 1.0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_dist : float
        normalized distance between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs are treated as maximally distant
    if is_none(s1) or is_none(s2):
        return 1.0

    norm_sim = normalized_similarity(s1, s2, processor=processor)
    norm_dist = 1.0 - norm_sim

    return norm_dist if (score_cutoff is None or norm_dist <= score_cutoff) else 1.0
|
||||
|
||||
|
||||
def normalized_similarity(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a normalized prefix similarity in the range [0, 1].

    This is calculated as ``1 - normalized_distance``

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor : callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 1.0.
        For norm_sim < score_cutoff 0 is returned instead. Default is None,
        which deactivates this behaviour.

    Returns
    -------
    norm_sim : float
        normalized similarity between s1 and s2 as a float between 0 and 1.0
    """
    setupPandas()
    # None/NaN inputs have no similarity
    if is_none(s1) or is_none(s2):
        return 0.0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    maximum = max(len(s1), len(s2))
    sim = similarity(s1, s2)
    # two empty sequences are considered identical
    norm_sim = sim / maximum if maximum else 1.0

    return norm_sim if (score_cutoff is None or norm_sim >= score_cutoff) else 0.0
|
||||
@@ -0,0 +1,37 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from . import (
|
||||
OSA,
|
||||
DamerauLevenshtein,
|
||||
Hamming,
|
||||
Indel,
|
||||
Jaro,
|
||||
JaroWinkler,
|
||||
LCSseq,
|
||||
Levenshtein,
|
||||
Postfix,
|
||||
Prefix,
|
||||
)
|
||||
from ._initialize import Editop, Editops, MatchingBlock, Opcode, Opcodes, ScoreAlignment
|
||||
|
||||
__all__ = [
|
||||
"OSA",
|
||||
"DamerauLevenshtein",
|
||||
"Editop",
|
||||
"Editops",
|
||||
"Hamming",
|
||||
"Indel",
|
||||
"Jaro",
|
||||
"JaroWinkler",
|
||||
"LCSseq",
|
||||
"Levenshtein",
|
||||
"MatchingBlock",
|
||||
"Opcode",
|
||||
"Opcodes",
|
||||
"Postfix",
|
||||
"Prefix",
|
||||
"ScoreAlignment",
|
||||
]
|
||||
@@ -0,0 +1,25 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from . import (
|
||||
OSA as OSA,
|
||||
DamerauLevenshtein as DamerauLevenshtein,
|
||||
Hamming as Hamming,
|
||||
Indel as Indel,
|
||||
Jaro as Jaro,
|
||||
JaroWinkler as JaroWinkler,
|
||||
LCSseq as LCSseq,
|
||||
Levenshtein as Levenshtein,
|
||||
Postfix as Postfix,
|
||||
Prefix as Prefix,
|
||||
)
|
||||
from ._initialize import (
|
||||
Editop as Editop,
|
||||
Editops as Editops,
|
||||
MatchingBlock as MatchingBlock,
|
||||
Opcode as Opcode,
|
||||
Opcodes as Opcodes,
|
||||
ScoreAlignment as ScoreAlignment,
|
||||
)
|
||||
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,109 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = ["Editop", "Editops", "MatchingBlock", "Opcode", "Opcodes", "ScoreAlignment"]
|
||||
|
||||
# Backend selection for the edit-operation container classes.
# RAPIDFUZZ_IMPLEMENTATION may force a backend ("cpp" or "python"); when it is
# unset, the fastest available C++ extension (AVX2 > SSE2 > generic) is tried
# with a pure-Python fallback.
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    # prefer the AVX2-specialised extension when the CPU supports it
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )

            imported = True

    # otherwise fall back to the SSE2 build
    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )

            imported = True

    # generic C++ build; deliberately NOT guarded: with a forced "cpp"
    # backend a missing extension should raise instead of silently degrading
    if not imported:
        from rapidfuzz.distance._initialize_cpp import (  # pyright: ignore[reportMissingImports]
            Editop,
            Editops,
            MatchingBlock,
            Opcode,
            Opcodes,
            ScoreAlignment,
        )
elif _impl == "python":
    # forced pure-Python backend
    from rapidfuzz.distance._initialize_py import (
        Editop,
        Editops,
        MatchingBlock,
        Opcode,
        Opcodes,
        ScoreAlignment,
    )
else:
    # default: best effort — try AVX2, then SSE2, then the generic C++ build,
    # and finally fall back to the pure-Python implementation
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.distance._initialize_cpp import (  # pyright: ignore[reportMissingImports]
                Editop,
                Editops,
                MatchingBlock,
                Opcode,
                Opcodes,
                ScoreAlignment,
            )

            imported = True

    if not imported:
        from rapidfuzz.distance._initialize_py import (
            Editop,
            Editops,
            MatchingBlock,
            Opcode,
            Opcodes,
            ScoreAlignment,
        )
|
||||
@@ -0,0 +1,133 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Iterator
|
||||
|
||||
_AnyOpList = list[Editop | tuple[str, int, int]] | list[Opcode | tuple[str, int, int, int, int]]
|
||||
|
||||
class MatchingBlock:
|
||||
a: int
|
||||
b: int
|
||||
size: int
|
||||
|
||||
def __init__(self, a: int, b: int, size: int): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int: ...
|
||||
def __iter__(self) -> Iterator[int]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class Editop:
|
||||
tag: str
|
||||
src_pos: int
|
||||
dest_pos: int
|
||||
|
||||
def __init__(self, tag: str, src_pos: int, dest_pos: int): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int | str: ...
|
||||
def __iter__(self) -> Iterator[int | str]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class Editops:
|
||||
_src_len: int
|
||||
_dest_len: int
|
||||
_editops: list[Editop]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
editops: _AnyOpList | None = None,
|
||||
src_len: int = 0,
|
||||
dest_len: int = 0,
|
||||
): ...
|
||||
@classmethod
|
||||
def from_opcodes(cls, opcodes: Opcodes) -> Editops: ...
|
||||
def as_matching_blocks(self) -> list[MatchingBlock]: ...
|
||||
def as_list(self) -> list[Editop]: ...
|
||||
def copy(self) -> Editops: ...
|
||||
def inverse(self) -> Editops: ...
|
||||
def remove_subsequence(self, subsequence: Editops) -> None: ...
|
||||
def apply(self, source_string: str, destination_string: str) -> str: ...
|
||||
@property
|
||||
def src_len(self) -> int: ...
|
||||
@src_len.setter
|
||||
def src_len(self, value: int) -> None: ...
|
||||
@property
|
||||
def dest_len(self) -> int: ...
|
||||
@dest_len.setter
|
||||
def dest_len(self, value: int) -> None: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __delitem__(self, key: int | slice) -> None: ...
|
||||
def __getitem__(self, key: int | slice) -> Editops | Editop: ...
|
||||
def __iter__(self) -> Iterator[Editop]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class Opcode:
|
||||
tag: str
|
||||
src_start: int
|
||||
src_end: int
|
||||
dest_start: int
|
||||
dest_end: int
|
||||
|
||||
def __init__(self, tag: str, src_start: int, src_end: int, dest_start: int, dest_end: int): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int | str: ...
|
||||
def __iter__(self) -> Iterator[int | str]: ...
|
||||
|
||||
class Opcodes:
|
||||
_src_len: int
|
||||
_dest_len: int
|
||||
_opcodes: list[Opcode]
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
opcodes: _AnyOpList | None = None,
|
||||
src_len: int = 0,
|
||||
dest_len: int = 0,
|
||||
): ...
|
||||
@classmethod
|
||||
def from_editops(cls, editops: Editops) -> Opcodes: ...
|
||||
def as_editops(self) -> Editops: ...
|
||||
def as_matching_blocks(self) -> list[MatchingBlock]: ...
|
||||
def as_list(self) -> list[Opcode]: ...
|
||||
def copy(self) -> Opcodes: ...
|
||||
def inverse(self) -> Opcodes: ...
|
||||
def apply(self, source_string: str, destination_string: str) -> str: ...
|
||||
@property
|
||||
def src_len(self) -> int: ...
|
||||
@src_len.setter
|
||||
def src_len(self, value: int) -> None: ...
|
||||
@property
|
||||
def dest_len(self) -> int: ...
|
||||
@dest_len.setter
|
||||
def dest_len(self, value: int) -> None: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __len__(self) -> int: ...
|
||||
def __getitem__(self, key: int) -> Opcode: ...
|
||||
def __iter__(self) -> Iterator[Opcode]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
|
||||
class ScoreAlignment:
|
||||
score: int | float
|
||||
src_start: int
|
||||
src_end: int
|
||||
dest_start: int
|
||||
dest_end: int
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
score: int | float,
|
||||
src_start: int,
|
||||
src_end: int,
|
||||
dest_start: int,
|
||||
dest_end: int,
|
||||
): ...
|
||||
def __len__(self) -> int: ...
|
||||
def __eq__(self, other: object) -> bool: ...
|
||||
def __getitem__(self, i: int) -> int | float: ...
|
||||
def __iter__(self) -> Iterator[int | float]: ...
|
||||
def __repr__(self) -> str: ...
|
||||
Binary file not shown.
@@ -0,0 +1,884 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
|
||||
def _list_to_editops(
    ops,
    src_len,
    dest_len,
):
    """
    Normalize a list of edit operations into a list of Editop.

    ``ops`` may contain 3-tuples ``(tag, src_pos, dest_pos)`` or 5-tuples
    ``(tag, src_start, src_end, dest_start, dest_end)``; the latter are
    treated as opcodes and converted via ``Opcodes``. Raises ValueError for
    out-of-range positions, out-of-order operations, or duplicates.
    """
    if not ops:
        return []

    # a 5-element first entry means the input is opcodes -> convert
    if len(ops[0]) == 5:
        return Opcodes(ops, src_len, dest_len).as_editops()._editops

    blocks = []
    for op in ops:
        edit_type, src_pos, dest_pos = op

        if src_pos > src_len or dest_pos > dest_len:
            msg = "List of edit operations invalid"
            raise ValueError(msg)

        # only an insert may be positioned at the very end of the source
        if src_pos == src_len and edit_type != "insert":
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        # only a delete may be positioned at the very end of the destination
        if dest_pos == dest_len and edit_type != "delete":
            msg = "List of edit operations invalid"
            raise ValueError(msg)

        # keep operations are not relevant in editops
        if edit_type == "equal":
            continue

        blocks.append(Editop(edit_type, src_pos, dest_pos))

    # validate order of editops
    for i in range(len(blocks) - 1):
        if blocks[i + 1].src_pos < blocks[i].src_pos or blocks[i + 1].dest_pos < blocks[i].dest_pos:
            msg = "List of edit operations out of order"
            raise ValueError(msg)
        if blocks[i + 1].src_pos == blocks[i].src_pos and blocks[i + 1].dest_pos == blocks[i].dest_pos:
            msg = "Duplicated edit operation"
            raise ValueError(msg)

    return blocks
|
||||
|
||||
|
||||
def _list_to_opcodes(
    ops,
    src_len,
    dest_len,
):
    """
    Normalize a list of edit operations into a list of Opcode.

    ``ops`` may contain 5-tuples ``(tag, src_start, src_end, dest_start,
    dest_end)`` or 3-tuples ``(tag, src_pos, dest_pos)``; the latter are
    treated as editops and converted via ``Editops``. Adjacent blocks with
    the same tag are merged. Raises ValueError when the operations are
    invalid or do not cover both strings contiguously from start to end.
    """
    # an empty list or 3-element first entry means the input is editops -> convert
    if not ops or len(ops[0]) == 3:
        return Editops(ops, src_len, dest_len).as_opcodes()._opcodes

    blocks = []
    for op in ops:
        edit_type, src_start, src_end, dest_start, dest_end = op

        if src_end > src_len or dest_end > dest_len:
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        if src_end < src_start or dest_end < dest_start:
            msg = "List of edit operations invalid"
            raise ValueError(msg)

        # "equal"/"replace" must cover non-empty ranges of equal length
        if edit_type in {"equal", "replace"} and (src_end - src_start != dest_end - dest_start or src_start == src_end):
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        # "insert" consumes nothing from src and must add to dest
        if edit_type == "insert" and (src_start != src_end or dest_start == dest_end):
            msg = "List of edit operations invalid"
            raise ValueError(msg)
        # "delete" consumes from src and must add nothing to dest
        if edit_type == "delete" and (src_start == src_end or dest_start != dest_end):
            msg = "List of edit operations invalid"
            raise ValueError(msg)

        # merge similar adjacent blocks
        if blocks and (
            blocks[-1].tag == edit_type and blocks[-1].src_end == src_start and blocks[-1].dest_end == dest_start
        ):
            blocks[-1].src_end = src_end
            blocks[-1].dest_end = dest_end
            continue

        blocks.append(Opcode(edit_type, src_start, src_end, dest_start, dest_end))

    # check if edit operations span the complete string
    if blocks[0].src_start != 0 or blocks[0].dest_start != 0:
        msg = "List of edit operations does not start at position 0"
        raise ValueError(msg)
    if blocks[-1].src_end != src_len or blocks[-1].dest_end != dest_len:
        msg = "List of edit operations does not end at the string ends"
        raise ValueError(msg)
    for i in range(len(blocks) - 1):
        if blocks[i + 1].src_start != blocks[i].src_end or blocks[i + 1].dest_start != blocks[i].dest_end:
            msg = "List of edit operations is not continuous"
            raise ValueError(msg)

    return blocks
|
||||
|
||||
|
||||
class MatchingBlock:
    """
    Triple describing matching subsequences
    """

    def __init__(self, a, b, size):
        self.a = a
        self.b = b
        self.size = size

    def __len__(self):
        # always behaves like an (a, b, size) triple
        return 3

    def __eq__(self, other):
        try:
            if len(other) != 3:
                return False

            return bool(other[0] == self.a and other[1] == self.b and other[2] == self.size)
        except TypeError:
            # non-sized / non-indexable objects never compare equal
            return False

    def __getitem__(self, i):
        # negative indices mirror tuple behaviour
        for valid, field in (({0, -3}, self.a), ({1, -2}, self.b), ({2, -1}, self.size)):
            if i in valid:
                return field

        raise IndexError("MatchingBlock index out of range")

    def __iter__(self):
        yield self.a
        yield self.b
        yield self.size

    def __repr__(self):
        return f"MatchingBlock(a={self.a}, b={self.b}, size={self.size})"
|
||||
|
||||
|
||||
class Editop:
    """
    Tuple like object describing an edit operation.
    It is in the form (tag, src_pos, dest_pos)

    The tags are strings, with these meanings:

    +-----------+---------------------------------------------------+
    | tag       | explanation                                       |
    +===========+===================================================+
    | 'replace' | src[src_pos] should be replaced by dest[dest_pos] |
    +-----------+---------------------------------------------------+
    | 'delete'  | src[src_pos] should be deleted                    |
    +-----------+---------------------------------------------------+
    | 'insert'  | dest[dest_pos] should be inserted at src[src_pos] |
    +-----------+---------------------------------------------------+
    """

    def __init__(self, tag, src_pos, dest_pos):
        self.tag = tag
        self.src_pos = src_pos
        self.dest_pos = dest_pos

    def __len__(self):
        # always behaves like a (tag, src_pos, dest_pos) triple
        return 3

    def __eq__(self, other):
        try:
            if len(other) != 3:
                return False

            return bool(other[0] == self.tag and other[1] == self.src_pos and other[2] == self.dest_pos)
        except TypeError:
            # non-sized / non-indexable objects never compare equal
            return False

    def __getitem__(self, i):
        # negative indices mirror tuple behaviour
        for valid, field in (({0, -3}, self.tag), ({1, -2}, self.src_pos), ({2, -1}, self.dest_pos)):
            if i in valid:
                return field

        raise IndexError("Editop index out of range")

    def __iter__(self):
        yield self.tag
        yield self.src_pos
        yield self.dest_pos

    def __repr__(self):
        return f"Editop(tag={self.tag!r}, src_pos={self.src_pos}, dest_pos={self.dest_pos})"
|
||||
|
||||
|
||||
class Editops:
|
||||
"""
|
||||
List like object of Editops describing how to turn s1 into s2.
|
||||
"""
|
||||
|
||||
def __init__(
|
||||
self,
|
||||
editops=None,
|
||||
src_len=0,
|
||||
dest_len=0,
|
||||
):
|
||||
self._src_len = src_len
|
||||
self._dest_len = dest_len
|
||||
self._editops = _list_to_editops(editops, src_len, dest_len)
|
||||
|
||||
@classmethod
|
||||
def from_opcodes(cls, opcodes):
|
||||
"""
|
||||
Create Editops from Opcodes
|
||||
|
||||
Parameters
|
||||
----------
|
||||
opcodes : Opcodes
|
||||
opcodes to convert to editops
|
||||
|
||||
Returns
|
||||
-------
|
||||
editops : Editops
|
||||
Opcodes converted to Editops
|
||||
"""
|
||||
return opcodes.as_editops()
|
||||
|
||||
def as_opcodes(self):
|
||||
"""
|
||||
Convert to Opcodes
|
||||
|
||||
Returns
|
||||
-------
|
||||
opcodes : Opcodes
|
||||
Editops converted to Opcodes
|
||||
"""
|
||||
x = Opcodes.__new__(Opcodes)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
blocks = []
|
||||
src_pos = 0
|
||||
dest_pos = 0
|
||||
i = 0
|
||||
while i < len(self._editops):
|
||||
if src_pos < self._editops[i].src_pos or dest_pos < self._editops[i].dest_pos:
|
||||
blocks.append(
|
||||
Opcode(
|
||||
"equal",
|
||||
src_pos,
|
||||
self._editops[i].src_pos,
|
||||
dest_pos,
|
||||
self._editops[i].dest_pos,
|
||||
)
|
||||
)
|
||||
src_pos = self._editops[i].src_pos
|
||||
dest_pos = self._editops[i].dest_pos
|
||||
|
||||
src_begin = src_pos
|
||||
dest_begin = dest_pos
|
||||
tag = self._editops[i].tag
|
||||
while (
|
||||
i < len(self._editops)
|
||||
and self._editops[i].tag == tag
|
||||
and src_pos == self._editops[i].src_pos
|
||||
and dest_pos == self._editops[i].dest_pos
|
||||
):
|
||||
if tag == "replace":
|
||||
src_pos += 1
|
||||
dest_pos += 1
|
||||
elif tag == "insert":
|
||||
dest_pos += 1
|
||||
elif tag == "delete":
|
||||
src_pos += 1
|
||||
|
||||
i += 1
|
||||
|
||||
blocks.append(Opcode(tag, src_begin, src_pos, dest_begin, dest_pos))
|
||||
|
||||
if src_pos < self.src_len or dest_pos < self.dest_len:
|
||||
blocks.append(Opcode("equal", src_pos, self.src_len, dest_pos, self.dest_len))
|
||||
|
||||
x._opcodes = blocks
|
||||
return x
|
||||
|
||||
def as_matching_blocks(self):
|
||||
"""
|
||||
Convert to matching blocks
|
||||
|
||||
Returns
|
||||
-------
|
||||
matching blocks : list[MatchingBlock]
|
||||
Editops converted to matching blocks
|
||||
"""
|
||||
blocks = []
|
||||
src_pos = 0
|
||||
dest_pos = 0
|
||||
for op in self:
|
||||
if src_pos < op.src_pos or dest_pos < op.dest_pos:
|
||||
length = min(op.src_pos - src_pos, op.dest_pos - dest_pos)
|
||||
if length > 0:
|
||||
blocks.append(MatchingBlock(src_pos, dest_pos, length))
|
||||
src_pos = op.src_pos
|
||||
dest_pos = op.dest_pos
|
||||
|
||||
if op.tag == "replace":
|
||||
src_pos += 1
|
||||
dest_pos += 1
|
||||
elif op.tag == "delete":
|
||||
src_pos += 1
|
||||
elif op.tag == "insert":
|
||||
dest_pos += 1
|
||||
|
||||
if src_pos < self.src_len or dest_pos < self.dest_len:
|
||||
length = min(self.src_len - src_pos, self.dest_len - dest_pos)
|
||||
if length > 0:
|
||||
blocks.append(MatchingBlock(src_pos, dest_pos, length))
|
||||
|
||||
blocks.append(MatchingBlock(self.src_len, self.dest_len, 0))
|
||||
return blocks
|
||||
|
||||
def as_list(self):
|
||||
"""
|
||||
Convert Editops to a list of tuples.
|
||||
|
||||
This is the equivalent of ``[x for x in editops]``
|
||||
"""
|
||||
return [tuple(op) for op in self._editops]
|
||||
|
||||
def copy(self):
|
||||
"""
|
||||
performs copy of Editops
|
||||
"""
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
x._editops = self._editops[::]
|
||||
return x
|
||||
|
||||
def inverse(self):
|
||||
"""
|
||||
Invert Editops, so it describes how to transform the destination string to
|
||||
the source string.
|
||||
|
||||
Returns
|
||||
-------
|
||||
editops : Editops
|
||||
inverted Editops
|
||||
|
||||
Examples
|
||||
--------
|
||||
>>> from rapidfuzz.distance import Levenshtein
|
||||
>>> Levenshtein.editops('spam', 'park')
|
||||
[Editop(tag=delete, src_pos=0, dest_pos=0),
|
||||
Editop(tag=replace, src_pos=3, dest_pos=2),
|
||||
Editop(tag=insert, src_pos=4, dest_pos=3)]
|
||||
|
||||
>>> Levenshtein.editops('spam', 'park').inverse()
|
||||
[Editop(tag=insert, src_pos=0, dest_pos=0),
|
||||
Editop(tag=replace, src_pos=2, dest_pos=3),
|
||||
Editop(tag=delete, src_pos=3, dest_pos=4)]
|
||||
"""
|
||||
blocks = []
|
||||
for op in self:
|
||||
tag = op.tag
|
||||
if tag == "delete":
|
||||
tag = "insert"
|
||||
elif tag == "insert":
|
||||
tag = "delete"
|
||||
|
||||
blocks.append(Editop(tag, op.dest_pos, op.src_pos))
|
||||
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self.dest_len
|
||||
x._dest_len = self.src_len
|
||||
x._editops = blocks
|
||||
return x
|
||||
|
||||
def remove_subsequence(self, subsequence):
|
||||
"""
|
||||
remove a subsequence
|
||||
|
||||
Parameters
|
||||
----------
|
||||
subsequence : Editops
|
||||
subsequence to remove (has to be a subset of editops)
|
||||
|
||||
Returns
|
||||
-------
|
||||
sequence : Editops
|
||||
a copy of the editops without the subsequence
|
||||
"""
|
||||
result = Editops.__new__(Editops)
|
||||
result._src_len = self._src_len
|
||||
result._dest_len = self._dest_len
|
||||
|
||||
if len(subsequence) > len(self):
|
||||
msg = "subsequence is not a subsequence"
|
||||
raise ValueError(msg)
|
||||
|
||||
result._editops = [None] * (len(self) - len(subsequence))
|
||||
|
||||
# offset to correct removed edit operation
|
||||
offset = 0
|
||||
op_pos = 0
|
||||
result_pos = 0
|
||||
|
||||
for sop in subsequence:
|
||||
while op_pos != len(self) and sop != self._editops[op_pos]:
|
||||
result[result_pos] = self._editops[op_pos]
|
||||
result[result_pos].src_pos += offset
|
||||
result_pos += 1
|
||||
op_pos += 1
|
||||
|
||||
# element of subsequence not part of the sequence
|
||||
if op_pos == len(self):
|
||||
msg = "subsequence is not a subsequence"
|
||||
raise ValueError(msg)
|
||||
|
||||
if sop.tag == "insert":
|
||||
offset += 1
|
||||
elif sop.tag == "delete":
|
||||
offset -= 1
|
||||
|
||||
op_pos += 1
|
||||
|
||||
# add remaining elements
|
||||
while op_pos != len(self):
|
||||
result[result_pos] = self._editops[op_pos]
|
||||
result[result_pos].src_pos += offset
|
||||
result_pos += 1
|
||||
op_pos += 1
|
||||
|
||||
return result
|
||||
|
||||
def apply(self, source_string, destination_string):
|
||||
"""
|
||||
apply editops to source_string
|
||||
|
||||
Parameters
|
||||
----------
|
||||
source_string : str | bytes
|
||||
string to apply editops to
|
||||
destination_string : str | bytes
|
||||
string to use for replacements / insertions into source_string
|
||||
|
||||
Returns
|
||||
-------
|
||||
mod_string : str
|
||||
modified source_string
|
||||
|
||||
"""
|
||||
res_str = ""
|
||||
src_pos = 0
|
||||
|
||||
for op in self._editops:
|
||||
# matches between last and current editop
|
||||
while src_pos < op.src_pos:
|
||||
res_str += source_string[src_pos]
|
||||
src_pos += 1
|
||||
|
||||
if op.tag == "replace":
|
||||
res_str += destination_string[op.dest_pos]
|
||||
src_pos += 1
|
||||
elif op.tag == "insert":
|
||||
res_str += destination_string[op.dest_pos]
|
||||
elif op.tag == "delete":
|
||||
src_pos += 1
|
||||
|
||||
# matches after the last editop
|
||||
while src_pos < len(source_string):
|
||||
res_str += source_string[src_pos]
|
||||
src_pos += 1
|
||||
|
||||
return res_str
|
||||
|
||||
@property
def src_len(self):
    # Length of the source sequence these editops were computed from.
    return self._src_len

@src_len.setter
def src_len(self, value):
    self._src_len = value

@property
def dest_len(self):
    # Length of the destination sequence these editops transform into.
    return self._dest_len

@dest_len.setter
def dest_len(self, value):
    self._dest_len = value
|
||||
|
||||
def __eq__(self, other):
    # Only another Editops instance can compare equal; both sequence
    # lengths and the full operation list must match.
    if not isinstance(other, Editops):
        return False

    return (
        self._editops == other._editops
        and self.src_len == other.src_len
        and self.dest_len == other.dest_len
    )
|
||||
|
||||
def __len__(self):
    # Number of stored edit operations.
    return len(self._editops)
|
||||
|
||||
def __delitem__(self, key):
    # Delegate deletion (index or slice) to the underlying list.
    del self._editops[key]
|
||||
|
||||
def __getitem__(self, key):
|
||||
if isinstance(key, int):
|
||||
return self._editops[key]
|
||||
|
||||
start, stop, step = key.indices(len(self._editops))
|
||||
if step < 0:
|
||||
msg = "step sizes below 0 lead to an invalid order of editops"
|
||||
raise ValueError(msg)
|
||||
|
||||
x = Editops.__new__(Editops)
|
||||
x._src_len = self._src_len
|
||||
x._dest_len = self._dest_len
|
||||
x._editops = self._editops[start:stop:step]
|
||||
return x
|
||||
|
||||
def __iter__(self):
|
||||
yield from self._editops
|
||||
|
||||
def __repr__(self):
|
||||
return (
|
||||
"Editops([" + ", ".join(repr(op) for op in self) + f"], src_len={self.src_len}, dest_len={self.dest_len})"
|
||||
)
|
||||
|
||||
|
||||
class Opcode:
    """
    Tuple-like object describing one edit operation, in the form
    (tag, src_start, src_end, dest_start, dest_end).

    Tag meanings:

    - ``'replace'``: src[src_start:src_end] should be replaced by
      dest[dest_start:dest_end]
    - ``'delete'``: src[src_start:src_end] should be deleted
      (dest_start == dest_end in this case)
    - ``'insert'``: dest[dest_start:dest_end] should be inserted at
      src[src_start:src_start] (src_start == src_end in this case)
    - ``'equal'``: src[src_start:src_end] == dest[dest_start:dest_end]

    Note
    ----
    Opcode is compatible with the tuples returned by difflib's
    SequenceMatcher, which makes the two interoperable.
    """

    def __init__(self, tag, src_start, src_end, dest_start, dest_end):
        # Store the five fields in tuple order.
        self.tag, self.src_start, self.src_end, self.dest_start, self.dest_end = (
            tag,
            src_start,
            src_end,
            dest_start,
            dest_end,
        )

    def __len__(self):
        # Behaves like a fixed-size 5-tuple.
        return 5

    def _fields(self):
        # Internal: the five values in tuple order.
        return (self.tag, self.src_start, self.src_end, self.dest_start, self.dest_end)

    def __eq__(self, other):
        # Compare element-wise against any length-5 sequence; anything that
        # does not support len()/indexing simply compares unequal.
        try:
            if len(other) != 5:
                return False

            return all(mine == theirs for mine, theirs in zip(self._fields(), other))
        except TypeError:
            return False

    def __getitem__(self, i):
        # Support the same positive/negative indices a 5-tuple would.
        for indices, value in (
            ({0, -5}, self.tag),
            ({1, -4}, self.src_start),
            ({2, -3}, self.src_end),
            ({3, -2}, self.dest_start),
            ({4, -1}, self.dest_end),
        ):
            if i in indices:
                return value

        raise IndexError("Opcode index out of range")

    def __iter__(self):
        # Iterate the five fields in tuple order.
        return (self[i] for i in range(5))

    def __repr__(self):
        return (
            f"Opcode(tag={self.tag!r}, src_start={self.src_start}, src_end={self.src_end}, "
            f"dest_start={self.dest_start}, dest_end={self.dest_end})"
        )
|
||||
|
||||
|
||||
class Opcodes:
    """
    List like object of Opcodes describing how to turn s1 into s2.
    The first Opcode has src_start == dest_start == 0, and remaining tuples
    have src_start == the src_end from the tuple preceding it,
    and likewise for dest_start == the previous dest_end.
    """

    def __init__(
        self,
        opcodes=None,
        src_len=0,
        dest_len=0,
    ):
        self._src_len = src_len
        self._dest_len = dest_len
        # _list_to_opcodes is defined elsewhere in this module; it converts a
        # plain list of 5-tuples into Opcode objects and validates them.
        self._opcodes = _list_to_opcodes(opcodes, src_len, dest_len)

    @classmethod
    def from_editops(cls, editops):
        """
        Create Opcodes from Editops

        Parameters
        ----------
        editops : Editops
            editops to convert to opcodes

        Returns
        -------
        opcodes : Opcodes
            Editops converted to Opcodes
        """
        return editops.as_opcodes()

    def as_editops(self):
        """
        Convert Opcodes to Editops

        Returns
        -------
        editops : Editops
            Opcodes converted to Editops
        """
        # Build the Editops without running __init__ and expand every ranged
        # opcode into one single-position Editop per affected element.
        x = Editops.__new__(Editops)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        blocks = []
        for op in self:
            if op.tag == "replace":
                # one replace per position, advancing both sides in lockstep
                for j in range(op.src_end - op.src_start):
                    blocks.append(Editop("replace", op.src_start + j, op.dest_start + j))
            elif op.tag == "insert":
                # inserts all anchor at the same source position
                for j in range(op.dest_end - op.dest_start):
                    blocks.append(Editop("insert", op.src_start, op.dest_start + j))
            elif op.tag == "delete":
                # deletes all anchor at the same destination position
                for j in range(op.src_end - op.src_start):
                    blocks.append(Editop("delete", op.src_start + j, op.dest_start))

        x._editops = blocks
        return x

    def as_matching_blocks(self):
        """
        Convert to matching blocks

        Returns
        -------
        matching blocks : list[MatchingBlock]
            Opcodes converted to matching blocks
        """
        blocks = []
        for op in self:
            if op.tag == "equal":
                length = min(op.src_end - op.src_start, op.dest_end - op.dest_start)
                if length > 0:
                    blocks.append(MatchingBlock(op.src_start, op.dest_start, length))

        # Trailing zero-length sentinel block, mirroring difflib's convention.
        blocks.append(MatchingBlock(self.src_len, self.dest_len, 0))
        return blocks

    def as_list(self):
        """
        Convert Opcodes to a list of tuples, which is compatible
        with the opcodes of difflibs SequenceMatcher.

        This is the equivalent of ``[x for x in opcodes]``
        """
        return [tuple(op) for op in self._opcodes]

    def copy(self):
        """
        performs copy of Opcodes
        """
        # Shallow copy: the list is duplicated, the Opcode objects are shared.
        x = Opcodes.__new__(Opcodes)
        x._src_len = self._src_len
        x._dest_len = self._dest_len
        x._opcodes = self._opcodes[::]
        return x

    def inverse(self):
        """
        Invert Opcodes, so it describes how to transform the destination string to
        the source string.

        Returns
        -------
        opcodes : Opcodes
            inverted Opcodes

        Examples
        --------
        >>> from rapidfuzz.distance import Levenshtein
        >>> Levenshtein.opcodes('spam', 'park')
        [Opcode(tag=delete, src_start=0, src_end=1, dest_start=0, dest_end=0),
         Opcode(tag=equal, src_start=1, src_end=3, dest_start=0, dest_end=2),
         Opcode(tag=replace, src_start=3, src_end=4, dest_start=2, dest_end=3),
         Opcode(tag=insert, src_start=4, src_end=4, dest_start=3, dest_end=4)]

        >>> Levenshtein.opcodes('spam', 'park').inverse()
        [Opcode(tag=insert, src_start=0, src_end=0, dest_start=0, dest_end=1),
         Opcode(tag=equal, src_start=0, src_end=2, dest_start=1, dest_end=3),
         Opcode(tag=replace, src_start=2, src_end=3, dest_start=3, dest_end=4),
         Opcode(tag=delete, src_start=3, src_end=4, dest_start=4, dest_end=4)]
        """
        # Swap insert<->delete, keep replace/equal, and exchange the
        # source/destination ranges and lengths.
        blocks = []
        for op in self:
            tag = op.tag
            if tag == "delete":
                tag = "insert"
            elif tag == "insert":
                tag = "delete"

            blocks.append(Opcode(tag, op.dest_start, op.dest_end, op.src_start, op.src_end))

        x = Opcodes.__new__(Opcodes)
        x._src_len = self.dest_len
        x._dest_len = self.src_len
        x._opcodes = blocks
        return x

    def apply(self, source_string, destination_string):
        """
        apply opcodes to source_string

        Parameters
        ----------
        source_string : str | bytes
            string to apply opcodes to
        destination_string : str | bytes
            string to use for replacements / insertions into source_string

        Returns
        -------
        mod_string : str
            modified source_string

        """
        res_str = ""

        for op in self._opcodes:
            if op.tag == "equal":
                res_str += source_string[op.src_start : op.src_end]
            elif op.tag in {"replace", "insert"}:
                res_str += destination_string[op.dest_start : op.dest_end]
            # "delete" contributes nothing: the source range is simply skipped.

        return res_str

    @property
    def src_len(self):
        # Length of the source sequence these opcodes were computed from.
        return self._src_len

    @src_len.setter
    def src_len(self, value):
        self._src_len = value

    @property
    def dest_len(self):
        # Length of the destination sequence these opcodes transform into.
        return self._dest_len

    @dest_len.setter
    def dest_len(self, value):
        self._dest_len = value

    def __eq__(self, other):
        if not isinstance(other, Opcodes):
            return False

        return self.dest_len == other.dest_len and self.src_len == other.src_len and self._opcodes == other._opcodes

    def __len__(self):
        return len(self._opcodes)

    def __getitem__(self, key):
        # Unlike Editops, Opcodes does not support slicing: only plain
        # integer indexing is accepted.
        if isinstance(key, int):
            return self._opcodes[key]

        msg = "Expected index"
        raise TypeError(msg)

    def __iter__(self):
        yield from self._opcodes

    def __repr__(self):
        return (
            "Opcodes([" + ", ".join(repr(op) for op in self) + f"], src_len={self.src_len}, dest_len={self.dest_len})"
        )
|
||||
|
||||
|
||||
class ScoreAlignment:
    """
    Tuple-like object describing the position of the compared strings in
    src and dest.

    It indicates that the score has been calculated between
    src[src_start:src_end] and dest[dest_start:dest_end]
    """

    def __init__(self, score, src_start, src_end, dest_start, dest_end):
        # Store the five fields in tuple order.
        self.score, self.src_start, self.src_end, self.dest_start, self.dest_end = (
            score,
            src_start,
            src_end,
            dest_start,
            dest_end,
        )

    def __len__(self):
        # Behaves like a fixed-size 5-tuple.
        return 5

    def _fields(self):
        # Internal: the five values in tuple order.
        return (self.score, self.src_start, self.src_end, self.dest_start, self.dest_end)

    def __eq__(self, other):
        # Compare element-wise against any length-5 sequence; anything that
        # does not support len()/indexing simply compares unequal.
        try:
            if len(other) != 5:
                return False

            return all(mine == theirs for mine, theirs in zip(self._fields(), other))
        except TypeError:
            return False

    def __getitem__(self, i):
        # Support the same positive/negative indices a 5-tuple would.
        for indices, value in (
            ({0, -5}, self.score),
            ({1, -4}, self.src_start),
            ({2, -3}, self.src_end),
            ({3, -2}, self.dest_start),
            ({4, -1}, self.dest_end),
        ):
            if i in indices:
                return value

        # NOTE(review): the message says "Opcode" in the original code;
        # kept byte-identical to preserve behavior.
        raise IndexError("Opcode index out of range")

    def __iter__(self):
        # Iterate the five fields in tuple order.
        return iter(self._fields())

    def __repr__(self):
        return (
            f"ScoreAlignment(score={self.score}, src_start={self.src_start}, "
            f"src_end={self.src_end}, dest_start={self.dest_start}, dest_end={self.dest_end})"
        )
|
||||
Binary file not shown.
Binary file not shown.
@@ -0,0 +1,299 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from typing import Any, Callable
|
||||
|
||||
from rapidfuzz._utils import (
|
||||
ScorerFlag,
|
||||
add_scorer_attrs,
|
||||
default_distance_attribute as dist_attr,
|
||||
default_normalized_distance_attribute as norm_dist_attr,
|
||||
default_normalized_similarity_attribute as norm_sim_attr,
|
||||
default_similarity_attribute as sim_attr,
|
||||
)
|
||||
|
||||
# DamerauLevenshtein
|
||||
from rapidfuzz.distance.DamerauLevenshtein_py import (
|
||||
distance as damerau_levenshtein_distance,
|
||||
normalized_distance as damerau_levenshtein_normalized_distance,
|
||||
normalized_similarity as damerau_levenshtein_normalized_similarity,
|
||||
similarity as damerau_levenshtein_similarity,
|
||||
)
|
||||
|
||||
# Hamming
|
||||
from rapidfuzz.distance.Hamming_py import (
|
||||
distance as hamming_distance,
|
||||
editops as hamming_editops,
|
||||
normalized_distance as hamming_normalized_distance,
|
||||
normalized_similarity as hamming_normalized_similarity,
|
||||
opcodes as hamming_opcodes,
|
||||
similarity as hamming_similarity,
|
||||
)
|
||||
|
||||
# Indel
|
||||
from rapidfuzz.distance.Indel_py import (
|
||||
distance as indel_distance,
|
||||
editops as indel_editops,
|
||||
normalized_distance as indel_normalized_distance,
|
||||
normalized_similarity as indel_normalized_similarity,
|
||||
opcodes as indel_opcodes,
|
||||
similarity as indel_similarity,
|
||||
)
|
||||
|
||||
# Jaro
|
||||
from rapidfuzz.distance.Jaro_py import (
|
||||
distance as jaro_distance,
|
||||
normalized_distance as jaro_normalized_distance,
|
||||
normalized_similarity as jaro_normalized_similarity,
|
||||
similarity as jaro_similarity,
|
||||
)
|
||||
|
||||
# JaroWinkler
|
||||
from rapidfuzz.distance.JaroWinkler_py import (
|
||||
distance as jaro_winkler_distance,
|
||||
normalized_distance as jaro_winkler_normalized_distance,
|
||||
normalized_similarity as jaro_winkler_normalized_similarity,
|
||||
similarity as jaro_winkler_similarity,
|
||||
)
|
||||
|
||||
# LCSseq
|
||||
from rapidfuzz.distance.LCSseq_py import (
|
||||
distance as lcs_seq_distance,
|
||||
editops as lcs_seq_editops,
|
||||
normalized_distance as lcs_seq_normalized_distance,
|
||||
normalized_similarity as lcs_seq_normalized_similarity,
|
||||
opcodes as lcs_seq_opcodes,
|
||||
similarity as lcs_seq_similarity,
|
||||
)
|
||||
|
||||
# Levenshtein
|
||||
from rapidfuzz.distance.Levenshtein_py import (
|
||||
distance as levenshtein_distance,
|
||||
editops as levenshtein_editops,
|
||||
normalized_distance as levenshtein_normalized_distance,
|
||||
normalized_similarity as levenshtein_normalized_similarity,
|
||||
opcodes as levenshtein_opcodes,
|
||||
similarity as levenshtein_similarity,
|
||||
)
|
||||
|
||||
# OSA
|
||||
from rapidfuzz.distance.OSA_py import (
|
||||
distance as osa_distance,
|
||||
normalized_distance as osa_normalized_distance,
|
||||
normalized_similarity as osa_normalized_similarity,
|
||||
similarity as osa_similarity,
|
||||
)
|
||||
|
||||
# Postfix
|
||||
from rapidfuzz.distance.Postfix_py import (
|
||||
distance as postfix_distance,
|
||||
normalized_distance as postfix_normalized_distance,
|
||||
normalized_similarity as postfix_normalized_similarity,
|
||||
similarity as postfix_similarity,
|
||||
)
|
||||
|
||||
# Prefix
|
||||
from rapidfuzz.distance.Prefix_py import (
|
||||
distance as prefix_distance,
|
||||
normalized_distance as prefix_normalized_distance,
|
||||
normalized_similarity as prefix_normalized_similarity,
|
||||
similarity as prefix_similarity,
|
||||
)
|
||||
|
||||
# Public names are accumulated section by section as each metric is
# registered with its scorer attributes.
__all__ = []

# OSA (optimal string alignment)
add_scorer_attrs(osa_distance, dist_attr)
add_scorer_attrs(osa_similarity, sim_attr)
add_scorer_attrs(osa_normalized_distance, norm_dist_attr)
add_scorer_attrs(osa_normalized_similarity, norm_sim_attr)

__all__ += [
    "osa_distance",
    "osa_normalized_distance",
    "osa_normalized_similarity",
    "osa_similarity",
]


# Prefix
add_scorer_attrs(prefix_distance, dist_attr)
add_scorer_attrs(prefix_similarity, sim_attr)
add_scorer_attrs(prefix_normalized_distance, norm_dist_attr)
add_scorer_attrs(prefix_normalized_similarity, norm_sim_attr)

__all__ += [
    "prefix_distance",
    "prefix_normalized_distance",
    "prefix_normalized_similarity",
    "prefix_similarity",
]


# Postfix
add_scorer_attrs(postfix_distance, dist_attr)
add_scorer_attrs(postfix_similarity, sim_attr)
add_scorer_attrs(postfix_normalized_distance, norm_dist_attr)
add_scorer_attrs(postfix_normalized_similarity, norm_sim_attr)

__all__ += [
    "postfix_distance",
    "postfix_normalized_distance",
    "postfix_normalized_similarity",
    "postfix_similarity",
]


# Jaro
# NOTE(review): jaro_distance/jaro_similarity are registered with the
# *normalized* attribute sets, unlike the metrics above — presumably because
# Jaro scores are already normalized to [0, 1]; confirm against upstream.
add_scorer_attrs(jaro_distance, norm_dist_attr)
add_scorer_attrs(jaro_similarity, norm_sim_attr)
add_scorer_attrs(jaro_normalized_distance, norm_dist_attr)
add_scorer_attrs(jaro_normalized_similarity, norm_sim_attr)

__all__ += [
    "jaro_distance",
    "jaro_normalized_distance",
    "jaro_normalized_similarity",
    "jaro_similarity",
]


# JaroWinkler (same normalized registration as Jaro)
add_scorer_attrs(jaro_winkler_distance, norm_dist_attr)
add_scorer_attrs(jaro_winkler_similarity, norm_sim_attr)
add_scorer_attrs(jaro_winkler_normalized_distance, norm_dist_attr)
add_scorer_attrs(jaro_winkler_normalized_similarity, norm_sim_attr)

__all__ += [
    "jaro_winkler_distance",
    "jaro_winkler_normalized_distance",
    "jaro_winkler_normalized_similarity",
    "jaro_winkler_similarity",
]


# DamerauLevenshtein
add_scorer_attrs(damerau_levenshtein_distance, dist_attr)
add_scorer_attrs(damerau_levenshtein_similarity, sim_attr)
add_scorer_attrs(damerau_levenshtein_normalized_distance, norm_dist_attr)
add_scorer_attrs(damerau_levenshtein_normalized_similarity, norm_sim_attr)

__all__ += [
    "damerau_levenshtein_distance",
    "damerau_levenshtein_normalized_distance",
    "damerau_levenshtein_normalized_similarity",
    "damerau_levenshtein_similarity",
]
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_distance(weights: tuple[int, int, int] | None = (1, 1, 1)) -> dict[str, Any]:
    """Scorer metadata for the weighted Levenshtein distance.

    The scorer is symmetric only when insertion and deletion carry the
    same weight (or weights are unset).
    """
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_SIZE_T
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC

    # distance: 0 is a perfect match, the worst case is unbounded (size_t max).
    return {
        "optimal_score": 0,
        "worst_score": 2**63 - 1,
        "flags": flags,
    }
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_similarity(weights: tuple[int, int, int] | None = (1, 1, 1)) -> dict[str, Any]:
    """Scorer metadata for the weighted Levenshtein similarity.

    Symmetric only when insertion and deletion weights match.
    """
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_SIZE_T
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC

    # similarity: higher is better, so the optimum is the size_t maximum.
    return {
        "optimal_score": 2**63 - 1,
        "worst_score": 0,
        "flags": flags,
    }
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_normalized_distance(
    weights: tuple[int, int, int] | None = (1, 1, 1)
) -> dict[str, Any]:
    """Scorer metadata for the normalized Levenshtein distance (float in [0, 1])."""
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_F64
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC

    return {"optimal_score": 0, "worst_score": 1, "flags": flags}
|
||||
|
||||
|
||||
def _get_scorer_flags_levenshtein_normalized_similarity(
    weights: tuple[int, int, int] | None = (1, 1, 1)
) -> dict[str, Any]:
    """Scorer metadata for the normalized Levenshtein similarity (float in [0, 1])."""
    symmetric = weights is None or weights[0] == weights[1]
    flags = ScorerFlag.RESULT_F64
    if symmetric:
        flags |= ScorerFlag.SYMMETRIC

    return {"optimal_score": 1, "worst_score": 0, "flags": flags}
|
||||
|
||||
|
||||
# Levenshtein needs dedicated attribute dicts (instead of the shared
# defaults) because its scorer flags depend on the runtime `weights` kwarg.
levenshtein_dist_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_distance
}
levenshtein_sim_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_similarity
}
levenshtein_norm_dist_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_normalized_distance
}
levenshtein_norm_sim_attr: dict[str, Callable[..., dict[str, Any]]] = {
    "get_scorer_flags": _get_scorer_flags_levenshtein_normalized_similarity
}
|
||||
|
||||
# Levenshtein uses the weight-aware attribute dicts defined above.
add_scorer_attrs(levenshtein_distance, levenshtein_dist_attr)
add_scorer_attrs(levenshtein_similarity, levenshtein_sim_attr)
add_scorer_attrs(levenshtein_normalized_distance, levenshtein_norm_dist_attr)
add_scorer_attrs(levenshtein_normalized_similarity, levenshtein_norm_sim_attr)

__all__ += [
    "levenshtein_distance",
    "levenshtein_editops",
    "levenshtein_normalized_distance",
    "levenshtein_normalized_similarity",
    "levenshtein_opcodes",
    "levenshtein_similarity",
]


# LCSseq
add_scorer_attrs(lcs_seq_distance, dist_attr)
add_scorer_attrs(lcs_seq_similarity, sim_attr)
add_scorer_attrs(lcs_seq_normalized_distance, norm_dist_attr)
add_scorer_attrs(lcs_seq_normalized_similarity, norm_sim_attr)

__all__ += [
    "lcs_seq_distance",
    "lcs_seq_editops",
    "lcs_seq_normalized_distance",
    "lcs_seq_normalized_similarity",
    "lcs_seq_opcodes",
    "lcs_seq_similarity",
]


# Indel
add_scorer_attrs(indel_distance, dist_attr)
add_scorer_attrs(indel_similarity, sim_attr)
add_scorer_attrs(indel_normalized_distance, norm_dist_attr)
add_scorer_attrs(indel_normalized_similarity, norm_sim_attr)

__all__ += [
    "indel_distance",
    "indel_editops",
    "indel_normalized_distance",
    "indel_normalized_similarity",
    "indel_opcodes",
    "indel_similarity",
]


# Hamming
add_scorer_attrs(hamming_distance, dist_attr)
add_scorer_attrs(hamming_similarity, sim_attr)
add_scorer_attrs(hamming_normalized_distance, norm_dist_attr)
add_scorer_attrs(hamming_normalized_similarity, norm_sim_attr)

__all__ += [
    "hamming_distance",
    "hamming_editops",
    "hamming_normalized_distance",
    "hamming_normalized_similarity",
    "hamming_opcodes",
    "hamming_similarity",
]
|
||||
161
.venv/lib/python3.11/site-packages/rapidfuzz/fuzz.py
Normal file
161
.venv/lib/python3.11/site-packages/rapidfuzz/fuzz.py
Normal file
@@ -0,0 +1,161 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = [
    "QRatio",
    "WRatio",
    "partial_ratio",
    "partial_ratio_alignment",
    "partial_token_ratio",
    "partial_token_set_ratio",
    "partial_token_sort_ratio",
    "ratio",
    "token_ratio",
    "token_set_ratio",
    "token_sort_ratio",
]

# Implementation selection. RAPIDFUZZ_IMPLEMENTATION can force "cpp" or
# "python"; otherwise the best available backend is chosen: AVX2 C++ build,
# then SSE2 C++ build, then generic C++ build, then the pure-Python fallback.
# The repetition across branches is by design — this file is generated
# (see the generator note in the header).
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
if _impl == "cpp":
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )

            imported = True

    # When "cpp" is forced, the generic C++ import is NOT suppressed: a
    # missing extension should raise instead of silently falling back.
    if not imported:
        from rapidfuzz.fuzz_cpp import (  # pyright: ignore[reportMissingImports]
            QRatio,
            WRatio,
            partial_ratio,
            partial_ratio_alignment,
            partial_token_ratio,
            partial_token_set_ratio,
            partial_token_sort_ratio,
            ratio,
            token_ratio,
            token_set_ratio,
            token_sort_ratio,
        )
elif _impl == "python":
    from rapidfuzz.fuzz_py import (
        QRatio,
        WRatio,
        partial_ratio,
        partial_ratio_alignment,
        partial_token_ratio,
        partial_token_set_ratio,
        partial_token_sort_ratio,
        ratio,
        token_ratio,
        token_set_ratio,
        token_sort_ratio,
    )
else:
    # Auto-detection: same ladder as "cpp", but every C++ import failure is
    # tolerated and the pure-Python implementation is the final fallback.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )

            imported = True

    if not imported:
        with contextlib.suppress(ImportError):
            from rapidfuzz.fuzz_cpp import (  # pyright: ignore[reportMissingImports]
                QRatio,
                WRatio,
                partial_ratio,
                partial_ratio_alignment,
                partial_token_ratio,
                partial_token_set_ratio,
                partial_token_sort_ratio,
                ratio,
                token_ratio,
                token_set_ratio,
                token_sort_ratio,
            )

            imported = True

    if not imported:
        from rapidfuzz.fuzz_py import (
            QRatio,
            WRatio,
            partial_ratio,
            partial_ratio_alignment,
            partial_token_ratio,
            partial_token_set_ratio,
            partial_token_sort_ratio,
            ratio,
            token_ratio,
            token_set_ratio,
            token_sort_ratio,
        )
|
||||
189
.venv/lib/python3.11/site-packages/rapidfuzz/fuzz.pyi
Normal file
189
.venv/lib/python3.11/site-packages/rapidfuzz/fuzz.pyi
Normal file
@@ -0,0 +1,189 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2021 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import Callable, TypeVar, overload
|
||||
|
||||
from rapidfuzz.distance import ScoreAlignment
|
||||
|
||||
# Type stubs for rapidfuzz.fuzz. Each scorer has two overloads: one for
# pre-processed sequences of hashables (processor=None), and one where a
# processor callable converts the raw inputs into comparable sequences.
_UnprocessedType1 = TypeVar("_UnprocessedType1")
_UnprocessedType2 = TypeVar("_UnprocessedType2")

@overload
def ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
# partial_ratio_alignment is the only scorer that returns position
# information (or None) instead of a float score.
@overload
def partial_ratio_alignment(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> ScoreAlignment | None: ...
@overload
def partial_ratio_alignment(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> ScoreAlignment | None: ...
@overload
def token_sort_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_sort_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_set_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_set_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def token_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_sort_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_sort_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_set_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_set_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_ratio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def partial_token_ratio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def WRatio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def WRatio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def QRatio(
    s1: Sequence[Hashable],
    s2: Sequence[Hashable],
    *,
    processor: None = None,
    score_cutoff: float | None = 0,
) -> float: ...
@overload
def QRatio(
    s1: _UnprocessedType1,
    s2: _UnprocessedType2,
    *,
    processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
    score_cutoff: float | None = 0,
) -> float: ...
|
||||
Binary file not shown.
Binary file not shown.
877
.venv/lib/python3.11/site-packages/rapidfuzz/fuzz_py.py
Normal file
877
.venv/lib/python3.11/site-packages/rapidfuzz/fuzz_py.py
Normal file
@@ -0,0 +1,877 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from math import ceil
|
||||
|
||||
from rapidfuzz._common_py import conv_sequences
|
||||
from rapidfuzz._utils import ScorerFlag, add_scorer_attrs, is_none, setupPandas
|
||||
from rapidfuzz.distance import ScoreAlignment
|
||||
from rapidfuzz.distance.Indel_py import (
|
||||
_block_normalized_similarity as indel_block_normalized_similarity,
|
||||
distance as indel_distance,
|
||||
normalized_similarity as indel_normalized_similarity,
|
||||
)
|
||||
|
||||
|
||||
def get_scorer_flags_fuzz(**_kwargs):
    """Return the scorer metadata shared by every fuzz scorer.

    All fuzz scorers produce a float in [0, 100] (100 = identical) and are
    symmetric in their two arguments.
    """
    flags = ScorerFlag.RESULT_F64 | ScorerFlag.SYMMETRIC
    return {"optimal_score": 100, "worst_score": 0, "flags": flags}


# attribute dict attached to every scorer via add_scorer_attrs below
fuzz_attribute = {"get_scorer_flags": get_scorer_flags_fuzz}
|
||||
|
||||
|
||||
def _norm_distance(dist, lensum, score_cutoff):
|
||||
score = (100 - 100 * dist / lensum) if lensum else 100
|
||||
return score if score >= score_cutoff else 0
|
||||
|
||||
|
||||
def _split_sequence(seq):
|
||||
if isinstance(seq, (str, bytes)):
|
||||
return seq.split()
|
||||
|
||||
splitted_seq = [[]]
|
||||
for x in seq:
|
||||
ch = x if isinstance(x, str) else chr(x)
|
||||
if ch.isspace():
|
||||
splitted_seq.append([])
|
||||
else:
|
||||
splitted_seq[-1].append(x)
|
||||
|
||||
return [tuple(x) for x in splitted_seq if x]
|
||||
|
||||
|
||||
def _join_splitted_sequence(seq_list):
|
||||
if not seq_list:
|
||||
return ""
|
||||
if isinstance(next(iter(seq_list)), str):
|
||||
return " ".join(seq_list)
|
||||
if isinstance(next(iter(seq_list)), bytes):
|
||||
return b" ".join(seq_list)
|
||||
|
||||
joined = []
|
||||
for seq in seq_list:
|
||||
joined += seq
|
||||
joined += [ord(" ")]
|
||||
return joined[:-1]
|
||||
|
||||
|
||||
def ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates the normalized Indel similarity.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    See Also
    --------
    rapidfuzz.distance.Indel.normalized_similarity : Normalized Indel similarity

    Notes
    -----
    .. image:: img/ratio.svg

    Examples
    --------
    >>> fuzz.ratio("this is a test", "this is a test!")
    96.55171966552734
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    # the Indel backend works on a 0-1 scale, the fuzz API on 0-100
    cutoff = score_cutoff / 100 if score_cutoff is not None else None
    return 100 * indel_normalized_similarity(s1, s2, processor=processor, score_cutoff=cutoff)
|
||||
|
||||
|
||||
def _partial_ratio_impl(s1, s2, score_cutoff):
    """
    implementation of partial_ratio. This assumes len(s1) <= len(s2).

    Slides s1 over s2 and keeps the best-scoring alignment window.
    ``score_cutoff`` is expected as a fraction in [0, 1]; the returned
    ``ScoreAlignment.score`` is scaled to [0, 100].
    """
    s1_char_set = set(s1)
    len1 = len(s1)
    len2 = len(s2)

    # best alignment found so far; starts as a zero-score window at the front
    res = ScoreAlignment(0, 0, len1, 0, len1)

    # bit-parallel pattern map of s1 (one bitmask per character), reused for
    # every window comparison below
    block = {}
    block_get = block.get
    x = 1
    for ch1 in s1:
        block[ch1] = block_get(ch1, 0) | x
        x <<= 1

    # windows hanging over the left edge of s2 (prefixes of s2 shorter than s1)
    for i in range(1, len1):
        substr_last = s2[i - 1]
        # a window can only beat the current best if its last character occurs in s1
        if substr_last not in s1_char_set:
            continue

        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[:i], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            # tighten the cutoff so later windows must beat this alignment
            res.score = score_cutoff = ls_ratio
            res.dest_start = 0
            res.dest_end = i
            if res.score == 1:
                # perfect match: nothing better exists, return immediately
                res.score = 100
                return res

    # fully contained windows of length len1 inside s2
    for i in range(len2 - len1):
        substr_last = s2[i + len1 - 1]
        if substr_last not in s1_char_set:
            continue

        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[i : i + len1], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = i
            res.dest_end = i + len1
            if res.score == 1:
                res.score = 100
                return res

    # windows hanging over the right edge of s2 (suffixes of s2 shorter than s1)
    for i in range(len2 - len1, len2):
        substr_first = s2[i]
        if substr_first not in s1_char_set:
            continue

        # todo cache map
        ls_ratio = indel_block_normalized_similarity(block, s1, s2[i:], score_cutoff=score_cutoff)
        if ls_ratio > res.score:
            res.score = score_cutoff = ls_ratio
            res.dest_start = i
            res.dest_end = len2
            if res.score == 1:
                res.score = 100
                return res

    # scale the normalized similarity [0, 1] to the fuzz range [0, 100]
    res.score *= 100
    return res
|
||||
|
||||
|
||||
def partial_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Searches for the optimal alignment of the shorter string in the
    longer string and returns the fuzz.ratio for this alignment.

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    Depending on the length of the needle (shorter string) different
    implementations are used to improve the performance.

    short needle (length ≤ 64):
        When using a short needle length the fuzz.ratio is calculated for all
        alignments that could result in an optimal alignment. It is
        guaranteed to find the optimal alignment. For short needles this is very
        fast, since for them fuzz.ratio runs in ``O(N)`` time. This results in a worst
        case performance of ``O(NM)``.

        .. image:: img/partial_ratio_short_needle.svg

    long needle (length > 64):
        For long needles a similar implementation to FuzzyWuzzy is used.
        This implementation only considers alignments which start at one
        of the longest common substrings. This results in a worst case performance
        of ``O(N[N/64]M)``. However usually most of the alignments can be skipped.

        This is a lot faster than checking all possible alignments. However it
        only finds one of the best alignments and not necessarily the optimal one.

        .. image:: img/partial_ratio_long_needle.svg

    Examples
    --------
    >>> fuzz.partial_ratio("this is a test", "this is a test!")
    100.0
    """
    # delegate to the alignment variant and keep only the score
    best = partial_ratio_alignment(s1, s2, processor=processor, score_cutoff=score_cutoff)
    return best.score if best is not None else 0
|
||||
|
||||
|
||||
def partial_ratio_alignment(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Searches for the optimal alignment of the shorter string in the
    longer string and returns the fuzz.ratio and the corresponding
    alignment.

    Parameters
    ----------
    s1 : str | bytes
        First string to compare.
    s2 : str | bytes
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff None is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    alignment : ScoreAlignment, optional
        alignment between s1 and s2 with the score as a float between 0 and 100

    Examples
    --------
    >>> s1 = "a certain string"
    >>> s2 = "cetain"
    >>> res = fuzz.partial_ratio_alignment(s1, s2)
    >>> res
    ScoreAlignment(score=83.33333333333334, src_start=2, src_end=8, dest_start=0, dest_end=6)

    Using the alignment information it is possible to calculate the same fuzz.ratio

    >>> fuzz.ratio(s1[res.src_start:res.src_end], s2[res.dest_start:res.dest_end])
    83.33333333333334
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return None

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    # two empty inputs are a perfect (empty) alignment
    if not s1 and not s2:
        return ScoreAlignment(100.0, 0, 0, 0, 0)
    s1, s2 = conv_sequences(s1, s2)
    len1 = len(s1)
    len2 = len(s2)
    # _partial_ratio_impl requires the first argument to be the shorter one
    if len1 <= len2:
        shorter = s1
        longer = s2
    else:
        shorter = s2
        longer = s1

    # the impl expects the cutoff as a fraction in [0, 1]
    res = _partial_ratio_impl(shorter, longer, score_cutoff / 100)
    # for equal lengths the result is not symmetric: try the swapped order
    # too, unless the first pass already found a perfect match
    if res.score != 100 and len1 == len2:
        score_cutoff = max(score_cutoff, res.score)
        res2 = _partial_ratio_impl(longer, shorter, score_cutoff / 100)
        if res2.score > res.score:
            # swap src/dest ranges back into the caller's argument order
            res = ScoreAlignment(res2.score, res2.dest_start, res2.dest_end, res2.src_start, res2.src_end)

    if res.score < score_cutoff:
        return None

    if len1 <= len2:
        return res

    # s1 was the longer input: swap ranges so src refers to s1, dest to s2
    return ScoreAlignment(res.score, res.dest_start, res.dest_end, res.src_start, res.src_end)
|
||||
|
||||
|
||||
def token_sort_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Sorts the words in the strings and calculates the fuzz.ratio between them

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_sort_ratio.svg

    Examples
    --------
    >>> fuzz.token_sort_ratio("fuzzy wuzzy was a bear", "wuzzy fuzzy was a bear")
    100.0
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # word order is irrelevant: compare the space-joined sorted token lists
    norm1, norm2 = (
        _join_splitted_sequence(sorted(_split_sequence(seq))) for seq in (s1, s2)
    )
    return ratio(norm1, norm2, score_cutoff=score_cutoff)
|
||||
|
||||
|
||||
def token_set_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.ratio

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_set_ratio.svg

    Examples
    --------
    >>> fuzz.token_sort_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
    83.8709716796875
    >>> fuzz.token_set_ratio("fuzzy was a bear", "fuzzy fuzzy was a bear")
    100.0
    # Returns 100.0 if one string is a subset of the other, regardless of extra content in the longer string
    >>> fuzz.token_set_ratio("fuzzy was a bear but not a dog", "fuzzy was a bear")
    100.0
    # Score is reduced only when there is explicit disagreement in the two strings
    >>> fuzz.token_set_ratio("fuzzy was a bear but not a dog", "fuzzy was a bear but not a cat")
    92.3076923076923
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)

    tokens_a = set(_split_sequence(s1))
    tokens_b = set(_split_sequence(s2))

    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not tokens_a or not tokens_b:
        return 0

    intersect = tokens_a.intersection(tokens_b)
    diff_ab = tokens_a.difference(tokens_b)
    diff_ba = tokens_b.difference(tokens_a)

    # one sentence is part of the other one
    if intersect and (not diff_ab or not diff_ba):
        return 100

    diff_ab_joined = _join_splitted_sequence(sorted(diff_ab))
    diff_ba_joined = _join_splitted_sequence(sorted(diff_ba))

    ab_len = len(diff_ab_joined)
    ba_len = len(diff_ba_joined)
    # todo is length sum without joining faster?
    sect_len = len(_join_splitted_sequence(intersect))

    # string length sect+ab <-> sect and sect+ba <-> sect
    # (sect_len != 0) accounts for the joining space between sect and the diff part
    sect_ab_len = sect_len + (sect_len != 0) + ab_len
    sect_ba_len = sect_len + (sect_len != 0) + ba_len

    result = 0.0
    # largest Indel distance that can still reach score_cutoff
    cutoff_distance = ceil((sect_ab_len + sect_ba_len) * (1 - score_cutoff / 100))
    # ratio of the two difference-only strings
    dist = indel_distance(diff_ab_joined, diff_ba_joined, score_cutoff=cutoff_distance)

    if dist <= cutoff_distance:
        result = _norm_distance(dist, sect_ab_len + sect_ba_len, score_cutoff)

    # exit early since the other ratios are 0
    if not sect_len:
        return result

    # levenshtein distance sect+ab <-> sect and sect+ba <-> sect
    # since only sect is similar in them the distance can be calculated based on
    # the length difference
    sect_ab_dist = (sect_len != 0) + ab_len
    sect_ab_ratio = _norm_distance(sect_ab_dist, sect_len + sect_ab_len, score_cutoff)

    sect_ba_dist = (sect_len != 0) + ba_len
    sect_ba_ratio = _norm_distance(sect_ba_dist, sect_len + sect_ba_len, score_cutoff)

    return max(result, sect_ab_ratio, sect_ba_ratio)
|
||||
|
||||
|
||||
def token_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Helper method that returns the maximum of fuzz.token_set_ratio and fuzz.token_sort_ratio
    (faster than manually executing the two functions)

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/token_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # todo write combined implementation
    # the processor already ran above, so it is disabled for the sub-scorers
    set_score = token_set_ratio(s1, s2, processor=None, score_cutoff=score_cutoff)
    sort_score = token_sort_ratio(s1, s2, processor=None, score_cutoff=score_cutoff)
    return set_score if set_score >= sort_score else sort_score
|
||||
|
||||
|
||||
def partial_token_sort_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    sorts the words in the strings and calculates the fuzz.partial_ratio between them

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_sort_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)
    # word order is irrelevant: align the space-joined sorted token lists
    norm1, norm2 = (
        _join_splitted_sequence(sorted(_split_sequence(seq))) for seq in (s1, s2)
    )
    return partial_ratio(norm1, norm2, score_cutoff=score_cutoff)
|
||||
|
||||
|
||||
def partial_token_set_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Compares the words in the strings based on unique and common words between them
    using fuzz.partial_ratio

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_set_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    s1, s2 = conv_sequences(s1, s2)

    words_a = set(_split_sequence(s1))
    words_b = set(_split_sequence(s2))
    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not words_a or not words_b:
        return 0

    # exit early when there is a common word in both sequences
    if words_a & words_b:
        return 100

    only_a = _join_splitted_sequence(sorted(words_a - words_b))
    only_b = _join_splitted_sequence(sorted(words_b - words_a))
    return partial_ratio(only_a, only_b, score_cutoff=score_cutoff)
|
||||
|
||||
|
||||
def partial_token_ratio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Helper method that returns the maximum of fuzz.partial_token_set_ratio and
    fuzz.partial_token_sort_ratio (faster than manually executing the two functions)

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/partial_token_ratio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    if score_cutoff is None:
        score_cutoff = 0

    s1, s2 = conv_sequences(s1, s2)

    tokens_split_a = _split_sequence(s1)
    tokens_split_b = _split_sequence(s2)
    tokens_a = set(tokens_split_a)
    tokens_b = set(tokens_split_b)

    # exit early when there is a common word in both sequences
    # (partial_token_set_ratio would be 100 in that case)
    if tokens_a.intersection(tokens_b):
        return 100

    diff_ab = tokens_a.difference(tokens_b)
    diff_ba = tokens_b.difference(tokens_a)

    # sorted full token lists: equivalent to partial_token_sort_ratio
    result = partial_ratio(
        _join_splitted_sequence(sorted(tokens_split_a)),
        _join_splitted_sequence(sorted(tokens_split_b)),
        score_cutoff=score_cutoff,
    )

    # do not calculate the same partial_ratio twice
    # (with no shared words, equal lengths mean neither side has duplicate
    # tokens, so the difference sets equal the full token lists)
    if len(tokens_split_a) == len(diff_ab) and len(tokens_split_b) == len(diff_ba):
        return result

    score_cutoff = max(score_cutoff, result)
    # unique-difference comparison: equivalent to partial_token_set_ratio
    return max(
        result,
        partial_ratio(
            _join_splitted_sequence(sorted(diff_ab)),
            _join_splitted_sequence(sorted(diff_ba)),
            score_cutoff=score_cutoff,
        ),
    )
|
||||
|
||||
|
||||
def WRatio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a weighted ratio based on the other ratio algorithms

    Parameters
    ----------
    s1 : str
        First string to compare.
    s2 : str
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Notes
    -----
    .. image:: img/WRatio.svg
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    # weight applied to the token-based sub-scores
    UNBASE_SCALE = 0.95

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)

    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not s1 or not s2:
        return 0

    if score_cutoff is None:
        score_cutoff = 0

    len1 = len(s1)
    len2 = len(s2)
    # longer length divided by shorter length, always >= 1
    len_ratio = len1 / len2 if len1 > len2 else len2 / len1

    end_ratio = ratio(s1, s2, score_cutoff=score_cutoff)
    if len_ratio < 1.5:
        # similar lengths: mix in only the token-based ratios.
        # dividing the cutoff by the scale lets a scaled result still pass it
        score_cutoff = max(score_cutoff, end_ratio) / UNBASE_SCALE
        return max(
            end_ratio,
            token_ratio(s1, s2, score_cutoff=score_cutoff, processor=None) * UNBASE_SCALE,
        )

    # clearly different lengths: mix in partial ratios, down-weighted further
    # when the length difference is large
    PARTIAL_SCALE = 0.9 if len_ratio <= 8.0 else 0.6
    score_cutoff = max(score_cutoff, end_ratio) / PARTIAL_SCALE
    end_ratio = max(end_ratio, partial_ratio(s1, s2, score_cutoff=score_cutoff) * PARTIAL_SCALE)

    score_cutoff = max(score_cutoff, end_ratio) / UNBASE_SCALE
    return max(
        end_ratio,
        partial_token_ratio(s1, s2, score_cutoff=score_cutoff, processor=None) * UNBASE_SCALE * PARTIAL_SCALE,
    )
|
||||
|
||||
|
||||
def QRatio(
    s1,
    s2,
    *,
    processor=None,
    score_cutoff=None,
):
    """
    Calculates a quick ratio between two strings using fuzz.ratio.

    Since v3.0 this behaves similar to fuzz.ratio with the exception that this
    returns 0 when comparing two empty strings

    Parameters
    ----------
    s1 : Sequence[Hashable]
        First string to compare.
    s2 : Sequence[Hashable]
        Second string to compare.
    processor: callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : float, optional
        Optional argument for a score threshold as a float between 0 and 100.
        For ratio < score_cutoff 0 is returned instead. Default is 0,
        which deactivates this behaviour.

    Returns
    -------
    similarity : float
        similarity between s1 and s2 as a float between 0 and 100

    Examples
    --------
    >>> fuzz.QRatio("this is a test", "this is a test!")
    96.55171966552734
    """
    setupPandas()
    if is_none(s1) or is_none(s2):
        return 0

    if processor is not None:
        s1 = processor(s1)
        s2 = processor(s2)
    # in FuzzyWuzzy this returns 0. For sake of compatibility return 0 here as well
    # see https://github.com/rapidfuzz/RapidFuzz/issues/110
    if not (s1 and s2):
        return 0

    return ratio(s1, s2, score_cutoff=score_cutoff)
|
||||
|
||||
|
||||
# attach the shared scorer metadata to every public fuzz scorer
for _scorer in (
    ratio,
    partial_ratio,
    token_sort_ratio,
    token_set_ratio,
    token_ratio,
    partial_token_sort_ratio,
    partial_token_set_ratio,
    partial_token_ratio,
    WRatio,
    QRatio,
):
    add_scorer_attrs(_scorer, fuzz_attribute)
del _scorer
|
||||
95
.venv/lib/python3.11/site-packages/rapidfuzz/process.py
Normal file
95
.venv/lib/python3.11/site-packages/rapidfuzz/process.py
Normal file
@@ -0,0 +1,95 @@
|
||||
# SPDX-License-Identifier: MIT
# Copyright (C) 2025 Max Bachmann
# This file is generated by tools/generate_python.py
from __future__ import annotations

import contextlib
import os

from rapidfuzz._feature_detector import AVX2, SSE2, supports

__all__ = ["cdist", "cpdist", "extract", "extractOne", "extract_iter"]

# backend override: "cpp" forces the C++ extension, "python" forces the
# pure-Python fallback, anything else selects the best available backend
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")

if _impl == "cpp":
    # C++ backend explicitly requested: try the most specific SIMD build
    # first (AVX2, then SSE2). The final plain process_cpp import is NOT
    # wrapped in suppress(), so a missing C++ extension raises ImportError.
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True

    if not imported:
        from rapidfuzz.process_cpp import (  # pyright: ignore[reportMissingImports]
            cdist,
            cpdist,
            extract,
            extract_iter,
            extractOne,
        )
elif _impl == "python":
    # pure-Python backend explicitly requested
    from rapidfuzz.process_py import cdist, cpdist, extract, extract_iter, extractOne
else:
    # no explicit choice: prefer the fastest available C++ build and fall
    # back to the pure-Python implementation when no extension can be loaded
    imported = False
    if supports(AVX2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_avx2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True

    if not imported and supports(SSE2):
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp_sse2 import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True

    if not imported:
        # unlike the "cpp" branch this suppresses ImportError, because the
        # pure-Python implementation below is still available as a fallback
        with contextlib.suppress(ImportError):
            from rapidfuzz.process_cpp import (  # pyright: ignore[reportMissingImports]
                cdist,
                cpdist,
                extract,
                extract_iter,
                extractOne,
            )

            imported = True

    if not imported:
        from rapidfuzz.process_py import (
            cdist,
            cpdist,
            extract,
            extract_iter,
            extractOne,
        )
|
||||
430
.venv/lib/python3.11/site-packages/rapidfuzz/process.pyi
Normal file
430
.venv/lib/python3.11/site-packages/rapidfuzz/process.pyi
Normal file
@@ -0,0 +1,430 @@
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Collection, Generator, Hashable, Iterable, Mapping, Sequence
|
||||
from typing import (
|
||||
Any,
|
||||
Callable,
|
||||
Protocol,
|
||||
TypeVar,
|
||||
overload,
|
||||
)
|
||||
|
||||
from rapidfuzz.fuzz import WRatio, ratio
|
||||
|
||||
_StringType = Sequence[Hashable]
|
||||
_StringType1 = TypeVar("_StringType1", bound=Sequence[Hashable])
|
||||
_StringType2 = TypeVar("_StringType2", bound=Sequence[Hashable])
|
||||
_UnprocessedType1 = TypeVar("_UnprocessedType1")
|
||||
_UnprocessedType2 = TypeVar("_UnprocessedType2")
|
||||
_KeyType = TypeVar("_KeyType")
|
||||
_ResultType = TypeVar("_ResultType", int, float)
|
||||
|
||||
_StringType1_contra = TypeVar("_StringType1_contra", contravariant=True, bound=Sequence[Hashable])
|
||||
_StringType2_contra = TypeVar("_StringType2_contra", contravariant=True, bound=Sequence[Hashable])
|
||||
_ResultType_contra = TypeVar("_ResultType_contra", int, float, contravariant=True)
|
||||
_ResultType_co = TypeVar("_ResultType_co", int, float, covariant=True)
|
||||
|
||||
class _Scorer(Protocol[_StringType1_contra, _StringType2_contra, _ResultType_contra, _ResultType_co]):
|
||||
def __call__(
|
||||
self, __s1: _StringType1_contra, __s2: _StringType2_contra, *, score_cutoff: _ResultType_contra | None
|
||||
) -> _ResultType_co: ...
|
||||
|
||||
# mypy wants defaults to be valid for every possible parameterization of a generic function
|
||||
# so add separate overloads for the default version
|
||||
@overload
|
||||
def extractOne(
|
||||
query: Sequence[Hashable] | None,
|
||||
choices: Mapping[_KeyType, _StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_StringType2, float, _KeyType] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: Sequence[Hashable] | None,
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_StringType2, float, int] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Mapping[_KeyType, _UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_UnprocessedType2, float, _KeyType] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_UnprocessedType2, float, int] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: _StringType1 | None,
|
||||
choices: Mapping[_KeyType, _StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_StringType2, _ResultType, _KeyType] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: _StringType1 | None,
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_StringType2, _ResultType, int] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Mapping[_KeyType, _UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_UnprocessedType2, _ResultType, _KeyType] | None: ...
|
||||
@overload
|
||||
def extractOne(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> tuple[_UnprocessedType2, _ResultType, int] | None: ...
|
||||
|
||||
# mypy wants defaults to be valid for every possible parameterization of a generic function
|
||||
# so add separate overloads for the default version
|
||||
@overload
|
||||
def extract(
|
||||
query: Sequence[Hashable] | None,
|
||||
choices: Mapping[_KeyType, _StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: None = None,
|
||||
limit: int | None = 5,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_StringType2, float, _KeyType]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: Sequence[Hashable] | None,
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: None = None,
|
||||
limit: int | None = 5,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_StringType2, float, int]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Mapping[_KeyType, _UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
limit: int | None = 5,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_UnprocessedType2, float, _KeyType]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
limit: int | None = 5,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_UnprocessedType2, float, int]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: _StringType1 | None,
|
||||
choices: Mapping[_KeyType, _StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
limit: int | None = 5,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_StringType2, _ResultType, _KeyType]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: _StringType1 | None,
|
||||
choices: Collection[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
limit: int | None = 5,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_StringType2, _ResultType, int]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Mapping[_KeyType, _UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
limit: int | None = 5,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_UnprocessedType2, _ResultType, _KeyType]]: ...
|
||||
@overload
|
||||
def extract(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Collection[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
limit: int | None = 5,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> list[tuple[_UnprocessedType2, _ResultType, int]]: ...
|
||||
|
||||
# mypy wants defaults to be valid for every possible parameterization of a generic function
|
||||
# so add separate overloads for the default version
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: Sequence[Hashable] | None,
|
||||
choices: Mapping[_KeyType, _StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_StringType2, float, _KeyType], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: Sequence[Hashable] | None,
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_StringType2, float, int], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Mapping[_KeyType, _UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_UnprocessedType2, float, _KeyType], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = WRatio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_UnprocessedType2, float, int], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: _StringType1 | None,
|
||||
choices: Mapping[_KeyType, _StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_StringType2, _ResultType, _KeyType], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: _StringType1 | None,
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_StringType2, _ResultType, int], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Mapping[_KeyType, _UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_UnprocessedType2, _ResultType, _KeyType], None, None]: ...
|
||||
@overload
|
||||
def extract_iter(
|
||||
query: _UnprocessedType1 | None,
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> Generator[tuple[_UnprocessedType2, _ResultType, int], None, None]: ...
|
||||
|
||||
try:
|
||||
import numpy.typing as npt
|
||||
|
||||
@overload
|
||||
def cdist(
|
||||
queries: Iterable[Sequence[Hashable] | None],
|
||||
choices: Iterable[Sequence[Hashable] | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
score_multiplier: float = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cdist(
|
||||
queries: Iterable[_UnprocessedType1 | None],
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
score_multiplier: float = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cdist(
|
||||
queries: Iterable[_StringType1 | None],
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
score_multiplier: _ResultType = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cdist(
|
||||
queries: Iterable[_UnprocessedType1 | None],
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
score_multiplier: _ResultType = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cpdist(
|
||||
queries: Iterable[Sequence[Hashable] | None],
|
||||
choices: Iterable[Sequence[Hashable] | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
|
||||
processor: None = None,
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
score_multiplier: float = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cpdist(
|
||||
queries: Iterable[_UnprocessedType1 | None],
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[Sequence[Hashable], Sequence[Hashable], float, float] = ratio,
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], Sequence[Hashable]],
|
||||
score_cutoff: float | None = None,
|
||||
score_hint: float | None = None,
|
||||
score_multiplier: float = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cpdist(
|
||||
queries: Iterable[_StringType1 | None],
|
||||
choices: Iterable[_StringType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType2, _ResultType, _ResultType],
|
||||
processor: None = None,
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
score_multiplier: _ResultType = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
@overload
|
||||
def cpdist(
|
||||
queries: Iterable[_UnprocessedType1 | None],
|
||||
choices: Iterable[_UnprocessedType2 | None],
|
||||
*,
|
||||
scorer: _Scorer[_StringType1, _StringType1, _ResultType, _ResultType],
|
||||
processor: Callable[[_UnprocessedType1 | _UnprocessedType2], _StringType1],
|
||||
score_cutoff: _ResultType | None = None,
|
||||
score_hint: _ResultType | None = None,
|
||||
score_multiplier: _ResultType = 1,
|
||||
dtype: npt.DTypeLike | None = None,
|
||||
workers: int = 1,
|
||||
scorer_kwargs: dict[str, Any] | None = None,
|
||||
) -> npt.NDArray[Any]: ...
|
||||
|
||||
except ImportError:
|
||||
pass
|
||||
125
.venv/lib/python3.11/site-packages/rapidfuzz/process_cpp.py
Normal file
125
.venv/lib/python3.11/site-packages/rapidfuzz/process_cpp.py
Normal file
@@ -0,0 +1,125 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
from rapidfuzz.fuzz import ratio
|
||||
from rapidfuzz.process_cpp_impl import (
|
||||
FLOAT32 as _FLOAT32,
|
||||
FLOAT64 as _FLOAT64,
|
||||
INT8 as _INT8,
|
||||
INT16 as _INT16,
|
||||
INT32 as _INT32,
|
||||
INT64 as _INT64,
|
||||
UINT8 as _UINT8,
|
||||
UINT16 as _UINT16,
|
||||
UINT32 as _UINT32,
|
||||
UINT64 as _UINT64,
|
||||
cdist as _cdist,
|
||||
cpdist as _cpdist,
|
||||
extract,
|
||||
extract_iter,
|
||||
extractOne,
|
||||
)
|
||||
|
||||
__all__ = ["cdist", "cpdist", "extract", "extractOne", "extract_iter"]
|
||||
|
||||
|
||||
def _dtype_to_type_num(dtype):
|
||||
import numpy as np
|
||||
|
||||
if dtype is None:
|
||||
return None
|
||||
|
||||
dtype = np.dtype(dtype)
|
||||
if dtype == np.int32:
|
||||
return _INT32
|
||||
if dtype == np.int8:
|
||||
return _INT8
|
||||
if dtype == np.int16:
|
||||
return _INT16
|
||||
if dtype == np.int64:
|
||||
return _INT64
|
||||
if dtype == np.uint8:
|
||||
return _UINT8
|
||||
if dtype == np.uint16:
|
||||
return _UINT16
|
||||
if dtype == np.uint32:
|
||||
return _UINT32
|
||||
if dtype == np.uint64:
|
||||
return _UINT64
|
||||
if dtype == np.float32:
|
||||
return _FLOAT32
|
||||
if dtype == np.float64:
|
||||
return _FLOAT64
|
||||
|
||||
msg = f"unsupported dtype: {dtype}"
|
||||
raise TypeError(msg)
|
||||
|
||||
|
||||
def cdist(
|
||||
queries,
|
||||
choices,
|
||||
*,
|
||||
scorer=ratio,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
score_multiplier=1,
|
||||
dtype=None,
|
||||
workers=1,
|
||||
**kwargs,
|
||||
):
|
||||
import numpy as np
|
||||
|
||||
dtype = _dtype_to_type_num(dtype)
|
||||
return np.asarray(
|
||||
_cdist(
|
||||
queries,
|
||||
choices,
|
||||
scorer=scorer,
|
||||
processor=processor,
|
||||
score_cutoff=score_cutoff,
|
||||
score_hint=score_hint,
|
||||
score_multiplier=score_multiplier,
|
||||
dtype=dtype,
|
||||
workers=workers,
|
||||
**kwargs,
|
||||
)
|
||||
)
|
||||
|
||||
|
||||
cdist.__doc__ = _cdist.__doc__
|
||||
|
||||
|
||||
def cpdist(
|
||||
queries,
|
||||
choices,
|
||||
*,
|
||||
scorer=ratio,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
score_multiplier=1,
|
||||
dtype=None,
|
||||
workers=1,
|
||||
**kwargs,
|
||||
):
|
||||
import numpy as np
|
||||
|
||||
dtype = _dtype_to_type_num(dtype)
|
||||
distance_matrix = _cpdist(
|
||||
queries,
|
||||
choices,
|
||||
scorer=scorer,
|
||||
processor=processor,
|
||||
score_cutoff=score_cutoff,
|
||||
score_hint=score_hint,
|
||||
score_multiplier=score_multiplier,
|
||||
dtype=dtype,
|
||||
workers=workers,
|
||||
**kwargs,
|
||||
)
|
||||
return np.asarray(distance_matrix)
|
||||
|
||||
|
||||
cpdist.__doc__ = _cpdist.__doc__
|
||||
Binary file not shown.
679
.venv/lib/python3.11/site-packages/rapidfuzz/process_py.py
Normal file
679
.venv/lib/python3.11/site-packages/rapidfuzz/process_py.py
Normal file
@@ -0,0 +1,679 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
from __future__ import annotations
|
||||
|
||||
import heapq
|
||||
|
||||
from rapidfuzz._utils import ScorerFlag, is_none, setupPandas
|
||||
from rapidfuzz.fuzz import WRatio, ratio
|
||||
|
||||
__all__ = ["cdist", "extract", "extractOne", "extract_iter"]
|
||||
|
||||
|
||||
def _get_scorer_flags_py(scorer, scorer_kwargs):
|
||||
params = getattr(scorer, "_RF_ScorerPy", None)
|
||||
if params is not None:
|
||||
flags = params["get_scorer_flags"](**scorer_kwargs)
|
||||
return (flags["worst_score"], flags["optimal_score"])
|
||||
return (0, 100)
|
||||
|
||||
|
||||
def extract_iter(
|
||||
query,
|
||||
choices,
|
||||
*,
|
||||
scorer=WRatio,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
scorer_kwargs=None,
|
||||
):
|
||||
"""
|
||||
Find the best match in a list of choices
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : Sequence[Hashable]
|
||||
string we want to find
|
||||
choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
|
||||
list of all strings the query should be compared with or dict with a mapping
|
||||
{<result>: <string to compare>}
|
||||
scorer : Callable, optional
|
||||
Optional callable that is used to calculate the matching score between
|
||||
the query and each choice. This can be any of the scorers included in RapidFuzz
|
||||
(both scorers that calculate the edit distance or the normalized edit distance), or
|
||||
a custom function, which returns a normalized edit distance.
|
||||
fuzz.WRatio is used by default.
|
||||
processor : Callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : Any, optional
|
||||
Optional argument for a score threshold. When an edit distance is used this represents the maximum
|
||||
edit distance and matches with a `distance > score_cutoff` are ignored. When a
|
||||
normalized edit distance is used this represents the minimal similarity
|
||||
and matches with a `similarity < score_cutoff` are ignored. Default is None, which deactivates this behaviour.
|
||||
score_hint : Any, optional
|
||||
Optional argument for an expected score to be passed to the scorer.
|
||||
This is used to select a faster implementation. Default is None,
|
||||
which deactivates this behaviour.
|
||||
scorer_kwargs : dict[str, Any], optional
|
||||
any other named parameters are passed to the scorer. This can be used to pass
|
||||
e.g. weights to `Levenshtein.distance`
|
||||
|
||||
Yields
|
||||
-------
|
||||
tuple[Sequence[Hashable], Any, Any]
|
||||
Yields similarity between the query and each choice in form of a Tuple with 3 elements.
|
||||
The values stored in the tuple depend on the types of the input arguments.
|
||||
|
||||
* The first element is always the current `choice`, which is the value that's compared to the query.
|
||||
|
||||
* The second value represents the similarity calculated by the scorer. This can be:
|
||||
|
||||
* An edit distance (distance is 0 for a perfect match and > 0 for non perfect matches).
|
||||
In this case only choices which have a `distance <= score_cutoff` are yielded.
|
||||
An example of a scorer with this behavior is `Levenshtein.distance`.
|
||||
* A normalized edit distance (similarity is a score between 0 and 100, with 100 being a perfect match).
|
||||
In this case only choices which have a `similarity >= score_cutoff` are yielded.
|
||||
An example of a scorer with this behavior is `Levenshtein.normalized_similarity`.
|
||||
|
||||
Note, that for all scorers, which are not provided by RapidFuzz, only normalized edit distances are supported.
|
||||
|
||||
* The third parameter depends on the type of the `choices` argument it is:
|
||||
|
||||
* The `index of choice` when choices is a simple iterable like a list
|
||||
* The `key of choice` when choices is a mapping like a dict, or a pandas Series
|
||||
|
||||
"""
|
||||
_ = score_hint
|
||||
scorer_kwargs = scorer_kwargs or {}
|
||||
worst_score, optimal_score = _get_scorer_flags_py(scorer, scorer_kwargs)
|
||||
lowest_score_worst = optimal_score > worst_score
|
||||
|
||||
setupPandas()
|
||||
|
||||
if is_none(query):
|
||||
return
|
||||
|
||||
if score_cutoff is None:
|
||||
score_cutoff = worst_score
|
||||
|
||||
# preprocess the query
|
||||
if processor is not None:
|
||||
query = processor(query)
|
||||
|
||||
choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices)
|
||||
for key, choice in choices_iter:
|
||||
if is_none(choice):
|
||||
continue
|
||||
|
||||
if processor is None:
|
||||
score = scorer(query, choice, score_cutoff=score_cutoff, **scorer_kwargs)
|
||||
else:
|
||||
score = scorer(
|
||||
query,
|
||||
processor(choice),
|
||||
score_cutoff=score_cutoff,
|
||||
**scorer_kwargs,
|
||||
)
|
||||
|
||||
if lowest_score_worst:
|
||||
if score >= score_cutoff:
|
||||
yield (choice, score, key)
|
||||
else:
|
||||
if score <= score_cutoff:
|
||||
yield (choice, score, key)
|
||||
|
||||
|
||||
def extractOne(
|
||||
query,
|
||||
choices,
|
||||
*,
|
||||
scorer=WRatio,
|
||||
processor=None,
|
||||
score_cutoff=None,
|
||||
score_hint=None,
|
||||
scorer_kwargs=None,
|
||||
):
|
||||
"""
|
||||
Find the best match in a list of choices. When multiple elements have the same similarity,
|
||||
the first element is returned.
|
||||
|
||||
Parameters
|
||||
----------
|
||||
query : Sequence[Hashable]
|
||||
string we want to find
|
||||
choices : Iterable[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
|
||||
list of all strings the query should be compared with or dict with a mapping
|
||||
{<result>: <string to compare>}
|
||||
scorer : Callable, optional
|
||||
Optional callable that is used to calculate the matching score between
|
||||
the query and each choice. This can be any of the scorers included in RapidFuzz
|
||||
(both scorers that calculate the edit distance or the normalized edit distance), or
|
||||
a custom function, which returns a normalized edit distance.
|
||||
fuzz.WRatio is used by default.
|
||||
processor : Callable, optional
|
||||
Optional callable that is used to preprocess the strings before
|
||||
comparing them. Default is None, which deactivates this behaviour.
|
||||
score_cutoff : Any, optional
|
||||
Optional argument for a score threshold. When an edit distance is used this represents the maximum
|
||||
edit distance and matches with a `distance > score_cutoff` are ignored. When a
|
||||
normalized edit distance is used this represents the minimal similarity
|
||||
and matches with a `similarity < score_cutoff` are ignored. Default is None, which deactivates this behaviour.
|
||||
score_hint : Any, optional
|
||||
Optional argument for an expected score to be passed to the scorer.
|
||||
This is used to select a faster implementation. Default is None,
|
||||
which deactivates this behaviour.
|
||||
scorer_kwargs : dict[str, Any], optional
|
||||
any other named parameters are passed to the scorer. This can be used to pass
|
||||
e.g. weights to `Levenshtein.distance`
|
||||
|
||||
Returns
|
||||
-------
|
||||
tuple[Sequence[Hashable], Any, Any]
|
||||
Returns the best match in form of a Tuple with 3 elements. The values stored in the
|
||||
tuple depend on the types of the input arguments.
|
||||
|
||||
* The first element is always the `choice`, which is the value that's compared to the query.
|
||||
|
||||
* The second value represents the similarity calculated by the scorer. This can be:
|
||||
|
||||
* An edit distance (distance is 0 for a perfect match and > 0 for non perfect matches).
|
||||
In this case only choices which have a `distance <= score_cutoff` are returned.
|
||||
An example of a scorer with this behavior is `Levenshtein.distance`.
|
||||
* A normalized edit distance (similarity is a score between 0 and 100, with 100 being a perfect match).
|
||||
In this case only choices which have a `similarity >= score_cutoff` are returned.
|
||||
An example of a scorer with this behavior is `Levenshtein.normalized_similarity`.
|
||||
|
||||
Note, that for all scorers, which are not provided by RapidFuzz, only normalized edit distances are supported.
|
||||
|
||||
* The third parameter depends on the type of the `choices` argument it is:
|
||||
|
||||
* The `index of choice` when choices is a simple iterable like a list
|
||||
* The `key of choice` when choices is a mapping like a dict, or a pandas Series
|
||||
|
||||
None
|
||||
When no choice has a `similarity >= score_cutoff`/`distance <= score_cutoff` None is returned
|
||||
|
||||
Examples
|
||||
--------
|
||||
|
||||
>>> from rapidfuzz.process import extractOne
|
||||
>>> from rapidfuzz.distance import Levenshtein
|
||||
>>> from rapidfuzz.fuzz import ratio
|
||||
|
||||
extractOne can be used with normalized edit distances.
|
||||
|
||||
>>> extractOne("abcd", ["abce"], scorer=ratio)
|
||||
("abcd", 75.0, 1)
|
||||
>>> extractOne("abcd", ["abce"], scorer=Levenshtein.normalized_similarity)
|
||||
("abcd", 0.75, 1)
|
||||
|
||||
extractOne can be used with edit distances as well.
|
||||
|
||||
>>> extractOne("abcd", ["abce"], scorer=Levenshtein.distance)
|
||||
("abce", 1, 0)
|
||||
|
||||
additional settings of the scorer can be passed via the scorer_kwargs argument to extractOne
|
||||
|
||||
>>> extractOne("abcd", ["abce"], scorer=Levenshtein.distance, scorer_kwargs={"weights":(1,1,2)})
|
||||
("abcde", 2, 1)
|
||||
|
||||
when a mapping is used for the choices the key of the choice is returned instead of the List index
|
||||
|
||||
>>> extractOne("abcd", {"key": "abce"}, scorer=ratio)
|
||||
("abcd", 75.0, "key")
|
||||
|
||||
It is possible to specify a processor function which is used to preprocess the strings before comparing them.
|
||||
|
||||
>>> extractOne("abcd", ["abcD"], scorer=ratio)
|
||||
("abcD", 75.0, 0)
|
||||
>>> extractOne("abcd", ["abcD"], scorer=ratio, processor=utils.default_process)
|
||||
("abcD", 100.0, 0)
|
||||
>>> extractOne("abcd", ["abcD"], scorer=ratio, processor=lambda s: s.upper())
|
||||
("abcD", 100.0, 0)
|
||||
|
||||
When only results with a similarity above a certain threshold are relevant, the parameter score_cutoff can be
|
||||
used to filter out results with a lower similarity. This threshold is used by some of the scorers to exit early,
|
||||
when they are sure, that the similarity is below the threshold.
|
||||
For normalized edit distances all results with a similarity below score_cutoff are filtered out
|
||||
|
||||
>>> extractOne("abcd", ["abce"], scorer=ratio)
|
||||
("abce", 75.0, 0)
|
||||
>>> extractOne("abcd", ["abce"], scorer=ratio, score_cutoff=80)
|
||||
None
|
||||
|
||||
For edit distances all results with an edit distance above the score_cutoff are filtered out
|
||||
|
||||
>>> extractOne("abcd", ["abce"], scorer=Levenshtein.distance, scorer_kwargs={"weights":(1,1,2)})
|
||||
("abce", 2, 0)
|
||||
>>> extractOne("abcd", ["abce"], scorer=Levenshtein.distance, scorer_kwargs={"weights":(1,1,2)}, score_cutoff=1)
|
||||
None
|
||||
|
||||
"""
|
||||
_ = score_hint
|
||||
scorer_kwargs = scorer_kwargs or {}
|
||||
worst_score, optimal_score = _get_scorer_flags_py(scorer, scorer_kwargs)
|
||||
lowest_score_worst = optimal_score > worst_score
|
||||
|
||||
setupPandas()
|
||||
|
||||
if is_none(query):
|
||||
return None
|
||||
|
||||
if score_cutoff is None:
|
||||
score_cutoff = worst_score
|
||||
|
||||
# preprocess the query
|
||||
if processor is not None:
|
||||
query = processor(query)
|
||||
|
||||
result = None
|
||||
|
||||
choices_iter = choices.items() if hasattr(choices, "items") else enumerate(choices)
|
||||
for key, choice in choices_iter:
|
||||
if is_none(choice):
|
||||
continue
|
||||
|
||||
if processor is None:
|
||||
score = scorer(query, choice, score_cutoff=score_cutoff, **scorer_kwargs)
|
||||
else:
|
||||
score = scorer(
|
||||
query,
|
||||
processor(choice),
|
||||
score_cutoff=score_cutoff,
|
||||
**scorer_kwargs,
|
||||
)
|
||||
|
||||
if lowest_score_worst:
|
||||
if score >= score_cutoff and (result is None or score > result[1]):
|
||||
score_cutoff = score
|
||||
result = (choice, score, key)
|
||||
else:
|
||||
if score <= score_cutoff and (result is None or score < result[1]):
|
||||
score_cutoff = score
|
||||
result = (choice, score, key)
|
||||
|
||||
if score == optimal_score:
|
||||
break
|
||||
|
||||
return result
|
||||
|
||||
|
||||
def extract(
    query,
    choices,
    *,
    scorer=WRatio,
    processor=None,
    limit=5,
    score_cutoff=None,
    score_hint=None,
    scorer_kwargs=None,
):
    """
    Find the best matches in a list of choices, sorted by similarity.

    Choices sharing the same similarity keep their original order.

    Parameters
    ----------
    query : Sequence[Hashable]
        string we want to find
    choices : Collection[Sequence[Hashable]] | Mapping[Sequence[Hashable]]
        list of all strings the query should be compared with or dict with a mapping
        {<result>: <string to compare>}
    scorer : Callable, optional
        Optional callable used to calculate the matching score between the query
        and each choice. Any of the RapidFuzz scorers (edit distance or
        normalized edit distance) or a custom function returning a normalized
        edit distance can be used. fuzz.WRatio is used by default.
    processor : Callable, optional
        Optional callable used to preprocess the strings before comparing them.
        Default is None, which deactivates this behaviour.
    limit : int, optional
        maximum amount of results to return. None can be passed to disable this
        behavior. Default is 5.
    score_cutoff : Any, optional
        Optional argument for a score threshold. For edit distances matches with
        a `distance > score_cutoff` are ignored; for normalized edit distances
        matches with a `similarity < score_cutoff` are ignored. Default is None,
        which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    scorer_kwargs : dict[str, Any], optional
        any other named parameters are passed to the scorer. This can be used to
        pass e.g. weights to `Levenshtein.distance`

    Returns
    -------
    list[tuple[Sequence[Hashable], Any, Any]]
        A list of tuples with 3 elements:

        * the `choice` that was compared to the query,
        * the score computed by the scorer — either an edit distance
          (0 is a perfect match, only `distance <= score_cutoff` is kept) or a
          normalized edit distance (100 is a perfect match, only
          `similarity >= score_cutoff` is kept). Scorers not provided by
          RapidFuzz must return a normalized edit distance,
        * the `index of choice` for list-like choices, or the `key of choice`
          for mappings such as dicts or pandas Series.

        The list is sorted by similarity or distance depending on the scorer;
        the first element has the highest similarity / smallest distance.
    """
    scorer_kwargs = scorer_kwargs or {}
    worst_score, optimal_score = _get_scorer_flags_py(scorer, scorer_kwargs)
    # similarity scorers rank descending, distance scorers ascending
    descending = optimal_score > worst_score

    # a single result is delegated to extractOne, which can stop early
    # once a perfect match is found
    if limit == 1:
        best = extractOne(
            query,
            choices,
            processor=processor,
            scorer=scorer,
            score_cutoff=score_cutoff,
            score_hint=score_hint,
            scorer_kwargs=scorer_kwargs,
        )
        return [best] if best is not None else []

    matches = extract_iter(
        query,
        choices,
        processor=processor,
        scorer=scorer,
        score_cutoff=score_cutoff,
        score_hint=score_hint,
        scorer_kwargs=scorer_kwargs,
    )

    if limit is None:
        return sorted(matches, key=lambda item: item[1], reverse=descending)

    # heap selection avoids sorting every candidate when only `limit` are kept
    select = heapq.nlargest if descending else heapq.nsmallest
    return select(limit, matches, key=lambda item: item[1])
|
||||
|
||||
|
||||
def _dtype_to_type_num(
|
||||
dtype,
|
||||
scorer,
|
||||
scorer_kwargs,
|
||||
):
|
||||
import numpy as np
|
||||
|
||||
if dtype is not None:
|
||||
return np.dtype(dtype)
|
||||
|
||||
params = getattr(scorer, "_RF_ScorerPy", None)
|
||||
if params is not None:
|
||||
flags = params["get_scorer_flags"](**scorer_kwargs)
|
||||
if flags["flags"] & ScorerFlag.RESULT_I64:
|
||||
return np.int32
|
||||
if flags["flags"] & ScorerFlag.RESULT_SIZE_T:
|
||||
return np.uint32
|
||||
return np.float32
|
||||
|
||||
return np.float32
|
||||
|
||||
|
||||
def _is_symmetric(scorer, scorer_kwargs):
|
||||
params = getattr(scorer, "_RF_ScorerPy", None)
|
||||
if params is not None:
|
||||
flags = params["get_scorer_flags"](**scorer_kwargs)
|
||||
if flags["flags"] & ScorerFlag.SYMMETRIC:
|
||||
return True
|
||||
|
||||
return False
|
||||
|
||||
|
||||
def cdist(
    queries,
    choices,
    *,
    scorer=ratio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    score_multiplier=1,
    dtype=None,
    workers=1,
    scorer_kwargs=None,
):
    """
    Compute distance/similarity between each pair of the two collections of inputs.

    Parameters
    ----------
    queries : Collection[Sequence[Hashable]]
        list of all strings the queries
    choices : Collection[Sequence[Hashable]]
        list of all strings the query should be compared
    scorer : Callable, optional
        Optional callable that is used to calculate the matching score between
        the query and each choice. This can be any of the scorers included in RapidFuzz
        (both scorers that calculate the edit distance or the normalized edit distance), or
        a custom function, which returns a normalized edit distance.
        fuzz.ratio is used by default.
    processor : Callable, optional
        Optional callable that is used to preprocess the strings before
        comparing them. Default is None, which deactivates this behaviour.
    score_cutoff : Any, optional
        Optional argument for a score threshold to be passed to the scorer.
        Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    score_multiplier: Any, optional
        Optional argument to multiply the calculated score with. This is applied as the final step,
        so e.g. score_cutoff is applied on the unmodified score. This is mostly useful to map from
        a floating point range to an integer to reduce the memory usage. Default is 1,
        which deactivates this behaviour.
    dtype : data-type, optional
        The desired data-type for the result array. Depending on the scorer type the following
        dtypes are supported:

        - similarity:
          - np.float32, np.float64
          - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100
        - distance:
          - np.int8, np.int16, np.int32, np.int64

        If not given, then the type will be np.float32 for similarities and np.int32 for distances.
    workers : int, optional
        The calculation is subdivided into workers sections and evaluated in parallel.
        Supply -1 to use all available CPU cores.
        This argument is only available for scorers using the RapidFuzz C-API so far, since it
        releases the Python GIL.
    scorer_kwargs : dict[str, Any], optional
        any other named parameters are passed to the scorer. This can be used to pass
        e.g. weights to `Levenshtein.distance`

    Returns
    -------
    ndarray
        Returns a matrix of dtype with the distance/similarity between each pair
        of the two collections of inputs.
    """
    import numpy as np

    # workers and score_hint are accepted for API compatibility with the
    # C++ implementation; this pure-Python fallback ignores them.
    _ = workers, score_hint
    scorer_kwargs = scorer_kwargs or {}
    dtype = _dtype_to_type_num(dtype, scorer, scorer_kwargs)
    results = np.zeros((len(queries), len(choices)), dtype=dtype)

    setupPandas()

    # Preprocess every choice once up front so the O(n*m) inner loops only
    # have to preprocess the queries. None-like entries are kept as-is.
    if processor is None:
        proc_choices = list(choices)
    else:
        proc_choices = [x if is_none(x) else processor(x) for x in choices]

    if queries is choices and _is_symmetric(scorer, scorer_kwargs):
        # Fast path: for a symmetric scorer applied to the identical
        # collection only the diagonal and the upper triangle are computed;
        # the lower triangle is filled in by mirroring.
        for i, proc_query in enumerate(proc_choices):
            # diagonal entry: choice compared with itself
            score = scorer(proc_query, proc_query, score_cutoff=score_cutoff, **scorer_kwargs) * score_multiplier

            # integral result dtypes store a rounded value
            if np.issubdtype(dtype, np.integer):
                score = round(score)

            results[i, i] = score
            for j in range(i + 1, len(proc_choices)):
                score = (
                    scorer(
                        proc_query,
                        proc_choices[j],
                        score_cutoff=score_cutoff,
                        **scorer_kwargs,
                    )
                    * score_multiplier
                )

                if np.issubdtype(dtype, np.integer):
                    score = round(score)

                # mirror the score into both triangles of the matrix
                results[i, j] = results[j, i] = score
    else:
        # General path: compute every (query, choice) pair.
        for i, query in enumerate(queries):
            # queries are preprocessed lazily, once per outer iteration
            proc_query = processor(query) if (processor and not is_none(query)) else query
            for j, choice in enumerate(proc_choices):
                score = (
                    scorer(
                        proc_query,
                        choice,
                        score_cutoff=score_cutoff,
                        **scorer_kwargs,
                    )
                    * score_multiplier
                )

                if np.issubdtype(dtype, np.integer):
                    score = round(score)

                results[i, j] = score

    return results
|
||||
|
||||
|
||||
def cpdist(
    queries,
    choices,
    *,
    scorer=ratio,
    processor=None,
    score_cutoff=None,
    score_hint=None,
    score_multiplier=1,
    dtype=None,
    workers=1,
    scorer_kwargs=None,
):
    """
    Compute the pairwise distance/similarity between corresponding elements of the queries & choices.

    Parameters
    ----------
    queries : Collection[Sequence[Hashable]]
        list of strings used to compute the distance/similarity.
    choices : Collection[Sequence[Hashable]]
        list of strings the queries should be compared with. Must be the same
        length as the queries.
    scorer : Callable, optional
        Optional callable used to calculate the matching score between each
        query and the corresponding choice. Any of the RapidFuzz scorers (edit
        distance or normalized edit distance) or a custom function returning a
        normalized edit distance can be used. fuzz.ratio is used by default.
    processor : Callable, optional
        Optional callable used to preprocess the strings before comparing them.
        Default is None, which deactivates this behaviour.
    score_cutoff : Any, optional
        Optional argument for a score threshold to be passed to the scorer.
        Default is None, which deactivates this behaviour.
    score_hint : Any, optional
        Optional argument for an expected score to be passed to the scorer.
        This is used to select a faster implementation. Default is None,
        which deactivates this behaviour.
    score_multiplier: Any, optional
        Optional argument to multiply the calculated score with. This is applied
        as the final step, so e.g. score_cutoff is applied on the unmodified
        score. This is mostly useful to map from a floating point range to an
        integer to reduce the memory usage. Default is 1, which deactivates
        this behaviour.
    dtype : data-type, optional
        The desired data-type for the result array. Depending on the scorer type
        the following dtypes are supported:

        - similarity:
          - np.float32, np.float64
          - np.uint8 -> stores fixed point representation of the result scaled to a range 0-100
        - distance:
          - np.int8, np.int16, np.int32, np.int64

        If not given, then the type will be np.float32 for similarities and np.int32 for distances.
    workers : int, optional
        The calculation is subdivided into workers sections and evaluated in
        parallel. Supply -1 to use all available CPU cores. This argument is
        only available for scorers using the RapidFuzz C-API so far, since it
        releases the Python GIL.
    scorer_kwargs : dict[str, Any], optional
        any other named parameters are passed to the scorer. This can be used to
        pass e.g. weights to `Levenshtein.distance`

    Returns
    -------
    ndarray
        Returns a matrix of size (n x 1) of dtype with the distance/similarity
        between each pair of the two collections of inputs.

    Raises
    ------
    ValueError
        If queries and choices have different lengths.
    """
    import numpy as np

    n = len(queries)
    if n != len(choices):
        raise ValueError("Length of queries and choices must be the same!")

    # workers and score_hint are accepted for API compatibility with the
    # C++ implementation; this pure-Python fallback ignores them.
    _ = workers, score_hint
    scorer_kwargs = scorer_kwargs or {}
    dtype = _dtype_to_type_num(dtype, scorer, scorer_kwargs)
    results = np.zeros((n,), dtype=dtype)

    setupPandas()

    # integral result dtypes store rounded values; this property only
    # depends on dtype, so it is checked once before the loop
    integral = np.issubdtype(dtype, np.integer)

    for idx, (lhs, rhs) in enumerate(zip(queries, choices)):
        # None-like entries skip preprocessing and are passed through as-is
        if processor and not is_none(lhs):
            lhs = processor(lhs)
        if processor and not is_none(rhs):
            rhs = processor(rhs)

        value = scorer(
            lhs,
            rhs,
            score_cutoff=score_cutoff,
            **scorer_kwargs,
        ) * score_multiplier

        results[idx] = round(value) if integral else value

    return results
|
||||
65
.venv/lib/python3.11/site-packages/rapidfuzz/utils.py
Normal file
65
.venv/lib/python3.11/site-packages/rapidfuzz/utils.py
Normal file
@@ -0,0 +1,65 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2025 Max Bachmann
|
||||
# This file is generated by tools/generate_python.py
|
||||
from __future__ import annotations
|
||||
|
||||
import contextlib
|
||||
import os
|
||||
|
||||
from rapidfuzz._feature_detector import AVX2, SSE2, supports
|
||||
|
||||
__all__ = ["default_process"]
|
||||
|
||||
_impl = os.environ.get("RAPIDFUZZ_IMPLEMENTATION")
|
||||
if _impl == "cpp":
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.utils_cpp_avx2 import (
|
||||
default_process, # pyright: ignore[reportMissingImports]
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.utils_cpp_sse2 import (
|
||||
default_process, # pyright: ignore[reportMissingImports]
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.utils_cpp import (
|
||||
default_process, # pyright: ignore[reportMissingImports]
|
||||
)
|
||||
elif _impl == "python":
|
||||
from rapidfuzz.utils_py import default_process
|
||||
else:
|
||||
imported = False
|
||||
if supports(AVX2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.utils_cpp_avx2 import (
|
||||
default_process, # pyright: ignore[reportMissingImports]
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported and supports(SSE2):
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.utils_cpp_sse2 import (
|
||||
default_process, # pyright: ignore[reportMissingImports]
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
with contextlib.suppress(ImportError):
|
||||
from rapidfuzz.utils_cpp import (
|
||||
default_process, # pyright: ignore[reportMissingImports]
|
||||
)
|
||||
|
||||
imported = True
|
||||
|
||||
if not imported:
|
||||
from rapidfuzz.utils_py import default_process
|
||||
11
.venv/lib/python3.11/site-packages/rapidfuzz/utils.pyi
Normal file
11
.venv/lib/python3.11/site-packages/rapidfuzz/utils.pyi
Normal file
@@ -0,0 +1,11 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
from collections.abc import Hashable, Sequence
|
||||
from typing import TypeVar
|
||||
|
||||
_StringType = TypeVar("_StringType", bound=Sequence[Hashable])
|
||||
|
||||
def default_process(sentence: _StringType) -> _StringType: ...
|
||||
Binary file not shown.
32
.venv/lib/python3.11/site-packages/rapidfuzz/utils_py.py
Normal file
32
.venv/lib/python3.11/site-packages/rapidfuzz/utils_py.py
Normal file
@@ -0,0 +1,32 @@
|
||||
# SPDX-License-Identifier: MIT
|
||||
# Copyright (C) 2022 Max Bachmann
|
||||
|
||||
from __future__ import annotations
|
||||
|
||||
import re
|
||||
|
||||
_alnum_regex = re.compile(r"(?ui)\W")
|
||||
|
||||
|
||||
def default_process(sentence: str) -> str:
|
||||
"""
|
||||
This function preprocesses a string by:
|
||||
|
||||
* removing all non alphanumeric characters
|
||||
|
||||
* trimming whitespaces
|
||||
|
||||
* converting all characters to lower case
|
||||
|
||||
Parameters
|
||||
----------
|
||||
sentence : str
|
||||
String to preprocess
|
||||
|
||||
Returns
|
||||
-------
|
||||
processed_string : str
|
||||
processed string
|
||||
"""
|
||||
string_out = _alnum_regex.sub(" ", sentence)
|
||||
return string_out.strip().lower()
|
||||
Reference in New Issue
Block a user