Files
2026-06-06 05:21:10 +00:00

129 lines
4.4 KiB
Python
Executable File
Raw Permalink Blame History

This file contains ambiguous Unicode characters
This file contains Unicode characters that might be confused with other characters. If you think that this is intentional, you can safely ignore this warning. Use the Escape button to reveal them.
#!/usr/bin/env python3
"""
profile_match.py — 把用户技能列表 vs 关键词库做匹配,输出匹配度报告
用法:
    python profile_match.py --skills "SQL,Python,产品规划,A/B测试" \
        --library internet \
        [--out report.md]

library 取值:internet / tech / finance / general
"""
from __future__ import annotations
import argparse
import re
import sys
from pathlib import Path
# Maps each accepted --library choice to the markdown keyword file that
# backs it; every file follows the "keywords_<choice>.md" naming scheme.
LIB_MAP = {
    name: f"keywords_{name}.md"
    for name in ("internet", "tech", "finance", "general")
}
# A bullet line: "-" or "*", then horizontal whitespace, then the content.
# Newlines are excluded from the whitespace classes so that an empty bullet
# cannot swallow the following line as its content.
_BULLET_RE = re.compile(r"^[^\S\n]*[-*][^\S\n]+(.+)$", flags=re.MULTILINE)
# Parenthesised explanations, in ASCII or full-width brackets.
_PAREN_NOTE_RE = re.compile(r"[((][^))]*[))]")
# Runs of underscores: fill-in placeholders as well as "__bold__" markers.
_UNDERSCORE_RUN_RE = re.compile(r"_{2,}")
# Separators between keywords that share one bullet line.
_TOKEN_SPLIT_RE = re.compile(r"[、,,/\s]+")
# A usable token contains at least one CJK ideograph or ASCII letter.
_HAS_WORD_CHAR_RE = re.compile(r"[一-龥A-Za-z]")
# Punctuation and markdown control characters trimmed from token edges.
_EDGE_PUNCTUATION = "::。.\"'“”‘’`*"


def extract_keywords(md_path: Path) -> list[str]:
    """Collect the keywords listed in the bullet lines of a markdown file.

    Only lines that start with a "-" or "*" bullet are considered. For each
    bullet, parenthesised explanations (ASCII or full-width brackets),
    underscore placeholders and "**" markers are removed; the remainder is
    split on commas (ASCII or full-width), the ideographic comma, slashes
    and whitespace. Each token is trimmed of edge punctuation and lowercased.

    Args:
        md_path: Path of the UTF-8 encoded markdown file to parse.

    Returns:
        The unique keywords, sorted. Tokens without any CJK ideograph or
        ASCII letter are dropped, as are tokens whose length is not between
        2 and 29 characters inclusive.

    Raises:
        OSError: If the file cannot be read.
        UnicodeDecodeError: If the file is not valid UTF-8.
    """
    text = md_path.read_text(encoding="utf-8")
    keywords: set[str] = set()
    for line in _BULLET_RE.findall(text):
        clean = _PAREN_NOTE_RE.sub("", line)
        clean = _UNDERSCORE_RUN_RE.sub("", clean)
        clean = clean.replace("**", "")
        for raw in _TOKEN_SPLIT_RE.split(clean):
            token = raw.strip().strip(_EDGE_PUNCTUATION).lower()
            # Skip tokens made only of punctuation, dashes or digits.
            if not _HAS_WORD_CHAR_RE.search(token):
                continue
            if 1 < len(token) < 30:
                keywords.add(token)
    return sorted(keywords)
def match_score(user_skills: list[str], lib_keywords: list[str]) -> dict:
    """Compare the user's skills with a keyword library.

    Matching is fuzzy: a library keyword is a hit when it is a substring of
    any user skill, or any user skill is a substring of it. User skills are
    stripped and lowercased first (blank entries are discarded); library
    keywords are compared exactly as given.

    Args:
        user_skills: Raw skill strings supplied by the user.
        lib_keywords: Keywords of the chosen library.

    Returns:
        A dict with keys "hits" and "missing" (library keywords, in input
        order), "rate" (hits divided by the library size, or 0 for an empty
        library) and "user_skills" (the normalised user skills).
    """
    normalized = []
    for raw in user_skills:
        skill = raw.strip()
        if skill:
            normalized.append(skill.lower())

    def _is_hit(keyword: str) -> bool:
        for skill in normalized:
            if keyword in skill or skill in keyword:
                return True
        return False

    # Partition the library in a single pass, keyed by the match outcome.
    buckets: dict = {True: [], False: []}
    for keyword in lib_keywords:
        buckets[_is_hit(keyword)].append(keyword)
    matched = buckets[True]
    unmatched = buckets[False]

    if lib_keywords:
        ratio = len(matched) / len(lib_keywords)
    else:
        ratio = 0

    return {
        "hits": matched,
        "missing": unmatched,
        "rate": ratio,
        "user_skills": normalized,
    }
def render_report(result: dict, library: str, *, max_missing: int = 20) -> str:
    """Render a match result as a markdown report.

    Args:
        result: Dict with the keys "hits", "missing", "rate" and
            "user_skills", as returned by match_score.
        library: Name of the keyword library, shown in the report title.
        max_missing: Maximum number of missing keywords to list, so the
            report does not get too long. Negative values are treated as 0.

    Returns:
        The report as a single newline-joined markdown string.
    """
    hits = result["hits"]
    missing = result["missing"]
    total = len(hits) + len(missing)
    rate_pct = f"{result['rate'] * 100:.1f}%"
    limit = max(max_missing, 0)
    top_missing = missing[:limit]
    lines = [
        f"# 岗位画像匹配报告 — {library}",
        "",
        # Label, percentage and hit count are explicitly separated; without
        # the colon and brackets they run together (e.g. "33.3%1 / 3").
        f"- **命中率**:{rate_pct}({len(hits)} / {total})",
        f"- **用户提供技能**:{', '.join(result['user_skills'])}",
        "",
        "## ✅ 命中的关键词",
        "",
        ", ".join(hits) if hits else "(无)",
        "",
        f"## ⚠️ 缺口(前 {limit} 个,按字典序)",
        "",
        ", ".join(top_missing) if top_missing else "(无)",
        "",
        "## 解读",
        "",
        "- 命中率 < 20%:方向不匹配,建议重新评估目标岗",
        "- 命中率 20-50%:可投但需要补关键缺口",
        "- 命中率 > 50%:核心匹配,可以重点投",
        "",
        "注意:本工具是关键词级别的粗筛,不能替代真实 JD 对照(用 jd-resume-tailor 做精准对比)。",
    ]
    return "\n".join(lines)
def main() -> None:
    """Command-line entry point: match the given skills against a library.

    Parses the arguments, loads the keyword file selected by --library from
    the references directory, scores the skills against it and either
    prints the markdown report or writes it to the --out path.

    Exits with status 1 when the keyword library file cannot be found.
    """
    parser = argparse.ArgumentParser()
    parser.add_argument("--skills", required=True, help="用户技能列表,逗号分隔")
    parser.add_argument(
        "--library", choices=list(LIB_MAP), required=True, help="关键词库"
    )
    parser.add_argument("--out", help="输出 markdown 报告路径,缺省直接打印")
    parser.add_argument(
        "--references-dir",
        default=str(Path(__file__).resolve().parent.parent / "references"),
        help="references 目录路径",
    )
    args = parser.parse_args()

    lib_path = Path(args.references_dir) / LIB_MAP[args.library]
    # is_file() also rejects a directory that happens to carry the library
    # name, which would otherwise fail later inside read_text().
    if not lib_path.is_file():
        print(f"✗ 找不到关键词库:{lib_path}", file=sys.stderr)
        sys.exit(1)

    lib_keywords = extract_keywords(lib_path)
    # Accept the full-width comma and the ideographic comma as well, since
    # a Chinese input method produces them instead of the ASCII comma.
    user_skills = [s for s in re.split(r"[,,、]", args.skills) if s.strip()]
    result = match_score(user_skills, lib_keywords)
    report = render_report(result, args.library)

    if args.out:
        out_path = Path(args.out)
        # Create missing parent directories so a nested --out path works.
        out_path.parent.mkdir(parents=True, exist_ok=True)
        out_path.write_text(report, encoding="utf-8")
        print(f"✓ 报告已生成:{args.out}")
    else:
        print(report)


if __name__ == "__main__":
    main()