# mantle-ai-trader/skills/quiz-mastery/src/quiz_mastery/quiz_extractor.py

from __future__ import annotations

import json
from .models import Question


def build_extraction_prompt(content: str) -> dict:
    """Assemble the system/user prompt pair asking an LLM to parse questions.

    The question-file text is embedded between a fixed header and a fixed
    block of parsing rules and output-format instructions.

    Args:
        content: Raw text content containing questions.

    Returns:
        dict with 'system_prompt' and 'user_prompt' keys.
    """
    system_lines = (
        "你是一个专业的题目解析助手。从用户提供的题目文件中识别并解析所有题目。",
        "严格按照要求的 JSON 格式输出，不要输出任何其他内容。",
    )

    # Fixed text that precedes the embedded file content.
    header = "请从以下题目文件内容中解析出所有题目。\n\n## 题目文件内容\n\n"

    # Fixed text that follows the content. This is a plain (non-f) string, so
    # the braces of the JSON sample are written literally.
    footer = """

## 解析要求
1. 识别每道题的类型：
   - single_choice: 选择题（有 A/B/C/D 等选项）
   - true_false: 判断题（判断对错）
   - fill_blank: 填空题（有空格需要填写）
   - short_answer: 简答题（需要文字回答）
2. 提取题目的所有信息
3. 如果题目有答案和解析，也一并提取
4. 为每道题分配唯一 ID

## 输出格式
输出纯 JSON 数组，每个元素格式如下：
```json
[
  {
    "id": "q_001",
    "knowledge_point_ids": [],
    "level": 1,
    "type": "single_choice",
    "prompt": "题目内容",
    "options": ["A. 选项一", "B. 选项二", "C. 选项三", "D. 选项四"],
    "answer": "A",
    "explanation": "解析内容（如果有）"
  }
]
```

注意：
- type 必须是 single_choice, true_false, fill_blank, short_answer 之一
- 选择题的 answer 填选项字母（A/B/C/D）
- 判断题的 answer 填 "True" 或 "False"
- 填空题的 answer 填正确答案文本
- 简答题的 answer 填参考答案（如果有）
- 如果无法确定 knowledge_point_ids，留空数组
- level 默认为 1，如果能从题目难度判断则相应调整

请直接输出 JSON 数组，不要包含 markdown 代码块标记或其他文字。"""

    prompts = {}
    prompts["system_prompt"] = "\n".join(system_lines)
    # Interpolate via an f-string so content is formatted the same way as
    # before, whatever its type.
    prompts["user_prompt"] = f"{header}{content}{footer}"
    return prompts


def _strip_code_fence(text: str) -> str:
    """Return the payload of a markdown code fence in *text*, if present.

    The first line starting with ``` opens the fence and the last line that
    is exactly ``` closes it; anything outside (model chatter before or after
    the block) is discarded. Without an opening fence the stripped text is
    returned unchanged; without a closing fence everything after the opening
    line is kept.
    """
    stripped = text.strip()
    # Split on "\n" only: str.splitlines() would also break on characters
    # such as U+2028 that may legally appear inside JSON strings.
    lines = stripped.split("\n")
    opening = next(
        (i for i, line in enumerate(lines) if line.strip().startswith("```")),
        None,
    )
    if opening is None:
        return stripped

    # Drop the opening fence line itself (``` or ```json).
    body = lines[opening + 1:]
    for idx in range(len(body) - 1, -1, -1):
        if body[idx].strip() == "```":
            body = body[:idx]
            break
    return "\n".join(body)


def _value_or(item: dict, key: str, default):
    """Return item[key], or *default* when the key is missing or JSON null."""
    value = item.get(key)
    return default if value is None else value


def parse_questions_json(json_str: str) -> list[Question]:
    """Parse LLM-returned JSON string into a list of Question objects.

    A markdown code fence around the JSON is tolerated, including extra text
    before or after the fenced block. Fields that are missing or explicitly
    null fall back to their defaults; 'answer' is passed through as-is
    (None when absent).

    Args:
        json_str: JSON string containing a list of question dicts.

    Returns:
        List of Question objects, in the order they appear in the array.

    Raises:
        json.JSONDecodeError: If json_str is not valid JSON.
        ValueError: If the parsed data is not a list, or an element of the
            list is not a JSON object.
    """
    data = json.loads(_strip_code_fence(json_str))
    if not isinstance(data, list):
        raise ValueError(f"Expected a JSON array, got {type(data).__name__}")

    questions: list[Question] = []
    for index, item in enumerate(data):
        # Validate before touching .get so a malformed element surfaces as
        # the documented ValueError rather than an AttributeError.
        if not isinstance(item, dict):
            raise ValueError(
                f"Expected question #{index} to be a JSON object, "
                f"got {type(item).__name__}"
            )
        questions.append(
            Question(
                id=_value_or(item, "id", "q_unknown"),
                knowledge_point_ids=_value_or(item, "knowledge_point_ids", []),
                level=_value_or(item, "level", 1),
                type=_value_or(item, "type", "single_choice"),
                prompt=_value_or(item, "prompt", ""),
                options=_value_or(item, "options", []),
                answer=item.get("answer"),
                explanation=_value_or(item, "explanation", ""),
                source_refs=_value_or(item, "source_refs", []),
            )
        )

    return questions