将推理分解为多个步骤

累积推理¹旨在通过将推理过程分为三个独立的步骤来生成更好的输出。
提出 (Propose)：LLM 首先根据当前上下文提出潜在步骤，启动推理循环。
验证 (Verify)：然后我们评估提出者的建议是否准确，并将有效的步骤纳入当前上下文。
报告 (Report)：然后我们确定何时结束推理过程。
通过首先生成潜在步骤并将推理过程的每个部分分开，我们能够在逻辑推理任务和数学问题中获得显著改进。
我们可以使用 instructor 实现这一点，如下所示：
import instructor
from openai import AsyncOpenAI
from pydantic import BaseModel, Field
from textwrap import dedent
from typing import Literal
import asyncio

# Shared async client: an AsyncOpenAI client wrapped by instructor. Every
# chat-completion call in this file goes through it and passes
# `response_model=` to name the pydantic model the reply is returned as.
client = instructor.from_openai(AsyncOpenAI())


# One candidate deduction produced by the proposer step. It is the item type
# of ProposerOutput.valid_propositions; verify_propositions() reads premise1,
# premise2 and proposition back out to build the verification prompt.
# NOTE(review): documented with `#` comments on purpose. A class docstring
# would presumably end up in the schema shown to the model; confirm before
# converting these comments into a docstring.
class Proposition(BaseModel):
    premise1: str  # first premise the proposition is derived from
    premise2: str  # second premise the proposition is derived from
    reasoning: str  # presumably the model's justification; not read elsewhere in this file
    proposition: str  # the derived statement that gets verified


# Response model requested by generate_propositions(): the proposer's overall
# reasoning, the propositions it derived, and its own verdict on the hypothesis.
# NOTE(review): documented with `#` comments on purpose. A class docstring
# would presumably end up in the schema shown to the model; confirm before
# converting these comments into a docstring.
class ProposerOutput(BaseModel):
    reasoning: str
    # min_length=4 makes validation require at least four propositions.
    valid_propositions: list[Proposition] = Field(
        description="Concise list of Propositions that are derived from the premises that are relevant to the hypothesis. Note that each Proposition is derived from two given premises at most",
        min_length=4,
    )
    # The proposer's own prediction; it is not read anywhere else in this file.
    prediction: Literal["False", "True", "Unknown"]


# Response model requested for each verification call in verify_propositions().
# The script at the bottom of the file keeps only entries whose is_valid is
# true and forwards their `proposition` text to final_evaluation().
# NOTE(review): documented with `#` comments on purpose. A class docstring
# would presumably end up in the schema shown to the model; confirm before
# converting these comments into a docstring.
class VerifiedProposition(BaseModel):
    # Filled in by the model's reply, not copied from the proposer's object.
    proposition: str
    reasoning: str
    is_valid: bool  # whether the deduction was judged valid


# Response model requested by final_evaluation(): the reporter's reasoning and
# its verdict on the hypothesis.
# NOTE(review): the reporter's system prompt asks for True, False, or Unknown,
# but a bool cannot express "Unknown" - confirm this narrowing is intended.
# NOTE(review): documented with `#` comments on purpose. A class docstring
# would presumably end up in the schema shown to the model; confirm before
# converting these comments into a docstring.
class ReporterOutput(BaseModel):
    reasoning: str
    is_valid_hypothesis: bool


async def generate_propositions(premises: list[str], hypothesis: str) -> ProposerOutput:
    """Ask the model to derive new propositions from the premises ("Propose" step).

    Args:
        premises: Statements taken as given. They are rendered as a bulleted
            list, one premise per line, in the user message.
        hypothesis: The statement whose correctness is being investigated.

    Returns:
        The model's reply as a ``ProposerOutput`` (the ``response_model``
        passed to the client).
    """
    # Bullet every premise, including the first one ("\n- ".join(...) left the
    # first entry without a leading "- ").
    formatted_premises = "\n".join(f"- {premise}" for premise in premises)

    system_prompt = dedent(
        """
        Suppose you are one of the greatest AI
        scientists, logicians, and mathematicians.

        Let us think step by step. Please use
        First-Order Logic (FOL) to deduce a list
        of Propositions. Each Proposition is
        derived from two given Premises and
        should be logically correct. Most
        importantly, each Proposition should
        not duplicate the two premises that it
        is derived from. Please make sure your
        reasoning is directly deduced from the
        Premises and Propositions rather than
        introducing unsourced common knowledge
        and unsourced information by common
        sense reasoning.
        """
    )

    # Dedent the template *before* interpolating. The premises block spans
    # several lines that start at column 0; if it is substituted first,
    # dedent() finds no common leading whitespace and strips nothing.
    user_prompt = dedent(
        """
        Premises:
        {premises}

        We want to deduce more Propositions to
        determine the correctness of the following
        Hypothesis:
        Hypothesis: {hypothesis}
        """
    ).format(premises=formatted_premises, hypothesis=hypothesis)

    return await client.chat.completions.create(
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
        ],
        response_model=ProposerOutput,
        model="gpt-4o",
    )


async def verify_propositions(
    premise_evaluation: ProposerOutput,
) -> list[VerifiedProposition]:
    """Check each proposed deduction with its own model call ("Verify" step).

    Args:
        premise_evaluation: Proposer output whose ``valid_propositions`` are
            each sent for verification.

    Returns:
        One ``VerifiedProposition`` per input proposition, in input order.
    """

    async def check_deduction(candidate: Proposition) -> VerifiedProposition:
        """Ask the model whether ``candidate`` follows from its two premises."""
        verdict = await client.chat.completions.create(
            model="gpt-4o",
            response_model=VerifiedProposition,
            messages=[
                {
                    "role": "system",
                    "content": """
                    Suppose you are one of the greatest AI
                    scientists, logicians, and mathematicians.
                    Let us think step by step. Please use
                    First-Order Logic (FOL) to determine
                    whether the deduction of two given
                    Premises to a Proposition is valid or not,
                    and reply with True or False.
                    """,
                },
                {
                    "role": "user",
                    "content": f"""
                    Premises:
                    {candidate.premise1}
                    {candidate.premise2}

                    Proposition:
                    {candidate.proposition}
                    """,
                },
            ],
        )
        return verdict

    # Create one coroutine per proposition and await them together; gather()
    # returns the results in the same order as its arguments.
    pending = map(check_deduction, premise_evaluation.valid_propositions)
    return await asyncio.gather(*pending)


async def final_evaluation(
    verification_result: list[str], hypothesis: str, premises: list[str]
) -> ReporterOutput:
    """Ask the model for a final verdict on the hypothesis ("Report" step).

    Args:
        verification_result: Texts of the propositions to present as already
            deduced; rendered as a bulleted list in the assistant message.
        hypothesis: The statement to judge.
        premises: Statements taken as given; rendered as a bulleted list in
            the user message.

    Returns:
        The model's reply as a ``ReporterOutput`` (the ``response_model``
        passed to the client).
    """
    # Bullet every entry, including the first one ("\n- ".join(...) left the
    # first entry without a leading "- ").
    formatted_premises = "\n".join(f"- {premise}" for premise in premises)
    formatted_propositions = "\n".join(
        f"- {proposition}" for proposition in verification_result
    )

    # Templates are dedented *before* interpolation, as in
    # generate_propositions, so the multi-line lists line up with the
    # surrounding text instead of mixing column-0 lines with indented ones.
    system_prompt = dedent(
        """
        Suppose you are one of the greatest AI
        scientists, logicians, and mathematicians.
        Let us think step by step. Read and analyze
        the “Premises” first, then use First-Order
        Logic (FOL) to judge whether the “Hypothesis”
        is True, False, or Unknown. Please make sure
        your reasoning is directly deduced from the
        "Premises" and "Propositions" rather than
        introducing unsourced common knowledge and
        unsourced information by common sense
        reasoning.
        """
    )
    user_prompt = dedent(
        """
        Premises:
        {premises}

        Hypothesis: {hypothesis}
        """
    ).format(premises=formatted_premises, hypothesis=hypothesis)
    # Assistant turn that presents the verified propositions as reasoning
    # already carried out, then restates the hypothesis.
    assistant_prompt = dedent(
        """
        Let's think step by step. From the premises,
        we can deduce the following propositions:
        {propositions}

        Recall the Hypothesis: {hypothesis}
        """
    ).format(propositions=formatted_propositions, hypothesis=hypothesis)

    return await client.chat.completions.create(
        model="gpt-4o",
        messages=[
            {"role": "system", "content": system_prompt},
            {"role": "user", "content": user_prompt},
            {"role": "assistant", "content": assistant_prompt},
        ],
        response_model=ReporterOutput,
    )


async def main() -> None:
    """Run the propose -> verify -> report pipeline on the hyrax example.

    All three steps are awaited inside one coroutine so that the whole
    pipeline runs on a single event loop. Calling ``asyncio.run()`` once per
    step creates and closes a separate loop each time, while the module-level
    async client lives on across them.
    """
    hypothesis = "Hyraxes lay eggs"
    premises = [
        "The only types of mammals that lay eggs are platypuses and echidnas",
        "Platypuses are not hyrax",
        "Echidnas are not hyrax",
        "No mammals are invertebrates",
        "All animals are either vertebrates or invertebrates",
        "Mammals are animals",
        "Hyraxes are mammals",
        "Grebes lay eggs",
        "Grebes are not platypuses and also not echidnas",
    ]

    # Propose: derive candidate propositions from the premises.
    premise_evaluation = await generate_propositions(premises, hypothesis)

    # Verify: check each candidate, then keep only those judged valid.
    verification_result = await verify_propositions(premise_evaluation)
    filtered_propositions = [
        proposition.proposition
        for proposition in verification_result
        if proposition.is_valid
    ]

    # Report: judge the hypothesis using the premises and verified propositions.
    reporter_output = await final_evaluation(filtered_propositions, hypothesis, premises)
    print(reporter_output.model_dump_json(indent=2))
    """
    {
      "reasoning": "Based on the premises provided, the
      only mammals that lay eggs are platypuses and
      echidnas. Hyraxes are mammals but are explicitly
      stated as not being platypuses or echidnas. Hence,
      there is no basis in the premises to conclude that
      hyraxes lay eggs. \n\nTherefore, the hypothesis that
      hyraxes lay eggs is False.",
      "is_valid_hypothesis": false
    }
    """


if __name__ == "__main__":
    asyncio.run(main())
参考资料

¹: 使用大型语言模型的累积推理（Cumulative Reasoning with Large Language Models）