跳到内容

选择有效示例

我们可以通过选择那些与查询在语义上更接近的示例,使用 KNN 来选择有效的上下文示例。

在使用 instructor 的以下实现中,我们遵循以下步骤:

  1. 嵌入查询示例
  2. 嵌入我们要回答的查询
  3. 找到与查询最接近的 k 个查询示例
  4. 使用选定的示例及其答案作为 LLM 的上下文
import instructor
from pydantic import BaseModel
from openai import OpenAI
import math
from textwrap import dedent


# One in-context example pair. NOTE(review): pydantic class docstrings are
# emitted as the schema "description", so attributes are documented with
# plain comments to keep the schema (and thus the prompt) unchanged.
class Example(BaseModel):
    question: str  # example question text (used for embedding/selection)
    answer: str  # ground-truth answer paired with the question


# Structured output that instructor extracts from the chat completion.
class Response(BaseModel):
    answer: str  # the model's final answer to the user's query

# Raw OpenAI client (used directly for embeddings) and the instructor-wrapped
# client, which adds `response_model` support for structured chat outputs.
oai = OpenAI()
client = instructor.from_openai(oai)


def distance(a: list[float], b: list[float]):
    """Cosine distance between two equal-length vectors.

    Computed as 1 - cosine similarity: 0.0 for parallel vectors,
    1.0 for orthogonal ones. Raises ZeroDivisionError if either
    vector has zero magnitude.
    """
    dot = sum(x * y for x, y in zip(a, b))
    norm_a = math.sqrt(sum(x**2 for x in a))
    norm_b = math.sqrt(sum(y**2 for y in b))
    return 1 - dot / (norm_a * norm_b)


def embed_queries(queries: list[str]) -> list[tuple[list[float], str]]:
    """Embed every string in `queries` in a single API call.

    Returns one (embedding, text) tuple per input, in input order
    (the embeddings API preserves the order of its inputs).
    """
    response = oai.embeddings.create(input=queries, model="text-embedding-3-large")
    return [(item.embedding, text) for item, text in zip(response.data, queries)]


def knn(
    embedded_examples: list[tuple[list[float], str]],
    query_embedding: list[float],
    k: int,
):
    """Return the k examples closest to the query embedding.

    Each result is a (cosine_distance, example_text) tuple, ordered
    from nearest to farthest.
    """
    scored = [
        (distance(vector, query_embedding), text)
        for vector, text in embedded_examples
    ]
    # Stable sort on distance only, so ties keep their input order.
    return sorted(scored, key=lambda pair: pair[0])[:k]


def generate_response(examples: list[str], query: str):
    """Answer `query` with the LLM, using `examples` as in-context examples.

    Returns a `Response` parsed by instructor from the chat completion.
    """
    joined_examples = "\n".join(examples)
    # dedent strips the literal's common margin, so the prompt text is
    # identical regardless of this function's indentation level.
    prompt = dedent(
        f"""
        Respond to the following query with the most accurate
        and concise answer possible.
        <examples>
        {joined_examples}
        </examples>
        <query>
        {query}
        </query>
    """
    )
    return client.chat.completions.create(
        model="gpt-4o",
        response_model=Response,
        messages=[{"role": "user", "content": prompt}],
    )


def generate_question_and_answer_pair(
    questions: list[str], question_and_answers: list[dict[str, str]]
) -> list[str]:
    """Format each question with its known answer as an <example> block.

    Args:
        questions: Questions to format, in the desired output order.
        question_and_answers: Dicts with "question" and "answer" keys used
            to look up each question's answer.

    Returns:
        One formatted example string per entry in `questions`.

    Raises:
        KeyError: If a question has no entry in `question_and_answers`.
    """
    # Build the lookup once (dict comprehension instead of a manual loop)
    # rather than scanning the example list per question.
    question_to_answer = {
        pair["question"]: pair["answer"] for pair in question_and_answers
    }

    return [
        dedent(
            f"""
        <example>
        <question>{question}</question>
        <answer>{question_to_answer[question]}</answer>
        </example>
        """
        )
        for question in questions
    ]


if __name__ == "__main__":
    # Pool of in-context examples the model can draw on.
    examples = [
        {"question": "What is the capital of France?", "answer": "Paris"},
        {"question": "Who wrote Romeo and Juliet", "answer": "Shakespeare"},
        {"question": "What is the capital of Germany?", "answer": "Berlin"},
    ]

    query = "What is the capital of Italy?"

    # Steps 1-2: embed the example questions AND the query in one API call;
    # the query is appended last so it can be split off below.
    embeddings = embed_queries([example["question"] for example in examples] + [query])

    embedded_examples = embeddings[:-1]
    embedded_query = embeddings[-1]

    # Step 3: find the k closest examples to the query
    # (embedded_query is an (embedding, text) tuple, hence the [0]).
    k_closest_examples = knn(embedded_examples, embedded_query[0], 2)

    for example in k_closest_examples:
        print(example)
        #> (0.4013468481736857, 'What is the capital of France?')
        #> (0.4471368596136872, 'What is the capital of Germany?')

    # Step 4: use the selected examples (with their answers) as in-context examples
    formatted_examples = generate_question_and_answer_pair(
        [example[1] for example in k_closest_examples], examples
    )
    response = generate_response(formatted_examples, query)
    print(response.answer)
    #> Rome

参考文献

1: 什么构成 GPT-3 的优秀上下文示例?

*: 提示报告:提示技术系统调查