跳到内容

在 Instructor 中使用类型

Instructor 支持各种类型的结构化输出,从简单的基本类型到复杂的嵌套结构。

简单类型

除了 pydantic.BaseModel (推荐的方法) 之外,Instructor 还支持以下类型:

  • 基本类型:str, int, float, bool
  • 集合类型:List, Dict
  • 类型组合:Union, Literal, Optional
  • 特殊输出:Iterable (可迭代), Partial (部分)

您可以在 response_model 参数中直接使用这些类型,而无需将它们包装在 Pydantic 模型中。

为了更好的文档和控制,请使用 typing.Annotated 为您的类型添加更多上下文。

幕后发生了什么?

实际上,我们将把响应模型包装成以下形式的 pydantic.BaseModel

from typing import Annotated
from pydantic import create_model, Field, BaseModel

# Attach a description to a plain ``bool`` through ``Annotated`` metadata.
typehint = Annotated[bool, Field(description="Sample Description")]

# Build a one-field model named "Response"; the ``...`` default marks the
# ``content`` field as required (see 'required' in the schema below).
fields = {"content": (typehint, ...)}
model = create_model("Response", __base__=BaseModel, **fields)

schema = model.model_json_schema()
print(schema)
"""
{
    'properties': {
        'content': {
            'description': 'Sample Description',
            'title': 'Content',
            'type': 'boolean',
        }
    },
    'required': ['content'],
    'title': 'Response',
    'type': 'object',
}
"""

基本类型 (str, int, float, bool)

import instructor
import openai

client = instructor.from_openai(openai.OpenAI())

# A bare ``bool`` is passed as the response model, so the call is expected
# to hand back a plain Python boolean rather than a model instance.
question = {
    "role": "user",
    "content": "Is it true that Paris is the capital of France?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=bool,
    messages=[question],
)
assert resp is True, "Paris is the capital of France"
print(resp)
#> True

Annotated

注解(Annotations)可用于添加有关类型的更多信息。这对于为类型添加描述以及更复杂的信息(如字段名等)非常有用。

import instructor
import openai
from typing import Annotated
from pydantic import Field

client = instructor.from_openai(openai.OpenAI())

# The upper-case requirement is carried only by the field description;
# no validator in this snippet enforces it.
UpperCaseStr = Annotated[str, Field(description="string must be upper case")]

# Use the annotated string type directly as the response model.
question = {
    "role": "user",
    "content": "What is the capital of france?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=UpperCaseStr,
    messages=[question],
)
assert resp == "PARIS", "Paris is the capital of France"
print(resp)
#> PARIS

Literal

在进行简单分类时,Literal 很好用,它们支持字符串、整数、布尔值的字面量。

import instructor
import openai
from typing import Literal

client = instructor.from_openai(openai.OpenAI())

# The two allowed labels are spelled out as a ``Literal`` and used directly
# as the response model for a simple classification request.
prompt = "Classify the following messages: 'I am having trouble with my billing'"
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Literal["BILLING", "SHIPPING"],
    messages=[{"role": "user", "content": prompt}],
)
assert resp == "BILLING"
print(resp)
#> BILLING

Enum

如果不提供额外的提示,模型较难准确地输出 Enum 值;但当这些值需要在整个应用程序中共享时,Enum 会非常有用。

import instructor
import openai
from enum import Enum


# Closed set of ticket categories; each member's value is its own name.
# (Documented with a comment rather than a docstring so the class itself
# is left exactly as it was.)
class Label(str, Enum):
    BILLING = "BILLING"
    SHIPPING = "SHIPPING"


client = instructor.from_openai(openai.OpenAI())

# Same classification request as a Literal would express, but the allowed
# values come from the shared ``Label`` enum.
prompt = "Classify the following messages: 'I am having trouble with my billing'"
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Label,
    messages=[{"role": "user", "content": prompt}],
)
assert resp == Label.BILLING
print(resp)
#> BILLING

List

import instructor
import openai
from typing import List

client = instructor.from_openai(openai.OpenAI())

# ``List[int]`` is used directly as the response model, so the result is
# compared against a plain list of integers.
question = {
    "role": "user",
    "content": "Give me the first 5 prime numbers",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=List[int],
    messages=[question],
)

assert resp == [2, 3, 5, 7, 11]
print(resp)
#> [2, 3, 5, 7, 11]

Union

Union 是处理多种类型响应的好方法,类似于多个函数调用,但并不局限于函数调用 API,例如在 JSON_SCHEMA 模式下同样可以使用。

import instructor
import openai
from pydantic import BaseModel
from typing import Union

client = instructor.from_openai(openai.OpenAI())


# One possible answer shape: two integer operands.
class Add(BaseModel):
    a: int
    b: int


# The other possible answer shape: a location name.
class Weather(BaseModel):
    location: str


# The response model is a Union of both shapes; for this arithmetic
# question the expected result is an ``Add`` instance.
question = {
    "role": "user",
    "content": "What is 5 + 5?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Union[Add, Weather],
    messages=[question],
)

assert resp == Add(a=5, b=5)
print(resp)
#> a=5 b=5

复杂类型

Pandas DataFrame

这是一个更复杂的示例,我们使用自定义类型将 Markdown 转换为 Pandas DataFrame。

from io import StringIO
from typing import Annotated, Any
from pydantic import BeforeValidator, PlainSerializer, InstanceOf, WithJsonSchema
import pandas as pd
import instructor
import openai


def md_to_df(data: Any) -> Any:
    """Convert a pipe-delimited Markdown table into a pandas DataFrame.

    Strings are parsed as a Markdown table; any other value is returned
    untouched so that later validation can decide what to do with it.

    Args:
        data: Raw value to convert. Only ``str`` input is parsed.

    Returns:
        For ``str`` input, a DataFrame indexed by the table's first column,
        with surrounding whitespace stripped from every string cell. Column
        and index labels are left exactly as parsed. Any other input is
        returned unchanged.
    """
    if not isinstance(data, str):
        return data

    frame = (
        pd.read_csv(
            StringIO(data),
            sep="|",
            # The leading "|" on every row produces an empty column 0, so
            # the table's first real column sits at position 1.
            index_col=1,
        )
        # Discard the all-empty columns created by leading/trailing "|".
        .dropna(axis=1, how="all")
        # Skip the "|---|---|" separator row that follows the header.
        .iloc[1:]
    )

    def strip_cell(value: Any) -> Any:
        # Cells missing from short rows are parsed as NaN (a float), which
        # has no ``strip``; pass non-strings through instead of raising.
        return value.strip() if isinstance(value, str) else value

    # ``DataFrame.applymap`` was deprecated in pandas 2.1 in favour of
    # ``DataFrame.map``; prefer the new name and fall back on older pandas.
    if hasattr(pd.DataFrame, "map"):
        return frame.map(strip_cell)
    return frame.applymap(strip_cell)


# JSON schema attached to the type: it is described as a string carrying
# the Markdown form of the table.
_markdown_table_schema = {
    "type": "string",
    "description": """
            The markdown representation of the table,
            each one should be tidy, do not try to join
            tables that should be seperate""",
}

MarkdownDataFrame = Annotated[
    # Final type check: the value must be a DataFrame
    InstanceOf[pd.DataFrame],
    # Markdown text is turned into a DataFrame by md_to_df
    BeforeValidator(md_to_df),
    # Serialization renders the DataFrame back to Markdown
    PlainSerializer(lambda df: df.to_markdown()),
    # Schema and description attached to the type
    WithJsonSchema(_markdown_table_schema),
]


client = instructor.from_openai(openai.OpenAI())

# Request the people/ages sentence as a table; the custom annotated type is
# the response model, so the result is checked to be a DataFrame.
prompt = "Jason is 20, Sarah is 30, and John is 40"
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=MarkdownDataFrame,
    messages=[{"role": "user", "content": prompt}],
)

assert isinstance(resp, pd.DataFrame)
print(resp)
"""
       Age
 Name
Jason    20
Sarah    30
John     40
"""

Union 列表

与 Union 类似,我们可以使用 Union 的列表 (List[Union[...]]) 来表示多个不同类型的响应。这类似于并行函数调用,但并不局限于函数调用 API,例如在 JSON_SCHEMA 模式下同样可以使用。

import instructor
import openai
from pydantic import BaseModel
from typing import Union, List

client = instructor.from_openai(openai.OpenAI())


# Answer shape for a weather lookup: just the location name.
class Weather(BaseModel, frozen=True):
    location: str


# Answer shape for an addition: two integer operands.
class Add(BaseModel, frozen=True):
    a: int
    b: int


# A list whose items may each be either shape lets a single request carry
# several differently-typed results.
question = {
    "role": "user",
    "content": "Add 5 and 5, and also whats the weather in Toronto?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=List[Union[Add, Weather]],
    messages=[question],
)

assert resp == [Add(a=5, b=5), Weather(location="Toronto")]
print(resp)
#> [Add(a=5, b=5), Weather(location='Toronto')]