在 Instructor 中使用类型¶
Instructor 支持各种类型的结构化输出,从简单的基本类型到复杂的嵌套结构。
简单类型¶
除了 pydantic.BaseModel(推荐的方法)外,Instructor 还支持以下简单类型:
基本类型(str、int、float、bool)、Annotated、Literal、Enum、List 和 Union。
您可以在 response_model
参数中直接使用这些类型,而无需将它们包装在 Pydantic 模型中。
为了更好的文档和控制,请使用 typing.Annotated
为您的类型添加更多上下文。
幕后发生了什么?¶
实际上,我们会把响应模型包装成一个 pydantic.BaseModel,形式如下:
from typing import Annotated
from pydantic import create_model, Field, BaseModel
# Attach a description to a plain ``bool`` through ``Annotated`` + ``Field``.
typehint = Annotated[bool, Field(description="Sample Description")]

# Build a model named "Response" with a single required ``content`` field
# that carries the annotated type.
model = create_model("Response", content=(typehint, ...), __base__=BaseModel)

# Show the JSON schema generated for the wrapper model.
schema = model.model_json_schema()
print(schema)
# Illustrative output of the ``print`` call above:
"""
{
'properties': {
'content': {
'description': 'Sample Description',
'title': 'Content',
'type': 'boolean',
}
},
'required': ['content'],
'title': 'Response',
'type': 'object',
}
"""
基本类型 (str, int, float, bool)¶
import instructor
import openai
# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# Response model with simple types like str, int, float, bool
question = {
    "role": "user",
    "content": "Is it true that Paris is the capital of France?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=bool,
    messages=[question],
)

# The response is expected to be the ``True`` singleton itself.
assert resp is True, "Paris is the capital of France"
print(resp)
#> True
Annotated¶
注解(Annotations)可用于添加有关类型的更多信息。这对于为类型添加描述以及更复杂的信息(如字段名等)非常有用。
import instructor
import openai
from typing import Annotated
from pydantic import Field
# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# A ``str`` whose ``Field`` description states the upper-case requirement.
UpperCaseStr = Annotated[str, Field(description="string must be upper case")]

# Response model with simple types like str, int, float, bool
question = {
    "role": "user",
    "content": "What is the capital of france?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=UpperCaseStr,
    messages=[question],
)

assert resp == "PARIS", "Paris is the capital of France"
print(resp)
#> PARIS
Literal¶
在进行简单分类时,Literal 很好用,它们支持字符串、整数、布尔值的字面量。
import instructor
import openai
from typing import Literal
# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# The ``Literal`` lists the only two labels allowed as an answer.
ticket = {
    "role": "user",
    "content": "Classify the following messages: 'I am having trouble with my billing'",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Literal["BILLING", "SHIPPING"],
    messages=[ticket],
)

assert resp == "BILLING"
print(resp)
#> BILLING
Enum¶
在没有额外提示的情况下,模型较难准确地输出 Enum 值;但如果这些值需要在整个应用程序中共享,Enum 会非常有用。
import instructor
import openai
from enum import Enum
class Label(str, Enum):
    """Categories a message can be classified into.

    Mixing in ``str`` makes every member compare equal to its string value.
    """

    BILLING = "BILLING"
    SHIPPING = "SHIPPING"
# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# Passing the enum class as ``response_model`` yields one of its members.
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Label,
    messages=[
        {
            "role": "user",
            "content": "Classify the following messages: 'I am having trouble with my billing'",
        },
    ],
)

assert resp == Label.BILLING
# ``print`` goes through ``Enum.__str__``, which renders a ``(str, Enum)``
# member as ``ClassName.MEMBER`` rather than as its raw string value.
print(resp)
#> Label.BILLING
List¶
import instructor
import openai
from typing import List
# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# ``List[int]`` asks for a list of integers as the response.
request = {
    "role": "user",
    "content": "Give me the first 5 prime numbers",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=List[int],
    messages=[request],
)

assert resp == [2, 3, 5, 7, 11]
print(resp)
#> [2, 3, 5, 7, 11]
Union¶
Union 是处理多种类型响应的好方法,类似于多个函数调用,但不受函数调用 API 的限制,例如在 JSON_SCHEMA 模式中。
import instructor
import openai
from pydantic import BaseModel
from typing import Union
class Add(BaseModel):
    """Request to add two integers."""

    a: int
    b: int


class Weather(BaseModel):
    """Request for the weather at a location."""

    location: str


# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# The ``Union`` allows the response to be either of the two models.
question = {
    "role": "user",
    "content": "What is 5 + 5?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=Union[Add, Weather],
    messages=[question],
)

assert resp == Add(a=5, b=5)
print(resp)
#> a=5 b=5
复杂类型¶
Pandas DataFrame¶
这是一个更复杂的示例,我们使用自定义类型将 Markdown 转换为 Pandas DataFrame。
from io import StringIO
from typing import Annotated, Any
from pydantic import BeforeValidator, PlainSerializer, InstanceOf, WithJsonSchema
import pandas as pd
import instructor
import openai
def md_to_df(data: Any) -> Any:
    """Convert a Markdown table string into a ``pandas.DataFrame``.

    The text is parsed as ``|``-separated values. The second parsed column
    (the first real table column, because each row starts with ``|``) becomes
    the index, columns that are entirely empty are dropped, and the first
    data row (the ``|---|---|`` separator line) is discarded. Surrounding
    whitespace is stripped from every string cell; column labels and index
    values are left as parsed.

    Args:
        data: A Markdown table as a string, or any other value.

    Returns:
        A DataFrame built from the table when ``data`` is a string;
        otherwise ``data`` itself, unchanged.
    """
    if not isinstance(data, str):
        return data

    frame = (
        pd.read_csv(
            StringIO(data),  # Process data
            sep="|",
            index_col=1,
        )
        .dropna(axis=1, how="all")
        .iloc[1:]
    )

    def _strip(cell: Any) -> Any:
        # Empty Markdown cells are parsed as NaN (a float); only text cells
        # have whitespace to trim.
        return cell.strip() if isinstance(cell, str) else cell

    # ``DataFrame.applymap`` is deprecated since pandas 2.1 in favour of
    # ``DataFrame.map``. Check the class, not the instance, so a column that
    # happens to be named "map" cannot shadow the lookup on older pandas.
    if hasattr(pd.DataFrame, "map"):
        return frame.map(_strip)
    return frame.applymap(_strip)
# JSON schema attached to the type: the table is described as a Markdown
# string.
_markdown_table_schema = {
    "type": "string",
    "description": """
The markdown representation of the table,
each one should be tidy, do not try to join
tables that should be seperate""",
}

MarkdownDataFrame = Annotated[
    # Validates final type
    InstanceOf[pd.DataFrame],
    # Converts markdown to DataFrame
    BeforeValidator(md_to_df),
    # Converts DataFrame to markdown on model_dump_json
    PlainSerializer(lambda df: df.to_markdown()),
    # Adds a description to the type
    WithJsonSchema(_markdown_table_schema),
]
# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# Use the custom annotated type as the response model.
facts = {
    "role": "user",
    "content": "Jason is 20, Sarah is 30, and John is 40",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=MarkdownDataFrame,
    messages=[facts],
)

assert isinstance(resp, pd.DataFrame)
print(resp)
# Illustrative output of the ``print`` call above:
"""
Age
Name
Jason 20
Sarah 30
John 40
"""
Union 列表¶
与 Union 类似,我们可以使用 Union 列表(List[Union[...]])来表示包含多种类型的响应。这类似于并行函数调用,但不受函数调用 API 的限制,例如可以在 JSON_SCHEMA 模式中使用。
import instructor
import openai
from pydantic import BaseModel
from typing import Union, List
class Weather(BaseModel, frozen=True):
    """Request for the weather at a location."""

    location: str


class Add(BaseModel, frozen=True):
    """Request to add two integers."""

    a: int
    b: int


# Wrap the OpenAI client with Instructor; the wrapped client is the one that
# receives ``response_model`` below.
client = instructor.from_openai(openai.OpenAI())

# A list whose items may each be either an ``Add`` or a ``Weather``.
request = {
    "role": "user",
    "content": "Add 5 and 5, and also whats the weather in Toronto?",
}
resp = client.chat.completions.create(
    model="gpt-3.5-turbo",
    response_model=List[Union[Add, Weather]],
    messages=[request],
)

assert resp == [Add(a=5, b=5), Weather(location="Toronto")]
print(resp)
#> [Add(a=5, b=5), Weather(location='Toronto')]