# 列表提取
本指南解释了如何使用 Instructor 提取结构化数据列表（数组）。列表是从文本中提取多个相似项时最有用的模式之一。
## 基本列表提取
要提取项目列表，你需要先为单个项目定义一个模型，然后使用 Python 的类型系统指定你想要该类型的列表：
from typing import List

from pydantic import BaseModel, Field
import instructor
from openai import OpenAI

# Wrap the OpenAI client with Instructor; the wrapped client's `create`
# call below is given a `response_model`.
client = instructor.from_openai(OpenAI())


# Schema for one extracted person. The Field descriptions are part of the
# schema, so they are kept exactly as written.
class Person(BaseModel):
    name: str = Field(..., description="The person's full name")
    age: int = Field(..., description="The person's age in years")


# Wrapper schema: a single object whose `people` field holds every
# Person found in the text.
class PeopleList(BaseModel):
    people: List[Person] = Field(..., description="List of people mentioned in the text")


# Input text, kept in its own variable so the request stays compact.
people_text = """
Here's information about some people:
- John Smith is 35 years old
- Mary Johnson is 28 years old
- Robert Davis is 42 years old
"""

# Request the wrapper model; the result is read through `.people` below.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": people_text}],
    response_model=PeopleList,
)

# Print every extracted person, numbered from 1.
for i, person in enumerate(response.people):
    print(f"Person {i+1}: {person.name}, {person.age} years old")
此示例展示了如何：

1. 定义单个项目的模型（`Person`）
2. 创建一个包含项目列表的包装模型（`PeopleList`）
3. 通过响应访问列表中的每个项目
## 直接列表提取
你也可以不使用包装模型，直接提取列表：
from typing import List

from pydantic import BaseModel, Field
import instructor
from openai import OpenAI

client = instructor.from_openai(OpenAI())


# Schema for a single book entry.
class Book(BaseModel):
    title: str
    author: str
    publication_year: int


# Input text listing the books to extract.
books_text = """
Classic novels:
1. To Kill a Mockingbird by Harper Lee (1960)
2. 1984 by George Orwell (1949)
3. The Great Gatsby by F. Scott Fitzgerald (1925)
"""

# No wrapper model here: the list type itself is passed as the response
# model, and the result is iterated directly.
books = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": books_text}],
    response_model=List[Book],
)

# Print one line per extracted book.
for book in books:
    print(f"{book.title} by {book.author} ({book.publication_year})")
## 嵌套列表
你可以通过组合列表类型来提取嵌套列表：
from typing import List

from pydantic import BaseModel, Field
import instructor
from openai import OpenAI

client = instructor.from_openai(OpenAI())


# Schema for one author of a book.
class Author(BaseModel):
    name: str
    nationality: str


# Schema for a book; `authors` is itself a list, giving a list nested
# inside each item of the outer list.
class Book(BaseModel):
    title: str
    authors: List[Author]
    publication_year: int


# Input text: two books, each with two authors.
books_text = """
Book 1: "Good Omens" (1990)
Authors: Terry Pratchett (British), Neil Gaiman (British)
Book 2: "The Talisman" (1984)
Authors: Stephen King (American), Peter Straub (American)
"""

# Extract the outer list of books; each book carries its own author list.
books = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[{"role": "user", "content": books_text}],
    response_model=List[Book],
)

# Flatten each book's authors into a comma-separated string for display.
for book in books:
    author_names = ", ".join(author.name for author in book.authors)
    print(f"{book.title} ({book.publication_year}) by {author_names}")
## 将流式处理与列表结合使用
你可以使用 Instructor 的流式处理功能，在列表项生成的同时逐个接收提取结果：
from typing import Iterable, List

import instructor
from openai import OpenAI
from pydantic import BaseModel, Field

client = instructor.from_openai(OpenAI())


# Schema for one generated task; all three fields are free-form strings.
class Task(BaseModel):
    description: str
    priority: str
    deadline: str


# Streaming a list of items uses `Iterable[Task]` as the response model
# together with `stream=True`; that is the form Instructor documents for
# receiving items one at a time. `List[Task]` is the non-streaming form
# used by the other examples on this page.
task_stream = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "Generate a list of 5 sample tasks for a project manager"}
    ],
    response_model=Iterable[Task],
    stream=True,
)

# Handle each task as it is yielded by the stream.
for task in task_stream:
    print(f"Received task: {task.description} (Priority: {task.priority}, Deadline: {task.deadline})")
有关流式处理的更多信息,请参阅流式处理基础和流式处理列表指南。
## 列表验证
你可以同时为单个项目和整个列表添加验证：
from typing import List

from pydantic import BaseModel, Field, field_validator, model_validator
import instructor
from openai import OpenAI

client = instructor.from_openai(OpenAI())


# Schema for one product, with item-level validation on the price.
class Product(BaseModel):
    name: str
    price: float

    @field_validator('price')
    @classmethod
    def validate_price(cls, v: float) -> float:
        """Reject zero or negative prices; return the value unchanged otherwise.

        Raises:
            ValueError: if ``v`` is less than or equal to zero.
        """
        if v <= 0:
            raise ValueError("Price must be greater than zero")
        return v


# Wrapper schema with list-level validation.
class ProductList(BaseModel):
    # `min_length` is the Pydantic v2 name for the list-size constraint;
    # `min_items` is the deprecated v1 spelling. This snippet already uses
    # the v2 validator API, so the v2 name is used here as well.
    products: List[Product] = Field(..., min_length=1)

    @model_validator(mode='after')
    def validate_unique_names(self) -> "ProductList":
        """Ensure no two products share a name.

        Runs after field validation (``mode='after'``), so ``self.products``
        already holds validated ``Product`` instances.

        Raises:
            ValueError: if any product name appears more than once.
        """
        names = [p.name for p in self.products]
        # A set drops duplicates, so a size mismatch means a repeated name.
        if len(names) != len(set(names)):
            raise ValueError("All product names must be unique")
        return self


# Extract the list; the result must satisfy both the per-item and the
# whole-list validators defined above.
response = client.chat.completions.create(
    model="gpt-3.5-turbo",
    messages=[
        {"role": "user", "content": "List of products: Headphones ($50), Speakers ($80), Earbuds ($30)"}
    ],
    response_model=ProductList,
)
## 列表约束
你可以使用 Pydantic 的 `Field` 为列表添加约束：
from typing import List

from pydantic import BaseModel, Field
import instructor
from openai import OpenAI

client = instructor.from_openai(OpenAI())


# Schema for one ingredient; `amount` is a free-form string.
class Ingredient(BaseModel):
    name: str
    amount: str


# Recipe schema with size constraints on its list fields.
# `min_length` / `max_length` are the Pydantic v2 names for list-size
# constraints; `min_items` / `max_items` are the deprecated v1 spellings.
class Recipe(BaseModel):
    title: str
    ingredients: List[Ingredient] = Field(
        ...,
        min_length=2,  # Minimum 2 ingredients
        max_length=10,  # Maximum 10 ingredients
        description="List of ingredients needed for the recipe",
    )
    steps: List[str] = Field(
        ...,
        min_length=1,  # At least one step
        description="Step-by-step instructions to prepare the recipe",
    )
## 实际示例：任务提取
下面是一个更完整的示例，演示如何从会议记录中提取任务列表：
from datetime import date
from typing import List, Optional

from pydantic import BaseModel, Field
import instructor
from openai import OpenAI

client = instructor.from_openai(OpenAI())


# Person responsible for a task; the email is optional.
class Assignee(BaseModel):
    name: str
    email: Optional[str] = None


# One action item taken from the notes. The Field descriptions are part
# of the schema, so they are kept exactly as written.
class ActionItem(BaseModel):
    description: str = Field(..., description="The task that needs to be completed")
    assignee: Assignee = Field(..., description="The person responsible for the task")
    due_date: Optional[date] = Field(None, description="The deadline for the task")
    priority: str = Field(..., description="Priority level: Low, Medium, or High")


# Raw meeting notes to extract from.
meeting_notes = """
Meeting Notes - Project Kickoff
Date: 2023-05-15
Attendees: John (john@example.com), Sarah (sarah@example.com), Mike
Discussion points:
1. John will prepare the project timeline by next Friday. This is high priority.
2. Sarah needs to contact the client for requirements clarification by Wednesday. Medium priority.
3. Mike is responsible for setting up the development environment. Due by tomorrow, high priority.
"""

# Extract the action items as a list of ActionItem objects.
action_items = client.chat.completions.create(
    model="gpt-4",
    messages=[{"role": "user", "content": meeting_notes}],
    response_model=List[ActionItem],
)

# Print a short report for each item.
for item in action_items:
    # `due_date` is optional, so fall back to a placeholder when it is unset.
    if item.due_date:
        due_str = item.due_date.isoformat()
    else:
        due_str = "Not specified"
    print(f"Task: {item.description}")
    print(f"Assignee: {item.assignee.name} ({item.assignee.email or 'No email'})")
    print(f"Due: {due_str}, Priority: {item.priority}")
    print("---")
有关更详细的示例,请参阅行动项提取示例。