mirror of
https://github.com/blakeblackshear/frigate.git
synced 2026-06-02 07:00:32 +00:00
Implement semantic query for chat (#23206)
This commit is contained in:
parent
c6eadfebb8
commit
b712e1fbd9
@ -68,62 +68,123 @@ class VLMMonitorRequest(BaseModel):
|
||||
zones: List[str] = []
|
||||
|
||||
|
||||
def get_tool_definitions() -> List[Dict[str, Any]]:
|
||||
def get_tool_definitions(
|
||||
semantic_search_enabled: bool = False,
|
||||
) -> List[Dict[str, Any]]:
|
||||
"""
|
||||
Get OpenAI-compatible tool definitions for Frigate.
|
||||
|
||||
Returns a list of tool definitions that can be used with OpenAI-compatible
|
||||
function calling APIs.
|
||||
function calling APIs. When semantic search is enabled, the search_objects
|
||||
tool exposes an additional `semantic_query` parameter for descriptive
|
||||
queries (e.g. "person riding a lawn mower") and find_similar_objects is
|
||||
included.
|
||||
"""
|
||||
search_objects_properties: Dict[str, Any] = {
|
||||
"camera": {
|
||||
"type": "string",
|
||||
"description": "Camera name to filter by (optional).",
|
||||
},
|
||||
"label": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Generic object class to filter by — one of the tracked detector "
|
||||
"labels such as 'person', 'package', 'car', 'dog', 'bird'. Use "
|
||||
"this for broad queries like 'show me all cars today'. Combine "
|
||||
"with semantic_query when the user also describes appearance or "
|
||||
"behavior (e.g. label='person', semantic_query='riding a lawn "
|
||||
"mower')."
|
||||
),
|
||||
},
|
||||
"sub_label": {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Filter by a DISCRETE NAMED entity recognized in the detection. "
|
||||
"Use this for: a known person's name ('John'), a delivery "
|
||||
"company ('Amazon', 'UPS'), a recognized animal species or "
|
||||
"breed ('blue jay', 'cardinal', 'golden retriever'), or a "
|
||||
"license plate string. When filtering by a specific name, set "
|
||||
"only sub_label and leave label unset. Do NOT use sub_label "
|
||||
"for descriptions of appearance, clothing, or actions — those "
|
||||
"belong in semantic_query."
|
||||
),
|
||||
},
|
||||
"after": {
|
||||
"type": "string",
|
||||
"description": "Start time in ISO 8601 format (e.g., '2024-01-01T00:00:00Z').",
|
||||
},
|
||||
"before": {
|
||||
"type": "string",
|
||||
"description": "End time in ISO 8601 format (e.g., '2024-01-01T23:59:59Z').",
|
||||
},
|
||||
"zones": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "List of zone names to filter by.",
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of objects to return (default: 25).",
|
||||
"default": 25,
|
||||
},
|
||||
}
|
||||
|
||||
if semantic_search_enabled:
|
||||
search_objects_properties["semantic_query"] = {
|
||||
"type": "string",
|
||||
"description": (
|
||||
"Optional natural-language description of a PHYSICAL "
|
||||
"CHARACTERISTIC, APPEARANCE, or ACTIVITY the user mentioned, "
|
||||
"used to semantically narrow results. Only set this when the "
|
||||
"user describes something beyond what label and sub_label can "
|
||||
"express on their own.\n"
|
||||
"USE for descriptive phrases like: 'riding a lawn mower', "
|
||||
"'wearing a red jacket', 'carrying a package', 'walking a "
|
||||
"dog', 'on a bicycle', 'holding an umbrella'.\n"
|
||||
"DO NOT USE for:\n"
|
||||
"- specific named people, pets, or delivery companies → use sub_label\n"
|
||||
"- animal species or breed names like 'blue jay', 'cardinal', "
|
||||
"'golden retriever' → use sub_label\n"
|
||||
"- license plate strings → use sub_label\n"
|
||||
"- generic object queries like 'all cars today' or 'every "
|
||||
"person' → use label alone with no semantic_query\n"
|
||||
"When set, combine with label/time/camera/zone filters as "
|
||||
"usual (e.g. label='person', semantic_query='riding a lawn "
|
||||
"mower', after='2024-05-01T00:00:00Z')."
|
||||
),
|
||||
}
|
||||
|
||||
search_objects_description = (
|
||||
"Search the historical record of detected objects in Frigate. "
|
||||
"Use this ONLY for questions about the PAST — e.g. 'did anyone come by today?', "
|
||||
"'when was the last car?', 'show me detections from yesterday'. "
|
||||
"Do NOT use this for monitoring or alerting requests about future events — "
|
||||
"use start_camera_watch instead for those. "
|
||||
"An 'object' in Frigate represents a tracked detection (e.g., a person, package, car).\n\n"
|
||||
"Choose filters based on what the user is asking for:\n"
|
||||
"- Generic class query ('show me all cars today'): set `label` only.\n"
|
||||
"- Specific NAMED entity (known person, delivery company, animal "
|
||||
"species/breed like 'blue jay' or 'golden retriever', license "
|
||||
"plate): set `sub_label` only and leave `label` unset.\n"
|
||||
)
|
||||
if semantic_search_enabled:
|
||||
search_objects_description += (
|
||||
"- Physical CHARACTERISTIC, APPEARANCE, or ACTIVITY that is not a "
|
||||
"discrete name ('person riding a lawn mower', 'someone in a red "
|
||||
"jacket', 'person carrying a package'): set `semantic_query` with "
|
||||
"the descriptive phrase, optionally alongside `label` for the "
|
||||
"object class. Do NOT put descriptive phrases in sub_label."
|
||||
)
|
||||
|
||||
return [
|
||||
{
|
||||
"type": "function",
|
||||
"function": {
|
||||
"name": "search_objects",
|
||||
"description": (
|
||||
"Search the historical record of detected objects in Frigate. "
|
||||
"Use this ONLY for questions about the PAST — e.g. 'did anyone come by today?', "
|
||||
"'when was the last car?', 'show me detections from yesterday'. "
|
||||
"Do NOT use this for monitoring or alerting requests about future events — "
|
||||
"use start_camera_watch instead for those. "
|
||||
"An 'object' in Frigate represents a tracked detection (e.g., a person, package, car). "
|
||||
"When the user asks about a specific name (person, delivery company, animal, etc.), "
|
||||
"filter by sub_label only and do not set label."
|
||||
),
|
||||
"description": search_objects_description,
|
||||
"parameters": {
|
||||
"type": "object",
|
||||
"properties": {
|
||||
"camera": {
|
||||
"type": "string",
|
||||
"description": "Camera name to filter by (optional).",
|
||||
},
|
||||
"label": {
|
||||
"type": "string",
|
||||
"description": "Object label to filter by (e.g., 'person', 'package', 'car').",
|
||||
},
|
||||
"sub_label": {
|
||||
"type": "string",
|
||||
"description": "Name of a person, delivery company, animal, etc. When filtering by a specific name, use only sub_label; do not set label.",
|
||||
},
|
||||
"after": {
|
||||
"type": "string",
|
||||
"description": "Start time in ISO 8601 format (e.g., '2024-01-01T00:00:00Z').",
|
||||
},
|
||||
"before": {
|
||||
"type": "string",
|
||||
"description": "End time in ISO 8601 format (e.g., '2024-01-01T23:59:59Z').",
|
||||
},
|
||||
"zones": {
|
||||
"type": "array",
|
||||
"items": {"type": "string"},
|
||||
"description": "List of zone names to filter by.",
|
||||
},
|
||||
"limit": {
|
||||
"type": "integer",
|
||||
"description": "Maximum number of objects to return (default: 25).",
|
||||
"default": 25,
|
||||
},
|
||||
},
|
||||
"properties": search_objects_properties,
|
||||
},
|
||||
"required": [],
|
||||
},
|
||||
@ -397,9 +458,12 @@ def get_tool_definitions() -> List[Dict[str, Any]]:
|
||||
summary="Get available tools",
|
||||
description="Returns OpenAI-compatible tool definitions for function calling.",
|
||||
)
|
||||
def get_tools() -> JSONResponse:
|
||||
def get_tools(request: Request) -> JSONResponse:
|
||||
"""Get list of available tools for LLM function calling."""
|
||||
tools = get_tool_definitions()
|
||||
semantic_search_enabled = bool(
|
||||
getattr(request.app.frigate_config.semantic_search, "enabled", False)
|
||||
)
|
||||
tools = get_tool_definitions(semantic_search_enabled=semantic_search_enabled)
|
||||
return JSONResponse(content={"tools": tools})
|
||||
|
||||
|
||||
@ -432,16 +496,29 @@ def _resolve_zones(
|
||||
|
||||
|
||||
async def _execute_search_objects(
|
||||
request: Request,
|
||||
arguments: Dict[str, Any],
|
||||
allowed_cameras: List[str],
|
||||
config: FrigateConfig,
|
||||
) -> JSONResponse:
|
||||
"""
|
||||
Execute the search_objects tool.
|
||||
|
||||
This searches for detected objects (events) in Frigate using the same
|
||||
logic as the events API endpoint.
|
||||
Routes to the semantic path when the LLM supplied a `semantic_query`
|
||||
and semantic search is enabled; otherwise delegates to the standard
|
||||
events API logic.
|
||||
"""
|
||||
config = request.app.frigate_config
|
||||
semantic_query = arguments.get("semantic_query")
|
||||
if isinstance(semantic_query, str):
|
||||
semantic_query = semantic_query.strip() or None
|
||||
else:
|
||||
semantic_query = None
|
||||
|
||||
if semantic_query and getattr(config.semantic_search, "enabled", False):
|
||||
return await _execute_search_objects_semantic(
|
||||
request, arguments, allowed_cameras, semantic_query
|
||||
)
|
||||
|
||||
# Parse after/before as server local time; convert to Unix timestamp
|
||||
after = arguments.get("after")
|
||||
before = arguments.get("before")
|
||||
@ -508,6 +585,119 @@ async def _execute_search_objects(
|
||||
)
|
||||
|
||||
|
||||
async def _execute_search_objects_semantic(
|
||||
request: Request,
|
||||
arguments: Dict[str, Any],
|
||||
allowed_cameras: List[str],
|
||||
semantic_query: str,
|
||||
) -> JSONResponse:
|
||||
"""Search objects via fused thumbnail + description embeddings.
|
||||
|
||||
Runs both visual and description vec searches against `semantic_query`,
|
||||
intersects the candidates with the structured filters (camera, label,
|
||||
sub_label, zones, time window) the LLM supplied, and ranks the survivors
|
||||
by fused similarity. Mirrors the candidate-then-filter pattern used by
|
||||
find_similar_objects since sqlite-vec's IN filter is unreliable.
|
||||
"""
|
||||
from peewee import fn
|
||||
|
||||
config = request.app.frigate_config
|
||||
context = request.app.embeddings
|
||||
if context is None:
|
||||
logger.warning(
|
||||
"semantic_query supplied but embeddings context is unavailable; "
|
||||
"returning empty results."
|
||||
)
|
||||
return JSONResponse(content=[])
|
||||
|
||||
after = parse_iso_to_timestamp(arguments.get("after"))
|
||||
before = parse_iso_to_timestamp(arguments.get("before"))
|
||||
|
||||
camera_arg = arguments.get("camera")
|
||||
if camera_arg and camera_arg != "all":
|
||||
if camera_arg not in allowed_cameras:
|
||||
return JSONResponse(content=[])
|
||||
cameras = [camera_arg]
|
||||
else:
|
||||
cameras = list(allowed_cameras) if allowed_cameras else []
|
||||
|
||||
if not cameras:
|
||||
return JSONResponse(content=[])
|
||||
|
||||
label = arguments.get("label")
|
||||
sub_label = arguments.get("sub_label")
|
||||
|
||||
zones = arguments.get("zones")
|
||||
if isinstance(zones, list) and zones:
|
||||
zones = _resolve_zones(zones, config, cameras)
|
||||
else:
|
||||
zones = None
|
||||
|
||||
limit = int(arguments.get("limit", 25))
|
||||
limit = max(1, min(limit, 100))
|
||||
|
||||
visual_distances: Dict[str, float] = {}
|
||||
description_distances: Dict[str, float] = {}
|
||||
try:
|
||||
rows = context.search_thumbnail(semantic_query)
|
||||
visual_distances = {row[0]: row[1] for row in rows}
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"search_thumbnail failed for semantic_query: %s", semantic_query
|
||||
)
|
||||
|
||||
try:
|
||||
rows = context.search_description(semantic_query)
|
||||
description_distances = {row[0]: row[1] for row in rows}
|
||||
except Exception:
|
||||
logger.exception(
|
||||
"search_description failed for semantic_query: %s", semantic_query
|
||||
)
|
||||
|
||||
vec_ids = set(visual_distances) | set(description_distances)
|
||||
if not vec_ids:
|
||||
return JSONResponse(content=[])
|
||||
|
||||
clauses = [Event.id.in_(list(vec_ids)), Event.camera.in_(cameras)]
|
||||
if after is not None:
|
||||
clauses.append(Event.start_time >= after)
|
||||
if before is not None:
|
||||
clauses.append(Event.start_time <= before)
|
||||
if label:
|
||||
clauses.append(Event.label == label)
|
||||
if sub_label:
|
||||
# case-insensitive match to mirror events() behavior
|
||||
clauses.append(fn.LOWER(Event.sub_label.cast("text")) == sub_label.lower())
|
||||
if zones:
|
||||
zone_clauses = [Event.zones.cast("text") % f'*"{zone}"*' for zone in zones]
|
||||
clauses.append(reduce(operator.or_, zone_clauses))
|
||||
|
||||
eligible = {e.id: e for e in Event.select().where(reduce(operator.and_, clauses))}
|
||||
|
||||
scored: List[tuple[str, float]] = []
|
||||
for eid in eligible:
|
||||
v_score = (
|
||||
distance_to_score(visual_distances[eid], context.thumb_stats)
|
||||
if eid in visual_distances
|
||||
else None
|
||||
)
|
||||
d_score = (
|
||||
distance_to_score(description_distances[eid], context.desc_stats)
|
||||
if eid in description_distances
|
||||
else None
|
||||
)
|
||||
fused = fuse_scores(v_score, d_score)
|
||||
if fused is None:
|
||||
continue
|
||||
scored.append((eid, fused))
|
||||
|
||||
scored.sort(key=lambda pair: pair[1], reverse=True)
|
||||
scored = scored[:limit]
|
||||
|
||||
results = [hydrate_event(eligible[eid], score=score) for eid, score in scored]
|
||||
return JSONResponse(content=results)
|
||||
|
||||
|
||||
async def _execute_find_similar_objects(
|
||||
request: Request,
|
||||
arguments: Dict[str, Any],
|
||||
@ -696,9 +886,7 @@ async def execute_tool(
|
||||
logger.debug(f"Executing tool: {tool_name} with arguments: {arguments}")
|
||||
|
||||
if tool_name == "search_objects":
|
||||
return await _execute_search_objects(
|
||||
arguments, allowed_cameras, request.app.frigate_config
|
||||
)
|
||||
return await _execute_search_objects(request, arguments, allowed_cameras)
|
||||
|
||||
if tool_name == "find_similar_objects":
|
||||
result = await _execute_find_similar_objects(
|
||||
@ -878,9 +1066,7 @@ async def _execute_tool_internal(
|
||||
This is used by the chat completion endpoint to execute tools.
|
||||
"""
|
||||
if tool_name == "search_objects":
|
||||
response = await _execute_search_objects(
|
||||
arguments, allowed_cameras, request.app.frigate_config
|
||||
)
|
||||
response = await _execute_search_objects(request, arguments, allowed_cameras)
|
||||
try:
|
||||
if hasattr(response, "body"):
|
||||
body_str = response.body.decode("utf-8")
|
||||
@ -1293,7 +1479,9 @@ async def chat_completion(
|
||||
status_code=400,
|
||||
)
|
||||
|
||||
tools = get_tool_definitions()
|
||||
config = request.app.frigate_config
|
||||
semantic_search_enabled = bool(getattr(config.semantic_search, "enabled", False))
|
||||
tools = get_tool_definitions(semantic_search_enabled=semantic_search_enabled)
|
||||
conversation = []
|
||||
|
||||
current_datetime = datetime.now()
|
||||
@ -1301,7 +1489,6 @@ async def chat_completion(
|
||||
current_time_str = current_datetime.strftime("%I:%M:%S %p")
|
||||
|
||||
cameras_info = []
|
||||
config = request.app.frigate_config
|
||||
has_speed_zone = False
|
||||
for camera_id in allowed_cameras:
|
||||
if camera_id not in config.cameras:
|
||||
@ -1339,6 +1526,15 @@ async def chat_completion(
|
||||
)
|
||||
speed_units_section = f"\n\nReport object speeds to the user in {speed_unit}."
|
||||
|
||||
semantic_search_section = ""
|
||||
if semantic_search_enabled:
|
||||
semantic_search_section = (
|
||||
"\n\nWhen routing a search_objects call, pick filters by the shape of the user's request:\n"
|
||||
"- Generic class ('show me all cars today'): set `label` only.\n"
|
||||
"- Specific named entity — a known person ('John'), delivery company ('Amazon'), animal species/breed ('blue jay', 'cardinal', 'golden retriever'), or license plate: set `sub_label` only and leave `label` unset.\n"
|
||||
"- Physical characteristic, appearance, or activity that is NOT a discrete name ('find me people riding a lawn mower', 'someone in a red jacket', 'a person carrying a package'): set `semantic_query` with the descriptive phrase, optionally combined with `label` for the object class. Never put descriptive phrases in `sub_label`."
|
||||
)
|
||||
|
||||
system_prompt = f"""You are a helpful assistant for Frigate, a security camera NVR system. You help users answer questions about their cameras, detected objects, and events.
|
||||
|
||||
Current server local date and time: {current_date_str} at {current_time_str}
|
||||
@ -1350,7 +1546,7 @@ When users ask about "today", "yesterday", "this week", etc., use the current da
|
||||
When searching for objects or events, use ISO 8601 format for dates (e.g., {current_date_str}T00:00:00Z for the start of today).
|
||||
Always be accurate with time calculations based on the current date provided.
|
||||
|
||||
When a user refers to a specific object they have seen or describe with identifying details ("that green car", "the person in the red jacket", "a package left today"), prefer the find_similar_objects tool over search_objects. Use search_objects first only to locate the anchor event, then pass its id to find_similar_objects. For generic queries like "show me all cars today", keep using search_objects. If a user message begins with [attached_event:<id>], treat that event id as the anchor for any similarity or "tell me more" request in the same message and call find_similar_objects with that id.{cameras_section}{speed_units_section}"""
|
||||
When a user refers to a specific object they have seen or describe with identifying details ("that green car", "the person in the red jacket", "a package left today"), prefer the find_similar_objects tool over search_objects. Use search_objects first only to locate the anchor event, then pass its id to find_similar_objects. For generic queries like "show me all cars today", keep using search_objects. If a user message begins with [attached_event:<id>], treat that event id as the anchor for any similarity or "tell me more" request in the same message and call find_similar_objects with that id.{semantic_search_section}{cameras_section}{speed_units_section}"""
|
||||
|
||||
conversation.append(
|
||||
{
|
||||
|
||||
Loading…
x
Reference in New Issue
Block a user