diff --git a/docker/main/Dockerfile b/docker/main/Dockerfile index ca5705e49..78e303a8b 100644 --- a/docker/main/Dockerfile +++ b/docker/main/Dockerfile @@ -243,11 +243,9 @@ RUN wget -q https://bootstrap.pypa.io/get-pip.py -O get-pip.py \ RUN --mount=type=bind,from=wheels,source=/wheels,target=/deps/wheels \ pip3 install -U /deps/wheels/*.whl -# Copy memryx setup script -COPY ./docker/main/install_memryx.sh /deps/install_memryx.sh - -# Install MemryX runtime for Frigate -RUN chmod +x /deps/install_memryx.sh && /deps/install_memryx.sh +# Install MemryX runtime (requires libgomp (OpenMP) in the final docker image) +RUN --mount=type=bind,source=docker/main/install_memryx.sh,target=/deps/install_memryx.sh \ + bash -c "apt update && apt install -y libgomp1 && bash /deps/install_memryx.sh && rm -rf /var/lib/apt/lists/*" COPY --from=deps-rootfs / / diff --git a/docker/main/install_memryx.sh b/docker/main/install_memryx.sh index e82da7577..f96181ae0 100644 --- a/docker/main/install_memryx.sh +++ b/docker/main/install_memryx.sh @@ -1,11 +1,11 @@ #!/bin/bash set -e -# Update and install required system packages -apt-get update && apt-get install -y git libgomp1 - -# Clone the MemryX runtime repo -git clone https://github.com/memryx/mx_accl_frigate.git /opt/mx_accl_frigate +# Download the MxAccl for Frigate source archive (main branch) +wget https://github.com/memryx/mx_accl_frigate/archive/refs/heads/main.zip -O /tmp/mxaccl.zip +unzip /tmp/mxaccl.zip -d /tmp +mv /tmp/mx_accl_frigate-main /opt/mx_accl_frigate +rm /tmp/mxaccl.zip # Install Python dependencies pip3 install -r /opt/mx_accl_frigate/freeze diff --git a/docker/memryx/user_installation.sh b/docker/memryx/user_installation.sh index ee6c219d3..20c9b8ece 100644 --- a/docker/memryx/user_installation.sh +++ b/docker/memryx/user_installation.sh @@ -9,6 +9,8 @@ arch=$(uname -m) # Purge existing packages and repo echo "Removing old MemryX installations..." 
+# Remove any holds on MemryX packages (if they exist) +sudo apt-mark unhold memx-* mxa-manager || true sudo apt purge -y memx-* mxa-manager || true sudo rm -f /etc/apt/sources.list.d/memryx.list /etc/apt/trusted.gpg.d/memryx.asc diff --git a/docs/docs/configuration/object_detectors.md b/docs/docs/configuration/object_detectors.md index 56f4a1722..8f2c5287b 100644 --- a/docs/docs/configuration/object_detectors.md +++ b/docs/docs/configuration/object_detectors.md @@ -241,155 +241,7 @@ Hailo8 supports all models in the Hailo Model Zoo that include HailoRT post-proc --- -## MemryX MX3 -This detector is available for use with the MemryX MX3 accelerator M.2 module. Frigate supports the MX3 on compatible hardware platforms, providing efficient and high-performance object detection. - -See the [installation docs](../frigate/installation.md#memryx-mx3) for information on configuring the MemryX hardware. - -To configure a MemryX detector, simply set the `type` attribute to `memryx` and follow the configuration guide below. - -### Configuration - -To configure the MemryX detector, use the following example configuration: - -#### Single PCIe MemryX MX3 - -```yaml -detectors: - memx0: - type: memryx - device: PCIe:0 -``` - -#### Multiple PCIe MemryX MX3 Modules - -```yaml -detectors: - memx0: - type: memryx - device: PCIe:0 - - memx1: - type: memryx - device: PCIe:1 - - memx2: - type: memryx - device: PCIe:2 -``` - -### Supported Models - -MemryX `.dfp` models are automatically downloaded at runtime, if enabled, to the container at `/memryx_models/model_folder/`. - -#### YOLO-NAS - -The [YOLO-NAS](https://github.com/Deci-AI/super-gradients/blob/master/YOLONAS.md) model included in this detector is downloaded from the [Models Section](#downloading-yolo-nas-model) and compiled to DFP with [mx_nc](https://developer.memryx.com/tools/neural_compiler.html#usage). - -The input size for **YOLO-NAS** can be set to either **320x320** (default) or **640x640**. 
- -- The default size of **320x320** is optimized for lower CPU usage and faster inference times. - -##### Configuration - -Below is the recommended configuration for using the **YOLO-NAS** (small) model with the MemryX detector: - -```yaml -detectors: - memx0: - type: memryx - device: PCIe:0 - -model: - model_type: yolonas - width: 320 # (Can be set to 640 for higher resolution) - height: 320 # (Can be set to 640 for higher resolution) - input_tensor: nchw - input_dtype: float - # path: yolo_nas_s.dfp ##Model is normally fetched through the runtime, so 'path' can be omitted.## - labelmap_path: /labelmap/coco-80.txt -``` - -#### YOLOX - -The model is sourced from the [OpenCV Model Zoo](https://github.com/opencv/opencv_zoo) and precompiled to DFP. - -##### Configuration - -Below is the recommended configuration for using the **YOLOX** (small) model with the MemryX detector: - -```yaml -detectors: - memx0: - type: memryx - device: PCIe:0 - -model: - model_type: yolox - width: 640 - height: 640 - input_tensor: nchw - input_dtype: float_denorm - # path: YOLOX_640_640_3_onnx.dfp ##Model is normally fetched through the runtime, so 'path' can be omitted.## - labelmap_path: /labelmap/coco-80.txt -``` - -#### YOLOv9 - -The YOLOv9s model included in this detector is downloaded from [the original GitHub](https://github.com/WongKinYiu/yolov9) like in the [Models Section](#yolov9-1) and compiled to DFP with [mx_nc](https://developer.memryx.com/tools/neural_compiler.html#usage). 
- -##### Configuration - -Below is the recommended configuration for using the **YOLOv9** (small) model with the MemryX detector: - -```yaml -detectors: - memx0: - type: memryx - device: PCIe:0 - -model: - model_type: yolo-generic - width: 320 # (Can be set to 640 for higher resolution) - height: 320 # (Can be set to 640 for higher resolution) - input_tensor: nchw - input_dtype: float - # path: YOLO_v9_small_onnx.dfp ##Model is normally fetched through the runtime, so 'path' can be omitted.## - labelmap_path: /labelmap/coco-80.txt -``` - -#### SSDLite MobileNet v2 - -The model is sourced from the [OpenMMLab Model Zoo](https://mmdeploy-oss.openmmlab.com/model/mmdet-det/ssdlite-e8679f.onnx) and has been converted to DFP. - -##### Configuration - -Below is the recommended configuration for using the **SSDLite MobileNet v2** model with the MemryX detector: - -```yaml -detectors: - memx0: - type: memryx - device: PCIe:0 - -model: - model_type: ssd - width: 320 - height: 320 - input_tensor: nchw - input_dtype: float - # path: SSDlite_MobileNet_v2_320_320_3_onnx.dfp ##Model is normally fetched during runtime, so 'path' can be omitted.## - labelmap_path: /labelmap/coco-80.txt -``` - -#### Using a Custom Model - -To use your own model, bind-mount the path to your compiled `.dfp` file into the container and specify its path using `model.path`. You will also have to update the `labelmap` accordingly. - -For detailed instructions on compiling models, refer to the [MemryX Compiler](https://developer.memryx.com/tools/neural_compiler.html#usage) docs and [Tutorials](https://developer.memryx.com/tutorials/tutorials.html). - ---- ## OpenVINO Detector @@ -850,6 +702,196 @@ To verify that the integration is working correctly, start Frigate and observe t # Community Supported Detectors +## MemryX MX3 + +This detector is available for use with the MemryX MX3 accelerator M.2 module. 
Frigate supports the MX3 on compatible hardware platforms, providing efficient and high-performance object detection. + +See the [installation docs](../frigate/installation.md#memryx-mx3) for information on configuring the MemryX hardware. + +To configure a MemryX detector, simply set the `type` attribute to `memryx` and follow the configuration guide below. + +### Configuration + +To configure the MemryX detector, use the following example configuration: + +#### Single PCIe MemryX MX3 + +```yaml +detectors: + memx0: + type: memryx + device: PCIe:0 +``` + +#### Multiple PCIe MemryX MX3 Modules + +```yaml +detectors: + memx0: + type: memryx + device: PCIe:0 + + memx1: + type: memryx + device: PCIe:1 + + memx2: + type: memryx + device: PCIe:2 +``` + +### Supported Models + +MemryX `.dfp` models are automatically downloaded at runtime, if enabled, to the container at `/memryx_models/model_folder/`. + +#### YOLO-NAS + +The [YOLO-NAS](https://github.com/Deci-AI/super-gradients/blob/master/YOLONAS.md) model included in this detector is downloaded from the [Models Section](#downloading-yolo-nas-model) and compiled to DFP with [mx_nc](https://developer.memryx.com/tools/neural_compiler.html#usage). + +**Note:** The default model for the MemryX detector is YOLO-NAS 320x320. + +The input size for **YOLO-NAS** can be set to either **320x320** (default) or **640x640**. + +- The default size of **320x320** is optimized for lower CPU usage and faster inference times. 
+ +##### Configuration + +Below is the recommended configuration for using the **YOLO-NAS** (small) model with the MemryX detector: + +```yaml +detectors: + memx0: + type: memryx + device: PCIe:0 + +model: + model_type: yolonas + width: 320 # (Can be set to 640 for higher resolution) + height: 320 # (Can be set to 640 for higher resolution) + input_tensor: nchw + input_dtype: float + labelmap_path: /labelmap/coco-80.txt + # Optional: The model is normally fetched through the runtime, so 'path' can be omitted unless you want to use a custom or local model. + # path: /config/yolonas.zip + # The .zip file must contain: + # ├── yolonas.dfp (a file ending with .dfp) + # └── yolonas_post.onnx (optional; only if the model includes a cropped post-processing network) +``` + +#### YOLOv9 + +The YOLOv9s model included in this detector is downloaded from [the original GitHub](https://github.com/WongKinYiu/yolov9) like in the [Models Section](#yolov9-1) and compiled to DFP with [mx_nc](https://developer.memryx.com/tools/neural_compiler.html#usage). + +##### Configuration + +Below is the recommended configuration for using the **YOLOv9** (small) model with the MemryX detector: + +```yaml +detectors: + memx0: + type: memryx + device: PCIe:0 + +model: + model_type: yolo-generic + width: 320 # (Can be set to 640 for higher resolution) + height: 320 # (Can be set to 640 for higher resolution) + input_tensor: nchw + input_dtype: float + labelmap_path: /labelmap/coco-80.txt + # Optional: The model is normally fetched through the runtime, so 'path' can be omitted unless you want to use a custom or local model. + # path: /config/yolov9.zip + # The .zip file must contain: + # ├── yolov9.dfp (a file ending with .dfp) + # └── yolov9_post.onnx (optional; only if the model includes a cropped post-processing network) +``` + +#### YOLOX + +The model is sourced from the [OpenCV Model Zoo](https://github.com/opencv/opencv_zoo) and precompiled to DFP. 
+ +##### Configuration + +Below is the recommended configuration for using the **YOLOX** (small) model with the MemryX detector: + +```yaml +detectors: + memx0: + type: memryx + device: PCIe:0 + +model: + model_type: yolox + width: 640 + height: 640 + input_tensor: nchw + input_dtype: float_denorm + labelmap_path: /labelmap/coco-80.txt + # Optional: The model is normally fetched through the runtime, so 'path' can be omitted unless you want to use a custom or local model. + # path: /config/yolox.zip + # The .zip file must contain: + # ├── yolox.dfp (a file ending with .dfp) +``` + +#### SSDLite MobileNet v2 + +The model is sourced from the [OpenMMLab Model Zoo](https://mmdeploy-oss.openmmlab.com/model/mmdet-det/ssdlite-e8679f.onnx) and has been converted to DFP. + +##### Configuration + +Below is the recommended configuration for using the **SSDLite MobileNet v2** model with the MemryX detector: + +```yaml +detectors: + memx0: + type: memryx + device: PCIe:0 + +model: + model_type: ssd + width: 320 + height: 320 + input_tensor: nchw + input_dtype: float + labelmap_path: /labelmap/coco-80.txt + # Optional: The model is normally fetched through the runtime, so 'path' can be omitted unless you want to use a custom or local model. + # path: /config/ssdlite_mobilenet.zip + # The .zip file must contain: + # ├── ssdlite_mobilenet.dfp (a file ending with .dfp) + # └── ssdlite_mobilenet_post.onnx (optional; only if the model includes a cropped post-processing network) +``` + +#### Using a Custom Model + +To use your own model: + +1. Package your compiled model into a `.zip` file. + +2. The `.zip` must contain the compiled `.dfp` file. + +3. Depending on the model, the compiler may also generate a cropped post-processing network. If present, it will be named with the suffix `_post.onnx`. + +4. Bind-mount the `.zip` file into the container and specify its path using `model.path` in your config. + +5. Update the `labelmap_path` to match your custom model's labels. 
+ +For detailed instructions on compiling models, refer to the [MemryX Compiler](https://developer.memryx.com/tools/neural_compiler.html#usage) docs and [Tutorials](https://developer.memryx.com/tutorials/tutorials.html). + +```yaml + # The detector automatically selects the default model if nothing is provided in the config. + # + # Optionally, you can specify a local model path as a .zip file to override the default. + # If a local path is provided and the file exists, it will be used instead of downloading. + # + # Example: + # path: /config/yolonas.zip + # + # The .zip file must contain: + # ├── yolonas.dfp (a file ending with .dfp) + # └── yolonas_post.onnx (optional; only if the model includes a cropped post-processing network) +``` +--- + ## NVidia TensorRT Detector Nvidia Jetson devices may be used for object detection using the TensorRT libraries. Due to the size of the additional libraries, this detector is only provided in images with the `-tensorrt-jp6` tag suffix, e.g. `ghcr.io/blakeblackshear/frigate:stable-tensorrt-jp6`. This detector is designed to work with Yolo models for object detection. diff --git a/frigate/detectors/plugins/memryx.py b/frigate/detectors/plugins/memryx.py index 906333ebb..4c251ef0f 100644 --- a/frigate/detectors/plugins/memryx.py +++ b/frigate/detectors/plugins/memryx.py @@ -1,5 +1,7 @@ +import glob import logging import os +import shutil import time import urllib.request import zipfile @@ -184,10 +186,72 @@ class MemryXDetector(DetectionApi): self.const_C = np.load(f"{base}/_model_22_Constant_12_output_0.npy") def check_and_prepare_model(self): - """Check if models exist; if not, download and extract them.""" if not os.path.exists(self.cache_dir): - os.makedirs(self.cache_dir) + os.makedirs(self.cache_dir, exist_ok=True) + # ---------- CASE 1: user provided a custom model path ---------- + if self.memx_model_path: + if not self.memx_model_path.endswith(".zip"): + raise ValueError( + f"Invalid model path: {self.memx_model_path}. 
" + "Only .zip files are supported. Please provide a .zip model archive." + ) + if not os.path.exists(self.memx_model_path): + raise FileNotFoundError( + f"Custom model zip not found: {self.memx_model_path}" + ) + + logger.info(f"User provided zip model: {self.memx_model_path}") + + # Extract custom zip into a separate area so it never clashes with MemryX cache + custom_dir = os.path.join( + self.cache_dir, "custom_models", self.model_folder + ) + if os.path.isdir(custom_dir): + shutil.rmtree(custom_dir) + os.makedirs(custom_dir, exist_ok=True) + + with zipfile.ZipFile(self.memx_model_path, "r") as zip_ref: + zip_ref.extractall(custom_dir) + logger.info(f"Custom model extracted to {custom_dir}.") + + # Find .dfp and optional *_post.onnx recursively + dfp_candidates = glob.glob( + os.path.join(custom_dir, "**", "*.dfp"), recursive=True + ) + post_candidates = glob.glob( + os.path.join(custom_dir, "**", "*_post.onnx"), recursive=True + ) + + if not dfp_candidates: + raise FileNotFoundError( + "No .dfp file found in custom model zip after extraction." + ) + + self.memx_model_path = dfp_candidates[0] + + # Handle post model requirements by model type + if self.memx_model_type in [ + ModelTypeEnum.yologeneric, + ModelTypeEnum.yolonas, + ModelTypeEnum.ssd, + ]: + if not post_candidates: + raise FileNotFoundError( + f"No *_post.onnx file found in custom model zip for {self.memx_model_type.name}." 
+ ) + self.memx_post_model = post_candidates[0] + elif self.memx_model_type == ModelTypeEnum.yolox: + # Explicitly ignore any post model even if present + self.memx_post_model = None + else: + # Future model types can optionally use post if present + self.memx_post_model = post_candidates[0] if post_candidates else None + + logger.info(f"Using custom model: {self.memx_model_path}") + return + + # ---------- CASE 2: no custom model path -> use MemryX cached models ---------- model_subdir = os.path.join(self.cache_dir, self.model_folder) dfp_path = os.path.join(model_subdir, self.expected_dfp_model) post_path = ( @@ -207,7 +271,10 @@ class MemryXDetector(DetectionApi): self.load_yolo_constants() return - logger.info(f"Model files not found. Downloading from {self.model_url}...") + # ---------- CASE 3: download MemryX model (no cache) ---------- + logger.info( + f"Model files not found locally. Downloading from {self.model_url}..." + ) zip_path = os.path.join(self.cache_dir, f"{self.model_folder}.zip") try: @@ -231,14 +298,13 @@ class MemryXDetector(DetectionApi): if self.memx_model_type == ModelTypeEnum.yologeneric: self.load_yolo_constants() - except Exception as e: - logger.error(f"Failed to prepare model: {e}") - raise - finally: if os.path.exists(zip_path): - os.remove(zip_path) - logger.info("Cleaned up ZIP file after extraction.") + try: + os.remove(zip_path) + logger.info("Cleaned up ZIP file after extraction.") + except Exception as e: + logger.warning(f"Failed to remove downloaded zip {zip_path}: {e}") def send_input(self, connection_id, tensor_input: np.ndarray): """Pre-process (if needed) and send frame to MemryX input queue""" @@ -545,91 +611,102 @@ class MemryXDetector(DetectionApi): def process_output(self, *outputs): """Output callback function -- receives frames from the MX3 and triggers post-processing""" if self.memx_model_type == ModelTypeEnum.yologeneric: - conv_out1 = outputs[0] - conv_out2 = outputs[1] - conv_out3 = outputs[2] - conv_out4 = 
outputs[3] - conv_out5 = outputs[4] - conv_out6 = outputs[5] + if not self.memx_post_model: + conv_out1 = outputs[0] + conv_out2 = outputs[1] + conv_out3 = outputs[2] + conv_out4 = outputs[3] + conv_out5 = outputs[4] + conv_out6 = outputs[5] - concat_1 = self.onnx_concat([conv_out1, conv_out2], axis=1) - concat_2 = self.onnx_concat([conv_out3, conv_out4], axis=1) - concat_3 = self.onnx_concat([conv_out5, conv_out6], axis=1) + concat_1 = self.onnx_concat([conv_out1, conv_out2], axis=1) + concat_2 = self.onnx_concat([conv_out3, conv_out4], axis=1) + concat_3 = self.onnx_concat([conv_out5, conv_out6], axis=1) - shape = np.array([1, 144, -1], dtype=np.int64) + shape = np.array([1, 144, -1], dtype=np.int64) - reshaped_1 = self.onnx_reshape_with_allowzero(concat_1, shape, allowzero=0) - reshaped_2 = self.onnx_reshape_with_allowzero(concat_2, shape, allowzero=0) - reshaped_3 = self.onnx_reshape_with_allowzero(concat_3, shape, allowzero=0) + reshaped_1 = self.onnx_reshape_with_allowzero( + concat_1, shape, allowzero=0 + ) + reshaped_2 = self.onnx_reshape_with_allowzero( + concat_2, shape, allowzero=0 + ) + reshaped_3 = self.onnx_reshape_with_allowzero( + concat_3, shape, allowzero=0 + ) - concat_4 = self.onnx_concat([reshaped_1, reshaped_2, reshaped_3], 2) + concat_4 = self.onnx_concat([reshaped_1, reshaped_2, reshaped_3], 2) - axis = 1 - split_sizes = [64, 80] + axis = 1 + split_sizes = [64, 80] - # Calculate indices at which to split - indices = np.cumsum(split_sizes)[ - :-1 - ] # [64] — split before the second chunk + # Calculate indices at which to split + indices = np.cumsum(split_sizes)[ + :-1 + ] # [64] — split before the second chunk - # Perform split along axis 1 - split_0, split_1 = np.split(concat_4, indices, axis=axis) + # Perform split along axis 1 + split_0, split_1 = np.split(concat_4, indices, axis=axis) - num_boxes = 2100 if self.memx_model_height == 320 else 8400 - shape1 = np.array([1, 4, 16, num_boxes]) - reshape_4 = 
self.onnx_reshape_with_allowzero(split_0, shape1, allowzero=0) + num_boxes = 2100 if self.memx_model_height == 320 else 8400 + shape1 = np.array([1, 4, 16, num_boxes]) + reshape_4 = self.onnx_reshape_with_allowzero( + split_0, shape1, allowzero=0 + ) - transpose_1 = reshape_4.transpose(0, 2, 1, 3) + transpose_1 = reshape_4.transpose(0, 2, 1, 3) - axis = 1 # As per ONNX softmax node + axis = 1 # As per ONNX softmax node - # Subtract max for numerical stability - x_max = np.max(transpose_1, axis=axis, keepdims=True) - x_exp = np.exp(transpose_1 - x_max) - x_sum = np.sum(x_exp, axis=axis, keepdims=True) - softmax_output = x_exp / x_sum + # Subtract max for numerical stability + x_max = np.max(transpose_1, axis=axis, keepdims=True) + x_exp = np.exp(transpose_1 - x_max) + x_sum = np.sum(x_exp, axis=axis, keepdims=True) + softmax_output = x_exp / x_sum - # Weight W from the ONNX initializer (1, 16, 1, 1) with values 0 to 15 - W = np.arange(16, dtype=np.float32).reshape(1, 16, 1, 1) # (1, 16, 1, 1) + # Weight W from the ONNX initializer (1, 16, 1, 1) with values 0 to 15 + W = np.arange(16, dtype=np.float32).reshape( + 1, 16, 1, 1 + ) # (1, 16, 1, 1) - # Apply 1x1 convolution: this is a weighted sum over channels - conv_output = np.sum( - softmax_output * W, axis=1, keepdims=True - ) # shape: (1, 1, 4, 8400) + # Apply 1x1 convolution: this is a weighted sum over channels + conv_output = np.sum( + softmax_output * W, axis=1, keepdims=True + ) # shape: (1, 1, 4, 8400) - shape2 = np.array([1, 4, num_boxes]) - reshape_5 = self.onnx_reshape_with_allowzero( - conv_output, shape2, allowzero=0 - ) + shape2 = np.array([1, 4, num_boxes]) + reshape_5 = self.onnx_reshape_with_allowzero( + conv_output, shape2, allowzero=0 + ) - # ONNX Slice — get first 2 channels: [0:2] along axis 1 - slice_output1 = reshape_5[:, 0:2, :] # Result: (1, 2, 8400) + # ONNX Slice — get first 2 channels: [0:2] along axis 1 + slice_output1 = reshape_5[:, 0:2, :] # Result: (1, 2, 8400) - # Slice channels 2 to 
4 → axis = 1 - slice_output2 = reshape_5[:, 2:4, :] + # Slice channels 2 to 4 → axis = 1 + slice_output2 = reshape_5[:, 2:4, :] - # Perform Subtraction - sub_output = self.const_A - slice_output1 # Equivalent to ONNX Sub + # Perform Subtraction + sub_output = self.const_A - slice_output1 # Equivalent to ONNX Sub - # Perform the ONNX-style Add - add_output = self.const_B + slice_output2 + # Perform the ONNX-style Add + add_output = self.const_B + slice_output2 - sub1 = add_output - sub_output + sub1 = add_output - sub_output - add1 = sub_output + add_output + add1 = sub_output + add_output - div_output = add1 / 2.0 + div_output = add1 / 2.0 - concat_5 = self.onnx_concat([div_output, sub1], axis=1) + concat_5 = self.onnx_concat([div_output, sub1], axis=1) - # Expand B to (1, 1, 8400) so it can broadcast across axis=1 (4 channels) - const_C_expanded = self.const_C[:, np.newaxis, :] # Shape: (1, 1, 8400) + # Expand B to (1, 1, 8400) so it can broadcast across axis=1 (4 channels) + const_C_expanded = self.const_C[:, np.newaxis, :] # Shape: (1, 1, 8400) - # Perform ONNX-style element-wise multiplication - mul_output = concat_5 * const_C_expanded # Result: (1, 4, 8400) + # Perform ONNX-style element-wise multiplication + mul_output = concat_5 * const_C_expanded # Result: (1, 4, 8400) - sigmoid_output = self.sigmoid(split_1) - outputs = self.onnx_concat([mul_output, sigmoid_output], axis=1) + sigmoid_output = self.sigmoid(split_1) + outputs = self.onnx_concat([mul_output, sigmoid_output], axis=1) final_detections = post_process_yolo( outputs, self.memx_model_width, self.memx_model_height