Add file id col to Document db model

fix(pruning): Add pruning dashboard panels and reorder layout (#10279 )
fix(logos): github logo displays correctly in dark mode (#10269 )
2026-04-17 07:26:45 +00:00 · 2026-04-16 15:50:29 -07:00 · 2026-04-16 22:09:33 +00:00 · 2026-04-16 22:03:49 +00:00 · 2026-04-16 21:33:44 +00:00 · 2026-04-16 21:26:43 +00:00
767 changed files with 23053 additions and 14991 deletions
--- a/.devcontainer/Dockerfile
+++ b/.devcontainer/Dockerfile
@@ -1,8 +1,9 @@
 FROM ubuntu:26.04@sha256:cc925e589b7543b910fea57a240468940003fbfc0515245a495dd0ad8fe7cef1

 RUN apt-get update && apt-get install -y --no-install-recommends \
-  acl \
+  build-essential \
  curl \
+  default-jre \
  fd-find \
  fzf \
  git \
@@ -25,13 +26,11 @@ RUN apt-get update && apt-get install -y --no-install-recommends \
  && curl -fsSL https://deb.nodesource.com/setup_20.x | bash - \
  && apt-get install -y nodejs \
  && install -m 0755 -d /etc/apt/keyrings \
-  && curl -fsSL https://download.docker.com/linux/ubuntu/gpg -o /etc/apt/keyrings/docker.asc \
-  && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/docker.asc] https://download.docker.com/linux/ubuntu $(. /etc/os-release && echo "$VERSION_CODENAME") stable" > /etc/apt/sources.list.d/docker.list \
  && curl -fsSL https://cli.github.com/packages/githubcli-archive-keyring.gpg -o /etc/apt/keyrings/githubcli-archive-keyring.gpg \
  && chmod go+r /etc/apt/keyrings/githubcli-archive-keyring.gpg \
  && echo "deb [arch=$(dpkg --print-architecture) signed-by=/etc/apt/keyrings/githubcli-archive-keyring.gpg] https://cli.github.com/packages stable main" > /etc/apt/sources.list.d/github-cli.list \
  && apt-get update \
-  && apt-get install -y --no-install-recommends docker-ce-cli docker-compose-plugin gh \
+  && apt-get install -y --no-install-recommends gh \
  && apt-get clean && rm -rf /var/lib/apt/lists/*

 # fd-find installs as fdfind on Debian/Ubuntu — symlink to fd
@@ -63,3 +62,11 @@ RUN chsh -s /bin/zsh root && \
    echo '[ -f /workspace/.devcontainer/zshrc ] && . /workspace/.devcontainer/zshrc' >> "$rc"; \
  done && \
  chown dev:dev /home/dev/.zshrc
+
+# Pre-seed GitHub's SSH host keys so git-over-SSH never prompts.  Keys are
+# pinned in-repo (verified against the fingerprints GitHub publishes at
+# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/githubs-ssh-key-fingerprints)
+# rather than fetched at build time, so a compromised build-time network can't
+# inject a rogue key.
+COPY github_known_hosts /etc/ssh/ssh_known_hosts
+RUN chmod 644 /etc/ssh/ssh_known_hosts
--- a/.devcontainer/README.md
+++ b/.devcontainer/README.md
@@ -6,7 +6,7 @@ A containerized development environment for working on Onyx.

 - Ubuntu 26.04 base image
 - Node.js 20, uv, Claude Code
- Docker CLI, GitHub CLI (`gh`)
+- GitHub CLI (`gh`)
 - Neovim, ripgrep, fd, fzf, jq, make, wget, unzip
 - Zsh as default shell (sources host `~/.zshrc` if available)
 - Python venv auto-activation
@@ -14,12 +14,6 @@ A containerized development environment for working on Onyx.

 ## Usage

-### VS Code
-
-1. Install the [Dev Containers extension](https://marketplace.visualstudio.com/items?itemName=ms-vscode-remote.remote-containers)
-2. Open this repo in VS Code
-3. "Reopen in Container" when prompted
-
 ### CLI (`ods dev`)

 The [`ods` devtools CLI](../tools/ods/README.md) provides workspace-aware wrappers
@@ -39,25 +33,8 @@ ods dev exec npm test
 ods dev stop
 ```

-If you don't have `ods` installed, use the `devcontainer` CLI directly:
-
-```bash
-npm install -g @devcontainers/cli
-
-devcontainer up --workspace-folder .
-devcontainer exec --workspace-folder . zsh
-```
-
 ## Restarting the container

-### VS Code
-
-Open the Command Palette (`Ctrl+Shift+P` / `Cmd+Shift+P`) and run:
-
- **Dev Containers: Reopen in Container** — restarts the container without rebuilding
-
-### CLI
-
 ```bash
 # Restart the container
 ods dev restart
@@ -66,12 +43,6 @@ ods dev restart
 ods dev rebuild
 ```

-Or without `ods`:
-
-```bash
-devcontainer up --workspace-folder . --remove-existing-container
-```
-
 ## Image

 The devcontainer uses a prebuilt image published to `onyxdotapp/onyx-devcontainer`.
@@ -88,30 +59,19 @@ The `devcontainer` target is defined in `docker-bake.hcl` at the repo root.
 ## User & permissions

 The container runs as the `dev` user by default (`remoteUser` in devcontainer.json).
-An init script (`init-dev-user.sh`) runs at container start to ensure `dev` has
-read/write access to the bind-mounted workspace:
+An init script (`init-dev-user.sh`) runs at container start to ensure the active
+user has read/write access to the bind-mounted workspace:

 - **Standard Docker** — `dev`'s UID/GID is remapped to match the workspace owner,
  so file permissions work seamlessly.
 - **Rootless Docker** — The workspace appears as root-owned (UID 0) inside the
-  container due to user-namespace mapping. The init script grants `dev` access via
-  POSIX ACLs (`setfacl`), which adds a few seconds to the first container start on
-  large repos.
+  container due to user-namespace mapping. `ods dev up` auto-detects rootless Docker
+  and sets `DEVCONTAINER_REMOTE_USER=root` so the container runs as root — which
+  maps back to your host user via the user namespace. New files are owned by your
+  host UID and no ACL workarounds are needed.

-## Docker socket
-
-The container mounts the host's Docker socket so you can run `docker` commands
-from inside. `ods dev` auto-detects the socket path and sets `DOCKER_SOCK`:
-
-| Environment             | Socket path                    |
-| ----------------------- | ------------------------------ |
-| Linux (rootless Docker) | `$XDG_RUNTIME_DIR/docker.sock` |
-| macOS (Docker Desktop)  | `~/.docker/run/docker.sock`    |
-| Linux (standard Docker) | `/var/run/docker.sock`         |
-
-To override, set `DOCKER_SOCK` before running `ods dev up`. When using the
-VS Code extension or `devcontainer` CLI directly (without `ods`), you must set
-`DOCKER_SOCK` yourself.
+  To override the auto-detection, set `DEVCONTAINER_REMOTE_USER` before running
+  `ods dev up`.

 ## Firewall

--- a/.devcontainer/devcontainer.json
+++ b/.devcontainer/devcontainer.json
@@ -1,20 +1,28 @@
 {
  "name": "Onyx Dev Sandbox",
-  "image": "onyxdotapp/onyx-devcontainer@sha256:12184169c5bcc9cca0388286d5ffe504b569bc9c37bfa631b76ee8eee2064055",
-  "runArgs": ["--cap-add=NET_ADMIN", "--cap-add=NET_RAW"],
+  "image": "onyxdotapp/onyx-devcontainer@sha256:4986c9252289b660ce772b45f0488b938fe425d8114245e96ef64b273b3fcee4",
+  "runArgs": [
+    "--cap-add=NET_ADMIN",
+    "--cap-add=NET_RAW",
+    "--network=onyx_default"
+  ],
  "mounts": [
-    "source=${localEnv:DOCKER_SOCK},target=/var/run/docker.sock,type=bind",
    "source=${localEnv:HOME}/.claude,target=/home/dev/.claude,type=bind",
    "source=${localEnv:HOME}/.claude.json,target=/home/dev/.claude.json,type=bind",
    "source=${localEnv:HOME}/.zshrc,target=/home/dev/.zshrc.host,type=bind,readonly",
-    "source=${localEnv:HOME}/.gitconfig,target=/home/dev/.gitconfig.host,type=bind,readonly",
-    "source=${localEnv:HOME}/.ssh,target=/home/dev/.ssh.host,type=bind,readonly",
-    "source=${localEnv:HOME}/.config/nvim,target=/home/dev/.config/nvim.host,type=bind,readonly",
+    "source=${localEnv:HOME}/.gitconfig,target=/home/dev/.gitconfig,type=bind,readonly",
+    "source=${localEnv:HOME}/.config/nvim,target=/home/dev/.config/nvim,type=bind,readonly",
    "source=onyx-devcontainer-cache,target=/home/dev/.cache,type=volume",
    "source=onyx-devcontainer-local,target=/home/dev/.local,type=volume"
  ],
-  "remoteUser": "dev",
+  "containerEnv": {
+    "SSH_AUTH_SOCK": "/tmp/ssh-agent.sock",
+    "POSTGRES_HOST": "relational_db",
+    "REDIS_HOST": "cache"
+  },
+  "remoteUser": "${localEnv:DEVCONTAINER_REMOTE_USER:dev}",
  "updateRemoteUserUID": false,
+  "initializeCommand": "docker network create onyx_default 2>/dev/null || true",
  "workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=delegated",
  "workspaceFolder": "/workspace",
  "postStartCommand": "sudo bash /workspace/.devcontainer/init-dev-user.sh && sudo bash /workspace/.devcontainer/init-firewall.sh",
--- a/.devcontainer/github_known_hosts
+++ b/.devcontainer/github_known_hosts
@@ -0,0 +1,3 @@
+github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=
+github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=
+github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl
--- a/.devcontainer/init-dev-user.sh
+++ b/.devcontainer/init-dev-user.sh
@@ -8,38 +8,68 @@ set -euo pipefail
 #                    We remap dev to that UID -- fast and seamless.
 #
 # Rootless Docker:   Workspace appears as root-owned (UID 0) inside the
-#                    container due to user-namespace mapping.  We can't remap
-#                    dev to UID 0 (that's root), so we grant access with
-#                    POSIX ACLs instead.
+#                    container due to user-namespace mapping.  Requires
+#                    DEVCONTAINER_REMOTE_USER=root (set automatically by
+#                    ods dev up).  Container root IS the host user, so
+#                    bind-mounts and named volumes are symlinked into /root.

 WORKSPACE=/workspace
 TARGET_USER=dev
+REMOTE_USER="${SUDO_USER:-$TARGET_USER}"

 WS_UID=$(stat -c '%u' "$WORKSPACE")
 WS_GID=$(stat -c '%g' "$WORKSPACE")
 DEV_UID=$(id -u "$TARGET_USER")
 DEV_GID=$(id -g "$TARGET_USER")

-DEV_HOME=/home/"$TARGET_USER"
+# devcontainer.json bind-mounts and named volumes target /home/dev regardless
+# of remoteUser.  When running as root ($HOME=/root), Phase 1 bridges the gap
+# with symlinks from ACTIVE_HOME → MOUNT_HOME.
+MOUNT_HOME=/home/"$TARGET_USER"

-# Ensure directories that tools expect exist under ~dev.
-# ~/.local and ~/.cache are named Docker volumes -- ensure they are owned by dev.
-mkdir -p "$DEV_HOME"/.local/state "$DEV_HOME"/.local/share
-chown -R "$TARGET_USER":"$TARGET_USER" "$DEV_HOME"/.local
-chown -R "$TARGET_USER":"$TARGET_USER" "$DEV_HOME"/.cache
-
-# Copy host configs mounted as *.host into their real locations.
-# This gives the dev user owned copies without touching host originals.
-if [ -d "$DEV_HOME/.ssh.host" ]; then
-    cp -a "$DEV_HOME/.ssh.host" "$DEV_HOME/.ssh"
-    chmod 700 "$DEV_HOME/.ssh"
-    chmod 600 "$DEV_HOME"/.ssh/id_* 2>/dev/null || true
-    chown -R "$TARGET_USER":"$TARGET_USER" "$DEV_HOME/.ssh"
+if [ "$REMOTE_USER" = "root" ]; then
+    ACTIVE_HOME="/root"
+else
+    ACTIVE_HOME="$MOUNT_HOME"
 fi
-if [ -d "$DEV_HOME/.config/nvim.host" ]; then
-    mkdir -p "$DEV_HOME/.config"
-    cp -a "$DEV_HOME/.config/nvim.host" "$DEV_HOME/.config/nvim"
-    chown -R "$TARGET_USER":"$TARGET_USER" "$DEV_HOME/.config/nvim"
+
+# ── Phase 1: home directory setup ───────────────────────────────────
+
+# ~/.local and ~/.cache are named Docker volumes mounted under MOUNT_HOME.
+mkdir -p "$MOUNT_HOME"/.local/state "$MOUNT_HOME"/.local/share
+
+# When running as root, symlink bind-mounts and named volumes into /root
+# so that $HOME-relative tools (Claude Code, git, etc.) find them.
+if [ "$ACTIVE_HOME" != "$MOUNT_HOME" ]; then
+    for item in .claude .cache .local; do
+        [ -d "$MOUNT_HOME/$item" ] || continue
+        if [ -e "$ACTIVE_HOME/$item" ] && [ ! -L "$ACTIVE_HOME/$item" ]; then
+            echo "warning: replacing $ACTIVE_HOME/$item with symlink to $MOUNT_HOME/$item" >&2
+            rm -rf "$ACTIVE_HOME/$item"
+        fi
+        ln -sfn "$MOUNT_HOME/$item" "$ACTIVE_HOME/$item"
+    done
+    # Symlink files (not directories).
+    for file in .claude.json .gitconfig .zshrc.host; do
+        [ -f "$MOUNT_HOME/$file" ] && ln -sf "$MOUNT_HOME/$file" "$ACTIVE_HOME/$file"
+    done
+
+    # Nested mount: .config/nvim
+    if [ -d "$MOUNT_HOME/.config/nvim" ]; then
+        mkdir -p "$ACTIVE_HOME/.config"
+        if [ -e "$ACTIVE_HOME/.config/nvim" ] && [ ! -L "$ACTIVE_HOME/.config/nvim" ]; then
+            echo "warning: replacing $ACTIVE_HOME/.config/nvim with symlink" >&2
+            rm -rf "$ACTIVE_HOME/.config/nvim"
+        fi
+        ln -sfn "$MOUNT_HOME/.config/nvim" "$ACTIVE_HOME/.config/nvim"
+    fi
+fi
+
+# ── Phase 2: workspace access ───────────────────────────────────────
+
+# Root always has workspace access; Phase 1 handled home setup.
+if [ "$REMOTE_USER" = "root" ]; then
+    exit 0
 fi

 # Already matching -- nothing to do.
@@ -61,45 +91,17 @@ if [ "$WS_UID" != "0" ]; then
            echo "warning: failed to remap $TARGET_USER UID to $WS_UID" >&2
        fi
    fi
-    if ! chown -R "$TARGET_USER":"$TARGET_USER" /home/"$TARGET_USER" 2>&1; then
-        echo "warning: failed to chown /home/$TARGET_USER" >&2
+    if ! chown -R "$TARGET_USER":"$TARGET_USER" "$MOUNT_HOME" 2>&1; then
+        echo "warning: failed to chown $MOUNT_HOME" >&2
    fi
 else
    # ── Rootless Docker ──────────────────────────────────────────────
-    # Workspace is root-owned inside the container.  Grant dev access
-    # via POSIX ACLs (preserves ownership, works across the namespace
-    # boundary).
-    if command -v setfacl &>/dev/null; then
-        setfacl -Rm  "u:${TARGET_USER}:rwX" "$WORKSPACE"
-        setfacl -Rdm "u:${TARGET_USER}:rwX" "$WORKSPACE"   # default ACL for new files
-
-        # Git refuses to operate in repos owned by a different UID.
-        # Host gitconfig is mounted readonly as ~/.gitconfig.host.
-        # Create a real ~/.gitconfig that includes it plus container overrides.
-        printf '[include]\n\tpath = %s/.gitconfig.host\n[safe]\n\tdirectory = %s\n' \
-            "$DEV_HOME" "$WORKSPACE" > "$DEV_HOME/.gitconfig"
-        chown "$TARGET_USER":"$TARGET_USER" "$DEV_HOME/.gitconfig"
-
-        # If this is a worktree, the main .git dir is bind-mounted at its
-        # host absolute path. Grant dev access so git operations work.
-        GIT_COMMON_DIR=$(git -C "$WORKSPACE" rev-parse --git-common-dir 2>/dev/null || true)
-        if [ -n "$GIT_COMMON_DIR" ] && [ "$GIT_COMMON_DIR" != "$WORKSPACE/.git" ]; then
-            [ ! -d "$GIT_COMMON_DIR" ] && GIT_COMMON_DIR="$WORKSPACE/$GIT_COMMON_DIR"
-            if [ -d "$GIT_COMMON_DIR" ]; then
-                setfacl -Rm "u:${TARGET_USER}:rwX" "$GIT_COMMON_DIR"
-                setfacl -Rdm "u:${TARGET_USER}:rwX" "$GIT_COMMON_DIR"
-                git config -f "$DEV_HOME/.gitconfig" --add safe.directory "$(dirname "$GIT_COMMON_DIR")"
-            fi
-        fi
-
-        # Also fix bind-mounted dirs under ~dev that appear root-owned.
-        for dir in /home/"$TARGET_USER"/.claude; do
-            [ -d "$dir" ] && setfacl -Rm "u:${TARGET_USER}:rwX" "$dir" && setfacl -Rdm "u:${TARGET_USER}:rwX" "$dir"
-        done
-        [ -f /home/"$TARGET_USER"/.claude.json ] && \
-            setfacl -m "u:${TARGET_USER}:rw" /home/"$TARGET_USER"/.claude.json
-    else
-        echo "warning: setfacl not found; dev user may not have write access to workspace" >&2
-        echo "         install the 'acl' package or set remoteUser to root" >&2
-    fi
+    # Workspace is root-owned (UID 0) due to user-namespace mapping.
+    # The supported path is remoteUser=root (set DEVCONTAINER_REMOTE_USER=root),
+    # which is handled above.  If we reach here, the user is running as dev
+    # under rootless Docker without the override.
+    echo "error: rootless Docker detected but remoteUser is not root." >&2
+    echo "       Set DEVCONTAINER_REMOTE_USER=root before starting the container," >&2
+    echo "       or use 'ods dev up' which sets it automatically." >&2
+    exit 1
 fi
--- a/.devcontainer/init-firewall.sh
+++ b/.devcontainer/init-firewall.sh
@@ -4,22 +4,23 @@ set -euo pipefail

 echo "Setting up firewall..."

-# Preserve docker dns resolution
-DOCKER_DNS_RULES=$(iptables-save | grep -E "^-A.*-d 127.0.0.11/32" || true)
+# Reset default policies to ACCEPT before flushing rules.  On re-runs the
+# previous invocation's DROP policies are still in effect; flushing rules while
+# the default is DROP would block the DNS lookups below.  Register a trap so
+# that if the script exits before the DROP policies are re-applied at the end,
+# we fail closed instead of leaving the container with an unrestricted
+# firewall.
+trap 'iptables -P INPUT DROP; iptables -P OUTPUT DROP; iptables -P FORWARD DROP' EXIT
+iptables -P INPUT ACCEPT
+iptables -P OUTPUT ACCEPT
+iptables -P FORWARD ACCEPT

-# Flush all rules
-iptables -t nat -F
-iptables -t nat -X
-iptables -t mangle -F
-iptables -t mangle -X
+# Only flush the filter table.  The nat and mangle tables are managed by Docker
+# (DNS DNAT to 127.0.0.11, container networking, etc.) and must not be touched —
+# flushing them breaks Docker's embedded DNS resolver.
 iptables -F
 iptables -X

-# Restore docker dns rules
-if [ -n "$DOCKER_DNS_RULES" ]; then
-    echo "$DOCKER_DNS_RULES" | iptables-restore -n
-fi
-
 # Create ipset for allowed destinations
 ipset create allowed-domains hash:net || true
 ipset flush allowed-domains
@@ -34,6 +35,7 @@ done

 # Resolve allowed domains
 ALLOWED_DOMAINS=(
+    "github.com"
    "registry.npmjs.org"
    "api.anthropic.com"
    "api-staging.anthropic.com"
@@ -43,8 +45,16 @@ ALLOWED_DOMAINS=(
    "pypi.org"
    "files.pythonhosted.org"
    "go.dev"
+    "proxy.golang.org"
+    "sum.golang.org"
    "storage.googleapis.com"
+    "dl.google.com"
    "static.rust-lang.org"
+    "index.crates.io"
+    "static.crates.io"
+    "archive.ubuntu.com"
+    "security.ubuntu.com"
+    "deb.nodesource.com"
 )

 for domain in "${ALLOWED_DOMAINS[@]}"; do
@@ -56,14 +66,23 @@ for domain in "${ALLOWED_DOMAINS[@]}"; do
    done
 done

-# Detect host network
-if [[ "${DOCKER_HOST:-}" == "unix://"* ]]; then
-    DOCKER_GATEWAY=$(ip -4 route show | grep "^default" | awk '{print $3}')
+# Allow traffic to the Docker gateway so the container can reach host services
+# (e.g. the Onyx stack at localhost:3000, localhost:8080, etc.)
+DOCKER_GATEWAY=$(ip -4 route show default | awk '{print $3}')
+if [ -n "$DOCKER_GATEWAY" ]; then
    if ! ipset add allowed-domains "$DOCKER_GATEWAY/32" -exist 2>&1; then
        echo "warning: failed to add Docker gateway $DOCKER_GATEWAY to allowlist" >&2
    fi
 fi

+# Allow traffic to all attached Docker network subnets so the container can
+# reach sibling services (e.g. relational_db, cache) on shared compose networks.
+for subnet in $(ip -4 -o addr show scope global | awk '{print $4}'); do
+    if ! ipset add allowed-domains "$subnet" -exist 2>&1; then
+        echo "warning: failed to add Docker subnet $subnet to allowlist" >&2
+    fi
+done
+
 # Set default policies to DROP
 iptables -P FORWARD DROP
 iptables -P INPUT DROP
--- a/.github/workflows/deployment.yml
+++ b/.github/workflows/deployment.yml
@@ -44,7 +44,7 @@ jobs:
          fetch-tags: true

      - name: Setup uv
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          version: "0.9.9"
          enable-cache: false
@@ -165,7 +165,7 @@ jobs:
          fetch-depth: 0

      - name: Setup uv
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          version: "0.9.9"
          # NOTE: This isn't caching much and zizmor suggests this could be poisoned, so disable.
@@ -462,7 +462,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -472,7 +472,7 @@ jobs:

      - name: Build and push AMD64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -536,7 +536,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -546,7 +546,7 @@ jobs:

      - name: Build and push ARM64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -597,7 +597,7 @@ jobs:
          parse-json-secrets: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -676,7 +676,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -686,7 +686,7 @@ jobs:

      - name: Build and push AMD64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -761,7 +761,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -771,7 +771,7 @@ jobs:

      - name: Build and push ARM64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -833,7 +833,7 @@ jobs:
          parse-json-secrets: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -908,7 +908,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -918,7 +918,7 @@ jobs:

      - name: Build and push AMD64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile
@@ -981,7 +981,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -991,7 +991,7 @@ jobs:

      - name: Build and push ARM64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile
@@ -1041,7 +1041,7 @@ jobs:
          parse-json-secrets: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -1119,7 +1119,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -1129,7 +1129,7 @@ jobs:

      - name: Build and push AMD64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile
@@ -1192,7 +1192,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -1202,7 +1202,7 @@ jobs:

      - name: Build and push ARM64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile
@@ -1253,7 +1253,7 @@ jobs:
          parse-json-secrets: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -1329,7 +1329,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
        with:
          buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}

@@ -1341,7 +1341,7 @@ jobs:

      - name: Build and push AMD64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        env:
          DEBUG: ${{ vars.DOCKER_DEBUG == 'true' && 1 || 0 }}
        with:
@@ -1409,7 +1409,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
        with:
          buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}

@@ -1421,7 +1421,7 @@ jobs:

      - name: Build and push ARM64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        env:
          DEBUG: ${{ vars.DOCKER_DEBUG == 'true' && 1 || 0 }}
        with:
@@ -1475,7 +1475,7 @@ jobs:
          parse-json-secrets: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
--- a/.github/workflows/docker-tag-beta.yml
+++ b/.github/workflows/docker-tag-beta.yml
@@ -21,7 +21,7 @@ jobs:
    timeout-minutes: 45
    steps:
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
--- a/.github/workflows/docker-tag-latest.yml
+++ b/.github/workflows/docker-tag-latest.yml
@@ -21,7 +21,7 @@ jobs:
    timeout-minutes: 45
    steps:
      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
--- a/.github/workflows/post-merge-beta-cherry-pick.yml
+++ b/.github/workflows/post-merge-beta-cherry-pick.yml
@@ -114,7 +114,7 @@ jobs:
          ref: main

      - name: Install the latest version of uv
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          enable-cache: false
          version: "0.9.9"
--- a/.github/workflows/pr-integration-tests.yml
+++ b/.github/workflows/pr-integration-tests.yml
@@ -115,7 +115,7 @@ jobs:
          echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      # needed for pulling Vespa, Redis, Postgres, and Minio images
      # otherwise, we hit the "Unauthenticated users" limit
@@ -127,7 +127,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Backend Docker image
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile
@@ -175,7 +175,7 @@ jobs:
          echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      # needed for pulling Vespa, Redis, Postgres, and Minio images
      # otherwise, we hit the "Unauthenticated users" limit
@@ -187,7 +187,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Model Server Docker image
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
@@ -220,7 +220,7 @@ jobs:
          persist-credentials: false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      # needed for pulling openapitools/openapi-generator-cli
      # otherwise, we hit the "Unauthenticated users" limit
--- a/.github/workflows/pr-playwright-tests.yml
+++ b/.github/workflows/pr-playwright-tests.yml
@@ -94,7 +94,7 @@ jobs:
          echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
      # https://docs.docker.com/docker-hub/usage/
@@ -105,7 +105,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Web Docker image
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./web
          file: ./web/Dockerfile
@@ -155,7 +155,7 @@ jobs:
          echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
      # https://docs.docker.com/docker-hub/usage/
@@ -166,7 +166,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Backend Docker image
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile
@@ -216,7 +216,7 @@ jobs:
          echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      # needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
      # https://docs.docker.com/docker-hub/usage/
@@ -227,7 +227,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Build and push Model Server Docker image
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend
          file: ./backend/Dockerfile.model_server
@@ -471,7 +471,7 @@ jobs:

      - name: Install the latest version of uv
        if: always()
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          enable-cache: false
          version: "0.9.9"
@@ -710,7 +710,7 @@ jobs:
      pull-requests: write
    steps:
      - name: Download visual diff summaries
-        uses: actions/download-artifact@70fc10c6e5e1ce46ad2ea6f2b72d43f7d47b13c3
+        uses: actions/download-artifact@3e5f45b2cfb9172054b4087a40e8e0b5a5461e7c
        with:
          pattern: screenshot-diff-summary-*
          path: summaries/
--- a/.github/workflows/pr-python-checks.yml
+++ b/.github/workflows/pr-python-checks.yml
@@ -19,16 +19,16 @@ permissions:
 jobs:
  mypy-check:
    # See https://runs-on.com/runners/linux/
-    # Note: Mypy seems quite optimized for x64 compared to arm64.
-    # Similarly, mypy is single-threaded and incremental, so 2cpu is sufficient.
+    # NOTE: This job is named mypy-check for branch protection compatibility,
+    # but it actually runs ty (astral-sh's Rust type checker).
    runs-on:
      [
        runs-on,
-        runner=2cpu-linux-x64,
+        runner=2cpu-linux-arm64,
        "run-id=${{ github.run_id }}-mypy-check",
        "extras=s3-cache",
      ]
-    timeout-minutes: 45
+    timeout-minutes: 15

    steps:
      - uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
@@ -46,26 +46,7 @@ jobs:
            backend/requirements/model_server.txt
            backend/requirements/ee.txt

-      - name: Generate OpenAPI schema and Python client
-        shell: bash
-        # TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
+      - name: Run ty
        env:
-          LICENSE_ENFORCEMENT_ENABLED: "false"
-        run: |
-          ods openapi all
-
-      - name: Cache mypy cache
-        if: ${{ vars.DISABLE_MYPY_CACHE != 'true' }}
-        uses: runs-on/cache@a5f51d6f3fece787d03b7b4e981c82538a0654ed # ratchet:runs-on/cache@v4
-        with:
-          path: .mypy_cache
-          key: mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-${{ hashFiles('**/*.py', '**/*.pyi', 'pyproject.toml') }}
-          restore-keys: |
-            mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-
-            mypy-${{ runner.os }}-
-
-      - name: Run MyPy
-        env:
-          MYPY_FORCE_COLOR: 1
          TERM: xterm-256color
-        run: mypy .
+        run: ty check --output-format github
--- a/.github/workflows/pr-python-model-tests.yml
+++ b/.github/workflows/pr-python-model-tests.yml
@@ -17,8 +17,6 @@ env:

  # API keys for testing
  COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
-  LITELLM_API_KEY: ${{ secrets.LITELLM_API_KEY }}
-  LITELLM_API_URL: ${{ secrets.LITELLM_API_URL }}
  OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
  AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
  AZURE_API_URL: ${{ vars.AZURE_API_URL }}
@@ -71,7 +69,7 @@ jobs:
          password: ${{ secrets.DOCKER_TOKEN }}

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Build and load
        uses: docker/bake-action@82490499d2e5613fcead7e128237ef0b0ea210f7 # ratchet:docker/bake-action@v7.0.0
--- a/.github/workflows/pr-quality-checks.yml
+++ b/.github/workflows/pr-quality-checks.yml
@@ -38,7 +38,7 @@ jobs:
      - name: Install node dependencies
        working-directory: ./web
        run: npm ci
-      - uses: j178/prek-action@0bb87d7f00b0c99306c8bcb8b8beba1eb581c037 # ratchet:j178/prek-action@v1
+      - uses: j178/prek-action@cbc2f23eb5539cf20d82d1aabd0d0ecbcc56f4e3
        with:
          prek-version: '0.3.4'
          extra-args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || github.event_name == 'merge_group' && format('--from-ref {0} --to-ref {1}', github.event.merge_group.base_sha, github.event.merge_group.head_sha) || github.ref_name == 'main' && '--all-files' || '' }}
--- a/.github/workflows/release-cli.yml
+++ b/.github/workflows/release-cli.yml
@@ -17,7 +17,7 @@ jobs:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
        with:
          persist-credentials: false
-      - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          enable-cache: false
          version: "0.9.9"
--- a/.github/workflows/release-devtools.yml
+++ b/.github/workflows/release-devtools.yml
@@ -26,7 +26,7 @@ jobs:
      - uses: actions/checkout@de0fac2e4500dabe0009e67214ff5f5447ce83dd # ratchet:actions/checkout@v6
        with:
          persist-credentials: false
-      - uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+      - uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          enable-cache: false
          version: "0.9.9"
--- a/.github/workflows/sandbox-deployment.yml
+++ b/.github/workflows/sandbox-deployment.yml
@@ -132,7 +132,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -142,7 +142,7 @@ jobs:

      - name: Build and push AMD64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend/onyx/server/features/build/sandbox/kubernetes/docker
          file: ./backend/onyx/server/features/build/sandbox/kubernetes/docker/Dockerfile
@@ -202,7 +202,7 @@ jobs:
            latest=false

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
@@ -212,7 +212,7 @@ jobs:

      - name: Build and push ARM64
        id: build
-        uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
+        uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
        with:
          context: ./backend/onyx/server/features/build/sandbox/kubernetes/docker
          file: ./backend/onyx/server/features/build/sandbox/kubernetes/docker/Dockerfile
@@ -258,7 +258,7 @@ jobs:
          parse-json-secrets: true

      - name: Set up Docker Buildx
-        uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
+        uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4

      - name: Login to Docker Hub
        uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
--- a/.github/workflows/zizmor.yml
+++ b/.github/workflows/zizmor.yml
@@ -24,7 +24,7 @@ jobs:
          persist-credentials: false

      - name: Install the latest version of uv
-        uses: astral-sh/setup-uv@37802adc94f370d6bfd71619e3f0bf239e1f3b78 # ratchet:astral-sh/setup-uv@v7
+        uses: astral-sh/setup-uv@cec208311dfd045dd5311c1add060b2062131d57 # ratchet:astral-sh/setup-uv@v8.0.0
        with:
          enable-cache: false
          version: "0.9.9"
--- a/.greptile/config.json
+++ b/.greptile/config.json
@@ -1,64 +1,57 @@
 {
-    "labels": [],
-    "comment": "",
-    "fixWithAI": true,
-    "hideFooter": false,
-    "strictness": 3,
-    "statusCheck": true,
-    "commentTypes": [
-      "logic",
-      "syntax",
-      "style"
-    ],
-    "instructions": "",
-    "disabledLabels": [],
-    "excludeAuthors": [
-      "dependabot[bot]",
-      "renovate[bot]"
-    ],
-    "ignoreKeywords": "",
-    "ignorePatterns": "",
-    "includeAuthors": [],
-    "summarySection": {
-      "included": true,
-      "collapsible": false,
-      "defaultOpen": false
+  "labels": [],
+  "comment": "",
+  "fixWithAI": true,
+  "hideFooter": false,
+  "strictness": 3,
+  "statusCheck": true,
+  "commentTypes": ["logic", "syntax", "style"],
+  "instructions": "",
+  "disabledLabels": [],
+  "excludeAuthors": ["dependabot[bot]", "renovate[bot]"],
+  "ignoreKeywords": "",
+  "ignorePatterns": "",
+  "includeAuthors": [],
+  "summarySection": {
+    "included": true,
+    "collapsible": false,
+    "defaultOpen": false
+  },
+  "excludeBranches": [],
+  "fileChangeLimit": 300,
+  "includeBranches": [],
+  "includeKeywords": "",
+  "triggerOnUpdates": false,
+  "updateExistingSummaryComment": true,
+  "updateSummaryOnly": false,
+  "issuesTableSection": {
+    "included": true,
+    "collapsible": false,
+    "defaultOpen": false
+  },
+  "statusCommentsEnabled": true,
+  "confidenceScoreSection": {
+    "included": true,
+    "collapsible": false
+  },
+  "sequenceDiagramSection": {
+    "included": true,
+    "collapsible": false,
+    "defaultOpen": false
+  },
+  "shouldUpdateDescription": false,
+  "rules": [
+    {
+      "scope": ["web/**"],
+      "rule": "In Onyx's Next.js app, the `app/ee/admin/` directory is a filesystem convention for Enterprise Edition route overrides — it does NOT add an `/ee/` prefix to the URL. Both `app/admin/groups/page.tsx` and `app/ee/admin/groups/page.tsx` serve the same URL `/admin/groups`. Hardcoded `/admin/...` paths in router.push() calls are correct and do NOT break EE deployments. Do not flag hardcoded admin paths as bugs."
    },
-    "excludeBranches": [],
-    "fileChangeLimit": 300,
-    "includeBranches": [],
-    "includeKeywords": "",
-    "triggerOnUpdates": true,
-    "updateExistingSummaryComment": true,
-    "updateSummaryOnly": false,
-    "issuesTableSection": {
-      "included": true,
-      "collapsible": false,
-      "defaultOpen": false
+    {
+      "scope": ["web/**"],
+      "rule": "In Onyx, each API key creates a unique user row in the database with a unique `user_id` (UUID). There is a 1:1 mapping between API keys and their backing user records. Multiple API keys do NOT share the same `user_id`. Do not flag potential duplicate row IDs when using `user_id` from API key descriptors."
    },
-    "statusCommentsEnabled": true,
-    "confidenceScoreSection": {
-      "included": true,
-      "collapsible": false
-    },
-    "sequenceDiagramSection": {
-      "included": true,
-      "collapsible": false,
-      "defaultOpen": false
-    },
-    "shouldUpdateDescription": false,
-    "rules": [
-      {
-        "scope": ["web/**"],
-        "rule": "In Onyx's Next.js app, the `app/ee/admin/` directory is a filesystem convention for Enterprise Edition route overrides — it does NOT add an `/ee/` prefix to the URL. Both `app/admin/groups/page.tsx` and `app/ee/admin/groups/page.tsx` serve the same URL `/admin/groups`. Hardcoded `/admin/...` paths in router.push() calls are correct and do NOT break EE deployments. Do not flag hardcoded admin paths as bugs."
-      },
-      {
-        "scope": ["web/**"],
-        "rule": "In Onyx, each API key creates a unique user row in the database with a unique `user_id` (UUID). There is a 1:1 mapping between API keys and their backing user records. Multiple API keys do NOT share the same `user_id`. Do not flag potential duplicate row IDs when using `user_id` from API key descriptors."
-      },
-      {
-        "scope": ["backend/**/*.py"],
-        "rule": "Never raise HTTPException directly in business code. Use `raise OnyxError(OnyxErrorCode.XXX, \"message\")` from `onyx.error_handling.exceptions`. A global FastAPI exception handler converts OnyxError into structured JSON responses with {\"error_code\": \"...\", \"detail\": \"...\"}. Error codes are defined in `onyx.error_handling.error_codes.OnyxErrorCode`. For upstream errors with dynamic HTTP status codes, use `status_code_override`: `raise OnyxError(OnyxErrorCode.BAD_GATEWAY, detail, status_code_override=upstream_status)`."
-      }
-    ]
+    {
+      "scope": ["backend/**/*.py"],
+      "rule": "Never raise HTTPException directly in business code. Use `raise OnyxError(OnyxErrorCode.XXX, \"message\")` from `onyx.error_handling.exceptions`. A global FastAPI exception handler converts OnyxError into structured JSON responses with {\"error_code\": \"...\", \"detail\": \"...\"}. Error codes are defined in `onyx.error_handling.error_codes.OnyxErrorCode`. For upstream errors with dynamic HTTP status codes, use `status_code_override`: `raise OnyxError(OnyxErrorCode.BAD_GATEWAY, detail, status_code_override=upstream_status)`."
+    }
+  ]
 }
--- a/.pre-commit-config.yaml
+++ b/.pre-commit-config.yaml
@@ -67,12 +67,11 @@ repos:
        args: ["--active", "--with=onyx-devtools", "ods", "check-lazy-imports"]
        pass_filenames: true
        files: ^backend/(?!\.venv/|scripts/).*\.py$
-      # NOTE: This takes ~6s on a single, large module which is prohibitively slow.
-      # - id: uv-run
-      #   name: mypy
-      #   args: ["--all-extras", "mypy"]
-      #   pass_filenames: true
-      #   files: ^backend/.*\.py$
+      - id: uv-run
+        name: ty
+        args: ["ty", "check"]
+        pass_filenames: true
+        types_or: [python]

  - repo: https://github.com/pre-commit/pre-commit-hooks
    rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0
@@ -142,6 +141,7 @@ repos:
    hooks:
      - id: ripsecrets
        args:
+          - --strict-ignore
          - --additional-pattern
          - ^sk-[A-Za-z0-9_\-]{20,}$

--- a/.secretsignore
+++ b/.secretsignore
@@ -0,0 +1 @@
+.devcontainer/github_known_hosts
--- a/.vscode/launch.json
+++ b/.vscode/launch.json
@@ -475,6 +475,18 @@
        "order": 0
      }
    },
+    {
+      "name": "Start Monitoring Stack (Prometheus + Grafana)",
+      "type": "node",
+      "request": "launch",
+      "runtimeExecutable": "docker",
+      "runtimeArgs": ["compose", "up", "-d"],
+      "cwd": "${workspaceFolder}/profiling",
+      "console": "integratedTerminal",
+      "presentation": {
+        "group": "3"
+      }
+    },
    {
      "name": "Clear and Restart External Volumes and Containers",
      "type": "node",
--- a/AGENTS.md
+++ b/AGENTS.md
@@ -1,55 +1,361 @@
-# Project Knowledge Base
+# PROJECT KNOWLEDGE BASE

-This file is the entrypoint for agents working in this repository. Keep it small.
+This file provides guidance to AI agents when working with code in this repository.

-## Start Here
+## KEY NOTES

- General development workflow and repo conventions: [CONTRIBUTING.md](./CONTRIBUTING.md)
- Frontend standards for `web/` and `desktop/`: [web/AGENTS.md](./web/AGENTS.md)
- Backend testing strategy and commands: [backend/tests/README.md](./backend/tests/README.md)
- Celery worker and task guidance: [backend/onyx/background/celery/README.md](./backend/onyx/background/celery/README.md)
- Backend API error-handling rules: [backend/onyx/error_handling/README.md](./backend/onyx/error_handling/README.md)
- Plan-writing guidance: [plans/README.md](./plans/README.md)
+- If you run into any missing python dependency errors, try running your command with `source .venv/bin/activate` \
+  to assume the python venv.
+- To make tests work, check the `.env` file at the root of the project to find an OpenAI key.
+- If using `playwright` to explore the frontend, you can usually log in with username `a@example.com` and password
+  `a`. The app can be accessed at `http://localhost:3000`.
+- You should assume that all Onyx services are running. To verify, you can check the `backend/log` directory to
+  make sure we see logs coming out from the relevant service.
+- To connect to the Postgres database, use: `docker exec -it onyx-relational_db-1 psql -U postgres -c "<SQL>"`
+- When making calls to the backend, always go through the frontend. E.g. make a call to `http://localhost:3000/api/persona` not `http://localhost:8080/api/persona`
+- Put ALL db operations under the `backend/onyx/db` / `backend/ee/onyx/db` directories. Don't run queries
+  outside of those directories.

-## Agent-Lab Docs
+## Project Overview

-When working on `agent-lab` or on tasks explicitly about agent-engineering, use:
+**Onyx** (formerly Danswer) is an open-source Gen-AI and Enterprise Search platform that connects to company documents, apps, and people. It features a modular architecture with both Community Edition (MIT licensed) and Enterprise Edition offerings.

- [docs/agent/README.md](./docs/agent/README.md)
+### Background Workers (Celery)

-These docs are the system of record for the `agent-lab` workflow.
+Onyx uses Celery for asynchronous task processing with multiple specialized workers:

-## Universal Notes
+#### Worker Types

- For non-trivial work, create the target worktree first and keep the edit, test, and PR loop
-  inside that worktree. Do not prototype in one checkout and copy the patch into another unless
-  you are explicitly debugging the harness itself.
- Use `ods worktree create` for harness-managed worktrees. Do not use raw `git worktree add` when
-  you want the `agent-lab` workflow, because it will skip the manifest, env overlays, dependency
-  bootstrap, and lane-aware base-ref selection.
- When a change needs browser proof, use the harness journey flow instead of ad hoc screen capture:
-  record `before` in the target worktree before making the change, then record `after` in that
-  same worktree after validation. Use `ods journey compare` only when you need to recover a missed
-  baseline or compare two explicit revisions after the fact.
- After opening a PR, treat review feedback and failing checks as part of the same loop:
-  use `ods pr-review ...` for GitHub review threads and `ods pr-checks diagnose` plus `ods trace`
-  for failing Playwright runs.
- PR titles and commit messages should use conventional-commit style such as `fix: ...` or
-  `feat: ...`. Do not use `[codex]` prefixes in this repo.
- If Python dependencies appear missing, activate the root venv with `source .venv/bin/activate`.
- To make tests work, check the root `.env` file for an OpenAI key.
- If using Playwright to explore the frontend, you can usually log in with username `a@example.com`
-  and password `a` at `http://localhost:3000`.
- Assume Onyx services are already running unless the task indicates otherwise. Check `backend/log`
-  if you need to verify service activity.
- When making backend calls in local development flows, go through the frontend proxy:
-  `http://localhost:3000/api/...`, not `http://localhost:8080/...`.
- Put DB operations under `backend/onyx/db/` or `backend/ee/onyx/db/`. Do not add ad hoc DB access
-  elsewhere.
+1. **Primary Worker** (`celery_app.py`)
+   - Coordinates core background tasks and system-wide operations
+   - Handles connector management, document sync, pruning, and periodic checks
+   - Runs with 4 threads concurrency
+   - Tasks: connector deletion, vespa sync, pruning, LLM model updates, user file sync

-## How To Use This File
+2. **Docfetching Worker** (`docfetching`)
+   - Fetches documents from external data sources (connectors)
+   - Spawns docprocessing tasks for each document batch
+   - Implements watchdog monitoring for stuck connectors
+   - Configurable concurrency (default from env)

- Use this file as a map, not a manual.
- Follow the nearest authoritative doc for the subsystem you are changing.
- If a repeated rule matters enough to teach every future agent, document it near the code it
-  governs or encode it mechanically.
+3. **Docprocessing Worker** (`docprocessing`)
+   - Processes fetched documents through the indexing pipeline:
+     - Upserts documents to PostgreSQL
+     - Chunks documents and adds contextual information
+     - Embeds chunks via model server
+     - Writes chunks to Vespa vector database
+     - Updates document metadata
+   - Configurable concurrency (default from env)
+
+4. **Light Worker** (`light`)
+   - Handles lightweight, fast operations
+   - Tasks: vespa metadata sync, connector deletion, doc permissions upsert, checkpoint cleanup, index attempt cleanup
+   - Higher concurrency for quick tasks
+
+5. **Heavy Worker** (`heavy`)
+   - Handles resource-intensive operations
+   - Tasks: connector pruning, document permissions sync, external group sync, CSV generation
+   - Runs with 4 threads concurrency
+
+6. **KG Processing Worker** (`kg_processing`)
+   - Handles Knowledge Graph processing and clustering
+   - Builds relationships between documents
+   - Runs clustering algorithms
+   - Configurable concurrency
+
+7. **Monitoring Worker** (`monitoring`)
+   - System health monitoring and metrics collection
+   - Monitors Celery queues, process memory, and system status
+   - Single thread (monitoring doesn't need parallelism)
+   - Cloud-specific monitoring tasks
+
+8. **User File Processing Worker** (`user_file_processing`)
+   - Processes user-uploaded files
+   - Handles user file indexing and project synchronization
+   - Configurable concurrency
+
+9. **Beat Worker** (`beat`)
+   - Celery's scheduler for periodic tasks
+   - Uses DynamicTenantScheduler for multi-tenant support
+   - Schedules tasks like:
+     - Indexing checks (every 15 seconds)
+     - Connector deletion checks (every 20 seconds)
+     - Vespa sync checks (every 20 seconds)
+     - Pruning checks (every 20 seconds)
+     - KG processing (every 60 seconds)
+     - Monitoring tasks (every 5 minutes)
+     - Cleanup tasks (hourly)
+
+#### Key Features
+
+- **Thread-based Workers**: All workers use thread pools (not processes) for stability
+- **Tenant Awareness**: Multi-tenant support with per-tenant task isolation. There is a
+  middleware layer that automatically finds the appropriate tenant ID when sending tasks
+  via Celery Beat.
+- **Task Prioritization**: High, Medium, Low priority queues
+- **Monitoring**: Built-in heartbeat and liveness checking
+- **Failure Handling**: Automatic retry and failure recovery mechanisms
+- **Redis Coordination**: Inter-process communication via Redis
+- **PostgreSQL State**: Task state and metadata stored in PostgreSQL
+
+#### Important Notes
+
+**Defining Tasks**:
+
+- Always use `@shared_task` rather than `@celery_app`
+- Put tasks under `background/celery/tasks/` or `ee/background/celery/tasks`
+- Never enqueue a task without an expiration. Always supply `expires=` when
+  sending tasks, either from the beat schedule or directly from another task. It
+  should never be acceptable to submit code which enqueues tasks without an
+  expiration, as doing so can lead to unbounded task queue growth.
+
+**Defining APIs**:
+When creating new FastAPI APIs, do NOT use the `response_model` field. Instead, just type the
+function.
+
+**Testing Updates**:
+If you make any updates to a celery worker and you want to test these changes, you will need
+to ask me to restart the celery worker. There is no auto-restart on code-change mechanism.
+
+**Task Time Limits**:
+Since all tasks are executed in thread pools, the time limit features of Celery are silently 
+disabled and won't work. Timeout logic must be implemented within the task itself.
+
+### Code Quality
+
+```bash
+# Install and run pre-commit hooks
+pre-commit install
+pre-commit run --all-files
+```
+
+NOTE: Always make sure everything is strictly typed (both in Python and Typescript).
+
+## Architecture Overview
+
+### Technology Stack
+
+- **Backend**: Python 3.11, FastAPI, SQLAlchemy, Alembic, Celery
+- **Frontend**: Next.js 15+, React 18, TypeScript, Tailwind CSS
+- **Database**: PostgreSQL with Redis caching
+- **Search**: Vespa vector database
+- **Auth**: OAuth2, SAML, multi-provider support
+- **AI/ML**: LangChain, LiteLLM, multiple embedding models
+
+### Directory Structure
+
+```
+backend/
+├── onyx/
+│   ├── auth/                    # Authentication & authorization
+│   ├── chat/                    # Chat functionality & LLM interactions
+│   ├── connectors/              # Data source connectors
+│   ├── db/                      # Database models & operations
+│   ├── document_index/          # Vespa integration
+│   ├── federated_connectors/    # External search connectors
+│   ├── llm/                     # LLM provider integrations
+│   └── server/                  # API endpoints & routers
+├── ee/                          # Enterprise Edition features
+├── alembic/                     # Database migrations
+└── tests/                       # Test suites
+
+web/
+├── src/app/                     # Next.js app router pages
+├── src/components/              # Reusable React components
+└── src/lib/                     # Utilities & business logic
+```
+
+## Frontend Standards
+
+Frontend standards for the `web/` and `desktop/` projects live in `web/AGENTS.md`.
+
+## Database & Migrations
+
+### Running Migrations
+
+```bash
+# Standard migrations
+alembic upgrade head
+
+# Multi-tenant (Enterprise)
+alembic -n schema_private upgrade head
+```
+
+### Creating Migrations
+
+```bash
+# Create migration
+alembic revision -m "description"
+
+# Multi-tenant migration
+alembic -n schema_private revision -m "description"
+```
+
+Write the migration manually and place it in the file that alembic creates when running the above command.
+
+## Testing Strategy
+
+First, you must activate the virtual environment with `source .venv/bin/activate`.
+
+There are 4 main types of tests within Onyx:
+
+### Unit Tests
+
+These should not assume any Onyx/external services are available to be called.
+Interactions with the outside world should be mocked using `unittest.mock`. Generally, only
+write these for complex, isolated modules e.g. `citation_processing.py`.
+
+To run them:
+
+```bash
+pytest -xv backend/tests/unit
+```
+
+### External Dependency Unit Tests
+
+These tests assume that all external dependencies of Onyx are available and callable (e.g. Postgres, Redis,
+MinIO/S3, Vespa are running + OpenAI can be called + any request to the internet is fine + etc.).
+
+However, the actual Onyx containers are not running and with these tests we call the function to test directly.
+We can also mock components/calls at will.
+
+The goal with these tests are to minimize mocking while giving some flexibility to mock things that are flakey,
+need strictly controlled behavior, or need to have their internal behavior validated (e.g. verify a function is called
+with certain args, something that would be impossible with proper integration tests).
+
+A great example of this type of test is `backend/tests/external_dependency_unit/connectors/confluence/test_confluence_group_sync.py`.
+
+To run them:
+
+```bash
+python -m dotenv -f .vscode/.env run -- pytest backend/tests/external_dependency_unit
+```
+
+### Integration Tests
+
+Standard integration tests. Every test in `backend/tests/integration` runs against a real Onyx deployment. We cannot
+mock anything in these tests. Prefer writing integration tests (or External Dependency Unit Tests if mocking/internal
+verification is necessary) over any other type of test.
+
+Tests are parallelized at a directory level.
+
+When writing integration tests, make sure to check the root `conftest.py` for useful fixtures + the `backend/tests/integration/common_utils` directory for utilities. Prefer (if one exists), calling the appropriate Manager
+class in the utils over directly calling the APIs with a library like `requests`. Prefer using fixtures rather than
+calling the utilities directly (e.g. do NOT create admin users with
+`admin_user = UserManager.create(name="admin_user")`, instead use the `admin_user` fixture).
+
+A great example of this type of test is `backend/tests/integration/tests/streaming_endpoints/test_chat_stream.py`.
+
+To run them:
+
+```bash
+python -m dotenv -f .vscode/.env run -- pytest backend/tests/integration
+```
+
+### Playwright (E2E) Tests
+
+These tests are an even more complete version of the Integration Tests mentioned above. Has all services of Onyx
+running, _including_ the Web Server.
+
+Use these tests for anything that requires significant frontend <-> backend coordination.
+
+Tests are located at `web/tests/e2e`. Tests are written in TypeScript.
+
+To run them:
+
+```bash
+npx playwright test <TEST_NAME>
+```
+
+For shared fixtures, best practices, and detailed guidance, see `backend/tests/README.md`.
+
+## Logs
+
+When (1) writing integration tests or (2) doing live tests (e.g. curl / playwright) you can get access
+to logs via the `backend/log/<service_name>_debug.log` file. All Onyx services (api_server, web_server, celery_X)
+will be tailing their logs to this file.
+
+## Security Considerations
+
+- Never commit API keys or secrets to repository
+- Use encrypted credential storage for connector credentials
+- Follow RBAC patterns for new features
+- Implement proper input validation with Pydantic models
+- Use parameterized queries to prevent SQL injection
+
+## AI/LLM Integration
+
+- Multiple LLM providers supported via LiteLLM
+- Configurable models per feature (chat, search, embeddings)
+- Streaming support for real-time responses
+- Token management and rate limiting
+- Custom prompts and agent actions
+
+## Creating a Plan
+
+When creating a plan in the `plans` directory, make sure to include at least these elements:
+
+**Issues to Address**
+What the change is meant to do.
+
+**Important Notes**
+Things you come across in your research that are important to the implementation.
+
+**Implementation strategy**
+How you are going to make the changes happen. High level approach.
+
+**Tests**
+What unit (use rarely), external dependency unit, integration, and playwright tests you plan to write to
+verify the correct behavior. Don't overtest. Usually, a given change only needs one type of test.
+
+Do NOT include these: _Timeline_, _Rollback plan_
+
+This is a minimal list - feel free to include more. Do NOT write code as part of your plan.
+Keep it high level. You can reference certain files or functions though.
+
+Before writing your plan, make sure to do research. Explore the relevant sections in the codebase.
+
+## Error Handling
+
+**Always raise `OnyxError` from `onyx.error_handling.exceptions` instead of `HTTPException`.
+Never hardcode status codes or use `starlette.status` / `fastapi.status` constants directly.**
+
+A global FastAPI exception handler converts `OnyxError` into a JSON response with the standard
+`{"error_code": "...", "detail": "..."}` shape. This eliminates boilerplate and keeps error
+handling consistent across the entire backend.
+
+```python
+from onyx.error_handling.error_codes import OnyxErrorCode
+from onyx.error_handling.exceptions import OnyxError
+
+# ✅ Good
+raise OnyxError(OnyxErrorCode.NOT_FOUND, "Session not found")
+
+# ✅ Good — no extra message needed
+raise OnyxError(OnyxErrorCode.UNAUTHENTICATED)
+
+# ✅ Good — upstream service with dynamic status code
+raise OnyxError(OnyxErrorCode.BAD_GATEWAY, detail, status_code_override=upstream_status)
+
+# ❌ Bad — using HTTPException directly
+raise HTTPException(status_code=404, detail="Session not found")
+
+# ❌ Bad — starlette constant
+raise HTTPException(status_code=status.HTTP_403_FORBIDDEN, detail="Access denied")
+```
+
+Available error codes are defined in `backend/onyx/error_handling/error_codes.py`. If a new error
+category is needed, add it there first — do not invent ad-hoc codes.
+
+**Upstream service errors:** When forwarding errors from an upstream service where the HTTP
+status code is dynamic (comes from the upstream response), use `status_code_override`:
+
+```python
+raise OnyxError(OnyxErrorCode.BAD_GATEWAY, detail, status_code_override=e.response.status_code)
+```
+
+## Best Practices
+
+In addition to the other content in this file, best practices for contributing
+to the codebase can be found in the "Engineering Best Practices" section of
+`CONTRIBUTING.md`. Understand its contents and follow them.
--- a/CONTRIBUTING.md
+++ b/CONTRIBUTING.md
@@ -63,11 +63,13 @@ Your features must pass all tests and all comments must be addressed prior to me
 ### Implicit agreements

 If we approve an issue, we are promising you the following:
+
 - Your work will receive timely attention and we will put aside other important items to ensure you are not blocked.
 - You will receive necessary coaching on eng quality, system design, etc. to ensure the feature is completed well.
 - The Onyx team will pull resources and bandwidth from design, PM, and engineering to ensure that you have all the resources to build the feature to the quality required for merging.

 Because this is a large investment from our team, we ask that you:
+
 - Thoroughly read all the requirements of the design docs, engineering best practices, and try to minimize overhead for the Onyx team.
 - Complete the feature in a timely manner to reduce context switching and an ongoing resource pull from the Onyx team.

@@ -149,10 +151,10 @@ Set up pre-commit hooks (black / reorder-python-imports):
 uv run pre-commit install
 ```

-We also use `mypy` for static type checking. Onyx is fully type-annotated, and we want to keep it that way! To run the mypy checks manually:
+We also use `ty` for static type checking. Onyx is fully type-annotated, and we want to keep it that way! To run the ty checks manually:

 ```bash
-uv run mypy .  # from onyx/backend
+uv run ty check
 ```

 #### Frontend
@@ -192,6 +194,7 @@ Before starting, make sure the Docker Daemon is running.
 > **Note:** "Clear and Restart External Volumes and Containers" will reset your Postgres and OpenSearch (relational-db and index). Only run this if you are okay with wiping your data.

 **Features:**
+
 - Hot reload is enabled for the web server and API servers
 - Python debugging is configured with debugpy
 - Environment variables are loaded from `.vscode/.env`
@@ -344,13 +347,16 @@ sudo xattr -r -d com.apple.quarantine ~/.cache/pre-commit
 ### Style and Maintainability

 #### Comments and readability
+
 Add clear comments:
+
 - At logical boundaries (e.g., interfaces) so the reader doesn't need to dig 10 layers deeper.
 - Wherever assumptions are made or something non-obvious/unexpected is done.
 - For complicated flows/functions.
 - Wherever it saves time (e.g., nontrivial regex patterns).

 #### Errors and exceptions
+
 - **Fail loudly** rather than silently skipping work.
  - Example: raise and let exceptions propagate instead of silently dropping a document.
 - **Don't overuse `try/except`.**
@@ -358,6 +364,7 @@ Add clear comments:
  - Do not mask exceptions unless it is clearly appropriate.

 #### Typing
+
 - Everything should be **as strictly typed as possible**.
 - Use `cast` for annoying/loose-typed interfaces (e.g., results of `run_functions_tuples_in_parallel`).
  - Only `cast` when the type checker sees `Any` or types are too loose.
@@ -368,6 +375,7 @@ Add clear comments:
    - `dict[EmbeddingModel, list[EmbeddingVector]]`

 #### State, objects, and boundaries
+
 - Keep **clear logical boundaries** for state containers and objects.
 - A **config** object should never contain things like a `db_session`.
 - Avoid state containers that are overly nested, or huge + flat (use judgment).
@@ -380,6 +388,7 @@ Add clear comments:
  - Prefer **hash maps (dicts)** over tree structures unless there's a strong reason.

 #### Naming
+
 - Name variables carefully and intentionally.
 - Prefer long, explicit names when undecided.
 - Avoid single-character variables except for small, self-contained utilities (or not at all).
@@ -390,6 +399,7 @@ Add clear comments:
  - IntelliSense can miss call sites; search works best with unique names.

 #### Correctness by construction
+
 - Prefer self-contained correctness — don't rely on callers to "use it right" if you can make misuse hard.
 - Avoid redundancies: if a function takes an arg, it shouldn't also take a state object that contains that same arg.
 - No dead code (unless there's a very good reason).
@@ -417,29 +427,35 @@ Add clear comments:
 ### Repository Conventions

 #### Where code lives
+
 - Pydantic + data models: `models.py` files.
 - DB interface functions (excluding lazy loading): `db/` directory.
 - LLM prompts: `prompts/` directory, roughly mirroring the code layout that uses them.
 - API routes: `server/` directory.

 #### Pydantic and modeling
+
 - Prefer **Pydantic** over dataclasses.
 - If absolutely required, use `allow_arbitrary_types`.

 #### Data conventions
+
 - Prefer explicit `None` over sentinel empty strings (usually; depends on intent).
 - Prefer explicit identifiers: use string enums instead of integer codes.
 - Avoid magic numbers (co-location is good when necessary). **Always avoid magic strings.**

 #### Logging
+
 - Log messages where they are created.
 - Don't propagate log messages around just to log them elsewhere.

 #### Encapsulation
+
 - Don't use private attributes/methods/properties from other classes/modules.
 - "Private" is private — respect that boundary.

 #### SQLAlchemy guidance
+
 - Lazy loading is often bad at scale, especially across multiple list relationships.
 - Be careful when accessing SQLAlchemy object attributes:
  - It can help avoid redundant DB queries,
@@ -448,6 +464,7 @@ Add clear comments:
 - Reference: https://www.reddit.com/r/SQLAlchemy/comments/138f248/joinedload_vs_selectinload/

 #### Trunk-based development and feature flags
+
 - **PRs should contain no more than 500 lines of real change.**
 - **Merge to main frequently.** Avoid long-lived feature branches — they create merge conflicts and integration pain.
 - **Use feature flags for incremental rollout.**
@@ -458,6 +475,7 @@ Add clear comments:
 - **Test both flag states.** Ensure the codebase works correctly with the flag on and off.

 #### Miscellaneous
+
 - Any TODOs you add in the code must be accompanied by either the name/username of the owner of that TODO, or an issue number for an issue referencing that piece of work.
 - Avoid module-level logic that runs on import, which leads to import-time side effects. Essentially every piece of meaningful logic should exist within some function that has to be explicitly invoked. Acceptable exceptions may include loading environment variables or setting up loggers.
  - If you find yourself needing something like this, you may want that logic to exist in a file dedicated for manual execution (contains `if __name__ == "__main__":`) which should not be imported by anything else.
--- a/backend/Dockerfile
+++ b/backend/Dockerfile
@@ -1,4 +1,4 @@
-FROM python:3.11.7-slim-bookworm
+FROM python:3.11-slim-bookworm@sha256:9c6f90801e6b68e772b7c0ca74260cbf7af9f320acec894e26fccdaccfbe3b47

 LABEL com.danswer.maintainer="founders@onyx.app"
 LABEL com.danswer.description="This image is the web/frontend container of Onyx which \
--- a/backend/Dockerfile.model_server
+++ b/backend/Dockerfile.model_server
@@ -1,5 +1,5 @@
 # Base stage with dependencies
-FROM python:3.11.7-slim-bookworm AS base
+FROM python:3.11-slim-bookworm@sha256:9c6f90801e6b68e772b7c0ca74260cbf7af9f320acec894e26fccdaccfbe3b47 AS base

 ENV DANSWER_RUNNING_IN_DOCKER="true" \
    HF_HOME=/app/.cache/huggingface
@@ -50,6 +50,10 @@ COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py
 COPY ./onyx/utils/middleware.py /app/onyx/utils/middleware.py
 COPY ./onyx/utils/tenant.py /app/onyx/utils/tenant.py

+# Sentry configuration (used when SENTRY_DSN is set)
+COPY ./onyx/configs/__init__.py /app/onyx/configs/__init__.py
+COPY ./onyx/configs/sentry.py /app/onyx/configs/sentry.py
+
 # Place to fetch version information
 COPY ./onyx/__init__.py /app/onyx/__init__.py

--- a/backend/alembic/env.py
+++ b/backend/alembic/env.py
@@ -26,7 +26,9 @@ from shared_configs.configs import (
    TENANT_ID_PREFIX,
 )
 from onyx.db.models import Base
-from celery.backends.database.session import ResultModelBase  # type: ignore
+from celery.backends.database.session import (  # ty: ignore[unresolved-import]
+    ResultModelBase,
+)
 from onyx.db.engine.sql_engine import SqlEngine

 # Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
@@ -208,7 +210,7 @@ def do_run_migrations(

    context.configure(
        connection=connection,
-        target_metadata=target_metadata,  # type: ignore
+        target_metadata=target_metadata,
        version_table_schema=schema_name,
        include_schemas=True,
        compare_type=True,
@@ -380,7 +382,7 @@ def run_migrations_offline() -> None:
            logger.info(f"Migrating schema: {schema}")
            context.configure(
                url=url,
-                target_metadata=target_metadata,  # type: ignore
+                target_metadata=target_metadata,
                literal_binds=True,
                version_table_schema=schema,
                include_schemas=True,
@@ -421,7 +423,7 @@ def run_migrations_offline() -> None:
            logger.info(f"Migrating schema: {schema}")
            context.configure(
                url=url,
-                target_metadata=target_metadata,  # type: ignore
+                target_metadata=target_metadata,
                literal_binds=True,
                version_table_schema=schema,
                include_schemas=True,
@@ -464,7 +466,7 @@ def run_migrations_online() -> None:

            context.configure(
                connection=connection,
-                target_metadata=target_metadata,  # type: ignore
+                target_metadata=target_metadata,
                version_table_schema=schema_name,
                include_schemas=True,
                compare_type=True,
--- a/backend/alembic/versions/351faebd379d_add_curator_fields.py
+++ b/backend/alembic/versions/351faebd379d_add_curator_fields.py
@@ -25,7 +25,7 @@ def upgrade() -> None:

    # Use batch mode to modify the enum type
    with op.batch_alter_table("user", schema=None) as batch_op:
-        batch_op.alter_column(  # type: ignore[attr-defined]
+        batch_op.alter_column(
            "role",
            type_=sa.Enum(
                "BASIC",
@@ -71,7 +71,7 @@ def downgrade() -> None:
    op.drop_column("user__user_group", "is_curator")

    with op.batch_alter_table("user", schema=None) as batch_op:
-        batch_op.alter_column(  # type: ignore[attr-defined]
+        batch_op.alter_column(
            "role",
            type_=sa.Enum(
                "BASIC", "ADMIN", name="userrole", native_enum=False, length=20
--- a/backend/alembic/versions/47433d30de82_create_indexattempt_table.py
+++ b/backend/alembic/versions/47433d30de82_create_indexattempt_table.py
@@ -49,7 +49,7 @@ def upgrade() -> None:
            "time_updated",
            sa.DateTime(timezone=True),
            server_default=sa.text("now()"),
-            server_onupdate=sa.text("now()"),  # type: ignore
+            server_onupdate=sa.text("now()"),  # ty: ignore[invalid-argument-type]
            nullable=True,
        ),
        sa.Column(
--- a/backend/alembic/versions/4f8a2b3c1d9e_add_open_url_tool.py
+++ b/backend/alembic/versions/4f8a2b3c1d9e_add_open_url_tool.py
@@ -68,7 +68,7 @@ def upgrade() -> None:
            sa.text("SELECT id FROM tool WHERE in_code_tool_id = :in_code_tool_id"),
            {"in_code_tool_id": OPEN_URL_TOOL["in_code_tool_id"]},
        ).fetchone()
-        tool_id = result[0]  # type: ignore
+        tool_id = result[0]  # ty: ignore[not-subscriptable]

    # Associate the tool with all existing personas
    # Get all persona IDs
--- a/backend/alembic/versions/6d387b3196c2_basic_auth.py
+++ b/backend/alembic/versions/6d387b3196c2_basic_auth.py
@@ -63,7 +63,7 @@ def upgrade() -> None:
        "time_created",
        existing_type=postgresql.TIMESTAMP(timezone=True),
        nullable=False,
-        existing_server_default=sa.text("now()"),  # type: ignore
+        existing_server_default=sa.text("now()"),
    )
    op.alter_column(
        "index_attempt",
@@ -85,7 +85,7 @@ def downgrade() -> None:
        "time_created",
        existing_type=postgresql.TIMESTAMP(timezone=True),
        nullable=True,
-        existing_server_default=sa.text("now()"),  # type: ignore
+        existing_server_default=sa.text("now()"),
    )
    op.drop_index(op.f("ix_accesstoken_created_at"), table_name="accesstoken")
    op.drop_table("accesstoken")
--- a/backend/alembic/versions/800f48024ae9_add_id_to_connectorcredentialpair.py
+++ b/backend/alembic/versions/800f48024ae9_add_id_to_connectorcredentialpair.py
@@ -19,7 +19,7 @@ depends_on: None = None

 def upgrade() -> None:
    sequence = Sequence("connector_credential_pair_id_seq")
-    op.execute(CreateSequence(sequence))  # type: ignore
+    op.execute(CreateSequence(sequence))
    op.add_column(
        "connector_credential_pair",
        sa.Column(
--- a/backend/alembic/versions/91d150c361f6_add_file_id_to_documents.py
+++ b/backend/alembic/versions/91d150c361f6_add_file_id_to_documents.py
@@ -0,0 +1,27 @@
+"""Add file_id to documents
+
+Revision ID: 91d150c361f6
+Revises: d129f37b3d87
+Create Date: 2026-04-16 15:43:30.314823
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+# revision identifiers, used by Alembic.
+revision = "91d150c361f6"
+down_revision = "d129f37b3d87"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "document",
+        sa.Column("file_id", sa.String(), nullable=True),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("document", "file_id")
--- a/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
+++ b/backend/alembic/versions/9aadf32dfeb4_add_user_files.py
@@ -52,7 +52,7 @@ def upgrade() -> None:
        sa.Column(
            "created_at",
            sa.DateTime(),
-            default=datetime.datetime.utcnow,
+            default=lambda: datetime.datetime.now(datetime.timezone.utc),
        ),
        sa.Column(
            "cc_pair_id",
--- a/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
+++ b/backend/alembic/versions/c9e2cd766c29_add_s3_file_store_table.py
@@ -10,7 +10,7 @@ from alembic import op
 import sqlalchemy as sa
 from sqlalchemy.orm import Session
 from sqlalchemy import text
-from typing import cast, Any
+from typing import cast

 from botocore.exceptions import ClientError

@@ -255,7 +255,7 @@ def _migrate_files_to_external_storage() -> None:
            continue

        lobj_id = cast(int, file_record.lobj_oid)
-        file_metadata = cast(Any, file_record.file_metadata)
+        file_metadata = file_record.file_metadata

        # Read file content from PostgreSQL
        try:
--- a/backend/alembic/versions/d129f37b3d87_add_error_tracking_fields_to_index_.py
+++ b/backend/alembic/versions/d129f37b3d87_add_error_tracking_fields_to_index_.py
@@ -0,0 +1,28 @@
+"""add_error_tracking_fields_to_index_attempt_errors
+
+Revision ID: d129f37b3d87
+Revises: 503883791c39
+Create Date: 2026-04-06 19:11:18.261800
+
+"""
+
+from alembic import op
+import sqlalchemy as sa
+
+
+# revision identifiers, used by Alembic.
+revision = "d129f37b3d87"
+down_revision = "503883791c39"
+branch_labels = None
+depends_on = None
+
+
+def upgrade() -> None:
+    op.add_column(
+        "index_attempt_errors",
+        sa.Column("error_type", sa.String(), nullable=True),
+    )
+
+
+def downgrade() -> None:
+    op.drop_column("index_attempt_errors", "error_type")
--- a/backend/alembic_tenants/env.py
+++ b/backend/alembic_tenants/env.py
@@ -49,7 +49,7 @@ def run_migrations_offline() -> None:
    url = build_connection_string()
    context.configure(
        url=url,
-        target_metadata=target_metadata,  # type: ignore
+        target_metadata=target_metadata,
        literal_binds=True,
        dialect_opts={"paramstyle": "named"},
    )
@@ -61,7 +61,7 @@ def run_migrations_offline() -> None:
 def do_run_migrations(connection: Connection) -> None:
    context.configure(
        connection=connection,
-        target_metadata=target_metadata,  # type: ignore[arg-type]
+        target_metadata=target_metadata,
    )

    with context.begin_transaction():
--- a/backend/ee/onyx/access/access.py
+++ b/backend/ee/onyx/access/access.py
@@ -112,7 +112,7 @@ def _get_access_for_documents(
        access_map[document_id] = DocumentAccess.build(
            user_emails=list(non_ee_access.user_emails),
            user_groups=user_group_info.get(document_id, []),
-            is_public=is_public_anywhere,
+            is_public=is_public_anywhere,  # ty: ignore[invalid-argument-type]
            external_user_emails=list(ext_u_emails),
            external_user_group_ids=list(ext_u_groups),
        )
--- a/backend/ee/onyx/auth/users.py
+++ b/backend/ee/onyx/auth/users.py
@@ -1,5 +1,6 @@
 import os
 from datetime import datetime
+from datetime import timezone

 import jwt
 from fastapi import Depends
@@ -58,7 +59,7 @@ def generate_anonymous_user_jwt_token(tenant_id: str) -> str:
    payload = {
        "tenant_id": tenant_id,
        # Token does not expire
-        "iat": datetime.utcnow(),  # Issued at time
+        "iat": datetime.now(timezone.utc),  # Issued at time
    }

    return jwt.encode(payload, USER_AUTH_SECRET, algorithm="HS256")
--- a/backend/ee/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/doc_permission_syncing/tasks.py
@@ -80,6 +80,7 @@ from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSyn
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import get_redis_replica_client
 from onyx.redis.redis_pool import redis_lock_dump
+from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
 from onyx.server.runtime.onyx_runtime import OnyxRuntime
 from onyx.server.utils import make_short_id
 from onyx.utils.logger import doc_permission_sync_ctx
@@ -208,6 +209,11 @@ def check_for_doc_permissions_sync(self: Task, *, tenant_id: str) -> bool | None
                if _is_external_doc_permissions_sync_due(cc_pair):
                    cc_pair_ids_to_sync.append(cc_pair.id)

+        # Tenant-work-gating hook: refresh this tenant's active-set membership
+        # whenever doc-permission sync has any due cc_pairs to dispatch.
+        if cc_pair_ids_to_sync:
+            maybe_mark_tenant_active(tenant_id)
+
        lock_beat.reacquire()
        for cc_pair_id in cc_pair_ids_to_sync:
            payload_id = try_creating_permissions_sync_task(
--- a/backend/ee/onyx/background/celery/tasks/external_group_syncing/tasks.py
+++ b/backend/ee/onyx/background/celery/tasks/external_group_syncing/tasks.py
@@ -69,6 +69,7 @@ from onyx.redis.redis_connector_ext_group_sync import (
 )
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import get_redis_replica_client
+from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
 from onyx.server.runtime.onyx_runtime import OnyxRuntime
 from onyx.server.utils import make_short_id
 from onyx.utils.logger import format_error_for_logging
@@ -202,6 +203,11 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
                if _is_external_group_sync_due(cc_pair):
                    cc_pair_ids_to_sync.append(cc_pair.id)

+        # Tenant-work-gating hook: refresh this tenant's active-set membership
+        # whenever external-group sync has any due cc_pairs to dispatch.
+        if cc_pair_ids_to_sync:
+            maybe_mark_tenant_active(tenant_id)
+
        lock_beat.reacquire()
        for cc_pair_id in cc_pair_ids_to_sync:
            payload_id = try_creating_external_group_sync_task(
--- a/backend/ee/onyx/db/analytics.py
+++ b/backend/ee/onyx/db/analytics.py
@@ -53,7 +53,7 @@ def fetch_query_analytics(
        .order_by(cast(ChatMessage.time_sent, Date))
    )

-    return db_session.execute(stmt).all()  # type: ignore
+    return db_session.execute(stmt).all()  # ty: ignore[invalid-return-type]


 def fetch_per_user_query_analytics(
@@ -92,7 +92,7 @@ def fetch_per_user_query_analytics(
        .order_by(cast(ChatMessage.time_sent, Date), ChatSession.user_id)
    )

-    return db_session.execute(stmt).all()  # type: ignore
+    return db_session.execute(stmt).all()  # ty: ignore[invalid-return-type]


 def fetch_onyxbot_analytics(
--- a/backend/ee/onyx/db/connector.py
+++ b/backend/ee/onyx/db/connector.py
@@ -9,7 +9,7 @@ logger = setup_logger()


 def fetch_sources_with_connectors(db_session: Session) -> list[DocumentSource]:
-    sources = db_session.query(distinct(Connector.source)).all()  # type: ignore
+    sources = db_session.query(distinct(Connector.source)).all()

    document_sources = [source[0] for source in sources]

--- a/backend/ee/onyx/db/license.py
+++ b/backend/ee/onyx/db/license.py
@@ -13,6 +13,7 @@ from ee.onyx.server.license.models import LicenseSource
 from onyx.auth.schemas import UserRole
 from onyx.cache.factory import get_cache_backend
 from onyx.configs.constants import ANONYMOUS_USER_EMAIL
+from onyx.db.enums import AccountType
 from onyx.db.models import License
 from onyx.db.models import User
 from onyx.utils.logger import setup_logger
@@ -107,12 +108,13 @@ def get_used_seats(tenant_id: str | None = None) -> int:
    Get current seat usage directly from database.

    For multi-tenant: counts users in UserTenantMapping for this tenant.
-    For self-hosted: counts all active users (excludes EXT_PERM_USER role
-    and the anonymous system user).
+    For self-hosted: counts all active users.

-    TODO: Exclude API key dummy users from seat counting. API keys create
-    users with emails like `__DANSWER_API_KEY_*` that should not count toward
-    seat limits. See: https://linear.app/onyx-app/issue/ENG-3518
+    Only human accounts count toward seat limits.
+    SERVICE_ACCOUNT (API key dummy users), EXT_PERM_USER, and the
+    anonymous system user are excluded. BOT (Slack users) ARE counted
+    because they represent real humans and get upgraded to STANDARD
+    when they log in via web.
    """
    if MULTI_TENANT:
        from ee.onyx.server.tenants.user_mapping import get_tenant_count
@@ -126,9 +128,10 @@ def get_used_seats(tenant_id: str | None = None) -> int:
                select(func.count())
                .select_from(User)
                .where(
-                    User.is_active == True,  # type: ignore  # noqa: E712
+                    User.is_active == True,  # noqa: E712
                    User.role != UserRole.EXT_PERM_USER,
-                    User.email != ANONYMOUS_USER_EMAIL,  # type: ignore
+                    User.email != ANONYMOUS_USER_EMAIL,
+                    User.account_type != AccountType.SERVICE_ACCOUNT,
                )
            )
            return result.scalar() or 0
--- a/backend/ee/onyx/db/scim.py
+++ b/backend/ee/onyx/db/scim.py
@@ -121,7 +121,7 @@ class ScimDAL(DAL):
        """Update the last_used_at timestamp for a token."""
        token = self._session.get(ScimToken, token_id)
        if token:
-            token.last_used_at = func.now()  # type: ignore[assignment]
+            token.last_used_at = func.now()

    # ------------------------------------------------------------------
    # User mapping operations
@@ -229,7 +229,7 @@ class ScimDAL(DAL):
    def get_user(self, user_id: UUID) -> User | None:
        """Fetch a user by ID."""
        return self._session.scalar(
-            select(User).where(User.id == user_id)  # type: ignore[arg-type]
+            select(User).where(User.id == user_id)  # ty: ignore[invalid-argument-type]
        )

    def get_user_by_email(self, email: str) -> User | None:
@@ -293,16 +293,22 @@ class ScimDAL(DAL):
            if attr == "username":
                # arg-type: fastapi-users types User.email as str, not a column expression
                # assignment: union return type widens but query is still Select[tuple[User]]
-                query = _apply_scim_string_op(query, User.email, scim_filter)  # type: ignore[arg-type, assignment]
+                query = _apply_scim_string_op(
+                    query, User.email, scim_filter  # ty: ignore[invalid-argument-type]
+                )
            elif attr == "active":
                query = query.where(
-                    User.is_active.is_(scim_filter.value.lower() == "true")  # type: ignore[attr-defined]
+                    User.is_active.is_(  # ty: ignore[unresolved-attribute]
+                        scim_filter.value.lower() == "true"
+                    )
                )
            elif attr == "externalid":
                mapping = self.get_user_mapping_by_external_id(scim_filter.value)
                if not mapping:
                    return [], 0
-                query = query.where(User.id == mapping.user_id)  # type: ignore[arg-type]
+                query = query.where(
+                    User.id == mapping.user_id  # ty: ignore[invalid-argument-type]
+                )
            else:
                raise ValueError(
                    f"Unsupported filter attribute: {scim_filter.attribute}"
@@ -318,7 +324,9 @@ class ScimDAL(DAL):
        offset = max(start_index - 1, 0)
        users = list(
            self._session.scalars(
-                query.order_by(User.id).offset(offset).limit(count)  # type: ignore[arg-type]
+                query.order_by(User.id)  # ty: ignore[invalid-argument-type]
+                .offset(offset)
+                .limit(count)
            )
            .unique()
            .all()
@@ -577,7 +585,7 @@ class ScimDAL(DAL):
            attr = scim_filter.attribute.lower()
            if attr == "displayname":
                # assignment: union return type widens but query is still Select[tuple[UserGroup]]
-                query = _apply_scim_string_op(query, UserGroup.name, scim_filter)  # type: ignore[assignment]
+                query = _apply_scim_string_op(query, UserGroup.name, scim_filter)
            elif attr == "externalid":
                mapping = self.get_group_mapping_by_external_id(scim_filter.value)
                if not mapping:
@@ -615,7 +623,9 @@ class ScimDAL(DAL):

        users = (
            self._session.scalars(
-                select(User).where(User.id.in_(user_ids))  # type: ignore[attr-defined]
+                select(User).where(
+                    User.id.in_(user_ids)  # ty: ignore[unresolved-attribute]
+                )
            )
            .unique()
            .all()
@@ -640,7 +650,9 @@ class ScimDAL(DAL):
            return []
        existing_users = (
            self._session.scalars(
-                select(User).where(User.id.in_(uuids))  # type: ignore[attr-defined]
+                select(User).where(
+                    User.id.in_(uuids)  # ty: ignore[unresolved-attribute]
+                )
            )
            .unique()
            .all()
--- a/backend/ee/onyx/db/user_group.py
+++ b/backend/ee/onyx/db/user_group.py
@@ -300,8 +300,11 @@ def fetch_user_groups_for_user(
    stmt = (
        select(UserGroup)
        .join(User__UserGroup, User__UserGroup.user_group_id == UserGroup.id)
-        .join(User, User.id == User__UserGroup.user_id)  # type: ignore
-        .where(User.id == user_id)  # type: ignore
+        .join(
+            User,
+            User.id == User__UserGroup.user_id,  # ty: ignore[invalid-argument-type]
+        )
+        .where(User.id == user_id)  # ty: ignore[invalid-argument-type]
    )
    if only_curator_groups:
        stmt = stmt.where(User__UserGroup.is_curator == True)  # noqa: E712
@@ -430,7 +433,7 @@ def fetch_user_groups_for_documents(
        .group_by(Document.id)
    )

-    return db_session.execute(stmt).all()  # type: ignore
+    return db_session.execute(stmt).all()  # ty: ignore[invalid-return-type]


 def _check_user_group_is_modifiable(user_group: UserGroup) -> None:
@@ -804,7 +807,9 @@ def update_user_group(
        db_user_group.is_up_to_date = False

    removed_users = db_session.scalars(
-        select(User).where(User.id.in_(removed_user_ids))  # type: ignore
+        select(User).where(
+            User.id.in_(removed_user_ids)  # ty: ignore[unresolved-attribute]
+        )
    ).unique()

    # Filter out admin and global curator users before validating curator status
--- a/backend/ee/onyx/external_permissions/google_drive/folder_retrieval.py
+++ b/backend/ee/onyx/external_permissions/google_drive/folder_retrieval.py
@@ -1,6 +1,6 @@
 from collections.abc import Iterator

-from googleapiclient.discovery import Resource  # type: ignore
+from googleapiclient.discovery import Resource

 from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
 from ee.onyx.external_permissions.google_drive.permission_retrieval import (
@@ -38,7 +38,7 @@ def get_folder_permissions_by_ids(
        A list of permissions matching the provided permission IDs
    """
    return get_permissions_by_ids(
-        drive_service=service,
+        drive_service=service,  # ty: ignore[invalid-argument-type]
        doc_id=folder_id,
        permission_ids=permission_ids,
    )
@@ -68,7 +68,7 @@ def get_modified_folders(

    # Retrieve and yield folders
    for folder in execute_paginated_retrieval(
-        retrieval_function=service.files().list,
+        retrieval_function=service.files().list,  # ty: ignore[unresolved-attribute]
        list_key="files",
        continue_on_404_or_403=True,
        corpora="allDrives",
--- a/backend/ee/onyx/external_permissions/google_drive/group_sync.py
+++ b/backend/ee/onyx/external_permissions/google_drive/group_sync.py
@@ -1,6 +1,6 @@
 from collections.abc import Generator

-from googleapiclient.errors import HttpError  # type: ignore
+from googleapiclient.errors import HttpError
 from pydantic import BaseModel

 from ee.onyx.db.external_perm import ExternalUserGroup
@@ -183,7 +183,7 @@ def _get_drive_members(
    )

    admin_user_info = (
-        admin_service.users()
+        admin_service.users()  # ty: ignore[unresolved-attribute]
        .get(userKey=google_drive_connector.primary_admin_email)
        .execute()
    )
@@ -197,7 +197,7 @@ def _get_drive_members(

        try:
            for permission in execute_paginated_retrieval(
-                drive_service.permissions().list,
+                drive_service.permissions().list,  # ty: ignore[unresolved-attribute]
                list_key="permissions",
                fileId=drive_id,
                fields="permissions(emailAddress, type),nextPageToken",
@@ -256,7 +256,7 @@ def _get_all_google_groups(
    """
    group_emails: set[str] = set()
    for group in execute_paginated_retrieval(
-        admin_service.groups().list,
+        admin_service.groups().list,  # ty: ignore[unresolved-attribute]
        list_key="groups",
        domain=google_domain,
        fields="groups(email),nextPageToken",
@@ -274,7 +274,7 @@ def _google_group_to_onyx_group(
    """
    group_member_emails: set[str] = set()
    for member in execute_paginated_retrieval(
-        admin_service.members().list,
+        admin_service.members().list,  # ty: ignore[unresolved-attribute]
        list_key="members",
        groupKey=group_email,
        fields="members(email),nextPageToken",
@@ -298,7 +298,7 @@ def _map_group_email_to_member_emails(
    for group_email in group_emails:
        group_member_emails: set[str] = set()
        for member in execute_paginated_retrieval(
-            admin_service.members().list,
+            admin_service.members().list,  # ty: ignore[unresolved-attribute]
            list_key="members",
            groupKey=group_email,
            fields="members(email),nextPageToken",
--- a/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py
+++ b/backend/ee/onyx/external_permissions/google_drive/permission_retrieval.py
@@ -33,7 +33,7 @@ def get_permissions_by_ids(

    # Fetch all permissions for the document
    fetched_permissions = execute_paginated_retrieval(
-        retrieval_function=drive_service.permissions().list,
+        retrieval_function=drive_service.permissions().list,  # ty: ignore[unresolved-attribute]
        list_key="permissions",
        fileId=doc_id,
        fields="permissions(id, emailAddress, type, domain, allowFileDiscovery, permissionDetails),nextPageToken",
--- a/backend/ee/onyx/external_permissions/jira/page_access.py
+++ b/backend/ee/onyx/external_permissions/jira/page_access.py
@@ -68,7 +68,7 @@ def _build_holder_map(permissions: list[dict]) -> dict[str, list[Holder]]:
            logger.warning(f"Expected a 'raw' field, but none was found: {raw_perm=}")
            continue

-        permission = Permission(**raw_perm.raw)
+        permission = Permission(**raw_perm.raw)  # ty: ignore[invalid-argument-type]

        # We only care about ability to browse through projects + issues (not other permissions such as read/write).
        if permission.permission != "BROWSE_PROJECTS":
--- a/backend/ee/onyx/external_permissions/sharepoint/group_sync.py
+++ b/backend/ee/onyx/external_permissions/sharepoint/group_sync.py
@@ -1,6 +1,6 @@
 from collections.abc import Generator

-from office365.sharepoint.client_context import ClientContext  # type: ignore[import-untyped]
+from office365.sharepoint.client_context import ClientContext

 from ee.onyx.db.external_perm import ExternalUserGroup
 from ee.onyx.external_permissions.sharepoint.permission_utils import (
--- a/backend/ee/onyx/external_permissions/sharepoint/permission_utils.py
+++ b/backend/ee/onyx/external_permissions/sharepoint/permission_utils.py
@@ -7,11 +7,11 @@ from typing import Any
 from urllib.parse import urlparse

 import requests as _requests
-from office365.graph_client import GraphClient  # type: ignore[import-untyped]
-from office365.onedrive.driveitems.driveItem import DriveItem  # type: ignore[import-untyped]
-from office365.runtime.client_request import ClientRequestException  # type: ignore
-from office365.sharepoint.client_context import ClientContext  # type: ignore[import-untyped]
-from office365.sharepoint.permissions.securable_object import RoleAssignmentCollection  # type: ignore[import-untyped]
+from office365.graph_client import GraphClient
+from office365.onedrive.driveitems.driveItem import DriveItem
+from office365.runtime.client_request import ClientRequestException
+from office365.sharepoint.client_context import ClientContext
+from office365.sharepoint.permissions.securable_object import RoleAssignmentCollection
 from pydantic import BaseModel

 from ee.onyx.db.external_perm import ExternalUserGroup
--- a/backend/ee/onyx/server/analytics/api.py
+++ b/backend/ee/onyx/server/analytics/api.py
@@ -46,9 +46,10 @@ def get_query_analytics(
    daily_query_usage_info = fetch_query_analytics(
        start=start
        or (
-            datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
+            datetime.datetime.now(tz=datetime.timezone.utc)
+            - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
        ),  # default is 30d lookback
-        end=end or datetime.datetime.utcnow(),
+        end=end or datetime.datetime.now(tz=datetime.timezone.utc),
        db_session=db_session,
    )
    return [
@@ -77,9 +78,10 @@ def get_user_analytics(
    daily_query_usage_info_per_user = fetch_per_user_query_analytics(
        start=start
        or (
-            datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
+            datetime.datetime.now(tz=datetime.timezone.utc)
+            - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
        ),  # default is 30d lookback
-        end=end or datetime.datetime.utcnow(),
+        end=end or datetime.datetime.now(tz=datetime.timezone.utc),
        db_session=db_session,
    )

@@ -111,9 +113,10 @@ def get_onyxbot_analytics(
    daily_onyxbot_info = fetch_onyxbot_analytics(
        start=start
        or (
-            datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
+            datetime.datetime.now(tz=datetime.timezone.utc)
+            - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
        ),  # default is 30d lookback
-        end=end or datetime.datetime.utcnow(),
+        end=end or datetime.datetime.now(tz=datetime.timezone.utc),
        db_session=db_session,
    )

@@ -146,9 +149,10 @@ def get_persona_messages(
 ) -> list[PersonaMessageAnalyticsResponse]:
    """Fetch daily message counts for a single persona within the given time range."""
    start = start or (
-        datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
+        datetime.datetime.now(tz=datetime.timezone.utc)
+        - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
    )
-    end = end or datetime.datetime.utcnow()
+    end = end or datetime.datetime.now(tz=datetime.timezone.utc)

    persona_message_counts = []
    for count, date in fetch_persona_message_analytics(
@@ -226,9 +230,10 @@ def get_assistant_stats(
    along with the overall total messages and total distinct users.
    """
    start = start or (
-        datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
+        datetime.datetime.now(tz=datetime.timezone.utc)
+        - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
    )
-    end = end or datetime.datetime.utcnow()
+    end = end or datetime.datetime.now(tz=datetime.timezone.utc)

    if not user_can_view_assistant_stats(db_session, user, assistant_id):
        raise HTTPException(
--- a/backend/ee/onyx/server/features/hooks/api.py
+++ b/backend/ee/onyx/server/features/hooks/api.py
@@ -287,8 +287,10 @@ def update_hook(
    validated_is_reachable: bool | None = None
    if endpoint_url_changing or api_key_changing or timeout_changing:
        existing = _get_hook_or_404(db_session, hook_id)
-        effective_url: str = (
-            req.endpoint_url if endpoint_url_changing else existing.endpoint_url  # type: ignore[assignment]  # endpoint_url is required on create and cannot be cleared on update
+        effective_url: str = (  # ty: ignore[invalid-assignment]
+            req.endpoint_url
+            if endpoint_url_changing
+            else existing.endpoint_url  # endpoint_url is required on create and cannot be cleared on update
        )
        effective_api_key: str | None = (
            (api_key if not isinstance(api_key, UnsetType) else None)
@@ -299,8 +301,10 @@ def update_hook(
                else None
            )
        )
-        effective_timeout: float = (
-            req.timeout_seconds if timeout_changing else existing.timeout_seconds  # type: ignore[assignment]  # req.timeout_seconds is non-None when timeout_changing (validated by HookUpdateRequest)
+        effective_timeout: float = (  # ty: ignore[invalid-assignment]
+            req.timeout_seconds
+            if timeout_changing
+            else existing.timeout_seconds  # req.timeout_seconds is non-None when timeout_changing (validated by HookUpdateRequest)
        )
        validation = _validate_endpoint(
            endpoint_url=effective_url,
--- a/backend/ee/onyx/server/query_history/api.py
+++ b/backend/ee/onyx/server/query_history/api.py
@@ -97,7 +97,7 @@ def fetch_and_process_chat_session_history(
            break

        paged_snapshots = parallel_yield(
-            [
+            [  # ty: ignore[invalid-argument-type]
                yield_snapshot_from_chat_session(
                    db_session=db_session,
                    chat_session=chat_session,
--- a/backend/ee/onyx/server/scim/api.py
+++ b/backend/ee/onyx/server/scim/api.py
@@ -11,6 +11,8 @@ require a valid SCIM bearer token.

 from __future__ import annotations

+import hashlib
+import struct
 from uuid import UUID

 from fastapi import APIRouter
@@ -22,6 +24,7 @@ from fastapi import Response
 from fastapi.responses import JSONResponse
 from fastapi_users.password import PasswordHelper
 from sqlalchemy import func
+from sqlalchemy import text
 from sqlalchemy.exc import IntegrityError
 from sqlalchemy.orm import Session

@@ -65,12 +68,25 @@ from onyx.db.permissions import recompute_user_permissions__no_commit
 from onyx.db.users import assign_user_to_default_groups__no_commit
 from onyx.utils.logger import setup_logger
 from onyx.utils.variable_functionality import fetch_ee_implementation_or_noop
+from shared_configs.contextvars import get_current_tenant_id

 logger = setup_logger()

 # Group names reserved for system default groups (seeded by migration).
 _RESERVED_GROUP_NAMES = frozenset({"Admin", "Basic"})

+# Namespace prefix for the seat-allocation advisory lock. Hashed together
+# with the tenant ID so the lock is scoped per-tenant (unrelated tenants
+# never block each other) and cannot collide with unrelated advisory locks.
+_SEAT_LOCK_NAMESPACE = "onyx_scim_seat_lock"
+
+
+def _seat_lock_id_for_tenant(tenant_id: str) -> int:
+    """Derive a stable 64-bit signed int lock id for this tenant's seat lock."""
+    digest = hashlib.sha256(f"{_SEAT_LOCK_NAMESPACE}:{tenant_id}".encode()).digest()
+    # pg_advisory_xact_lock takes a signed 8-byte int; unpack as such.
+    return struct.unpack("q", digest[:8])[0]
+

 class ScimJSONResponse(JSONResponse):
    """JSONResponse with Content-Type: application/scim+json (RFC 7644 §3.1)."""
@@ -209,12 +225,37 @@ def _apply_exclusions(


 def _check_seat_availability(dal: ScimDAL) -> str | None:
-    """Return an error message if seat limit is reached, else None."""
+    """Return an error message if seat limit is reached, else None.
+
+    Acquires a transaction-scoped advisory lock so that concurrent
+    SCIM requests are serialized.  IdPs like Okta send provisioning
+    requests in parallel batches — without serialization the check is
+    vulnerable to a TOCTOU race where N concurrent requests each see
+    "seats available", all insert, and the tenant ends up over its
+    seat limit.
+
+    The lock is held until the caller's next COMMIT or ROLLBACK, which
+    means the seat count cannot change between the check here and the
+    subsequent INSERT/UPDATE.  Each call site in this module follows
+    the pattern: _check_seat_availability → write → dal.commit()
+    (which releases the lock for the next waiting request).
+    """
    check_fn = fetch_ee_implementation_or_noop(
        "onyx.db.license", "check_seat_availability", None
    )
    if check_fn is None:
        return None
+
+    # Transaction-scoped advisory lock — released on dal.commit() / dal.rollback().
+    # The lock id is derived from the tenant so unrelated tenants never block
+    # each other, and from a namespace string so it cannot collide with
+    # unrelated advisory locks elsewhere in the codebase.
+    lock_id = _seat_lock_id_for_tenant(get_current_tenant_id())
+    dal.session.execute(
+        text("SELECT pg_advisory_xact_lock(:lock_id)"),
+        {"lock_id": lock_id},
+    )
+
    result = check_fn(dal.session, seats_needed=1)
    if not result.available:
        return result.error_message or "Seat limit reached"
--- a/backend/ee/onyx/server/tenants/access.py
+++ b/backend/ee/onyx/server/tenants/access.py
@@ -1,5 +1,6 @@
 from datetime import datetime
 from datetime import timedelta
+from datetime import timezone

 import jwt
 from fastapi import HTTPException
@@ -19,8 +20,8 @@ def generate_data_plane_token() -> str:

    payload = {
        "iss": "data_plane",
-        "exp": datetime.utcnow() + timedelta(minutes=5),
-        "iat": datetime.utcnow(),
+        "exp": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
+        "iat": datetime.now(tz=timezone.utc),
        "scope": "api_access",
    }

--- a/backend/ee/onyx/server/tenants/schema_management.py
+++ b/backend/ee/onyx/server/tenants/schema_management.py
@@ -55,8 +55,10 @@ def run_alembic_migrations(schema_name: str) -> None:
        alembic_cfg.attributes["configure_logger"] = False

        # Mimic command-line options by adding 'cmd_opts' to the config
-        alembic_cfg.cmd_opts = SimpleNamespace()  # type: ignore
-        alembic_cfg.cmd_opts.x = [f"schemas={schema_name}"]  # type: ignore
+        alembic_cfg.cmd_opts = SimpleNamespace()  # ty: ignore[invalid-assignment]
+        alembic_cfg.cmd_opts.x = [  # ty: ignore[invalid-assignment]
+            f"schemas={schema_name}"
+        ]

        # Run migrations programmatically
        command.upgrade(alembic_cfg, "head")
--- a/backend/ee/onyx/server/tenants/user_mapping.py
+++ b/backend/ee/onyx/server/tenants/user_mapping.py
@@ -349,8 +349,9 @@ def get_tenant_count(tenant_id: str) -> int:
        user_count = (
            db_session.query(User)
            .filter(
-                User.email.in_(emails),  # type: ignore
-                User.is_active == True,  # type: ignore  # noqa: E712
+                User.email.in_(emails),  # ty: ignore[unresolved-attribute]
+                User.is_active  # noqa: E712  # ty: ignore[invalid-argument-type]
+                == True,
            )
            .count()
        )
--- a/backend/ee/onyx/utils/posthog_client.py
+++ b/backend/ee/onyx/utils/posthog_client.py
@@ -73,7 +73,7 @@ def capture_and_sync_with_alternate_posthog(
            cloud_props.pop("onyx_cloud_user_id", None)

            posthog.identify(
-                distinct_id=cloud_user_id,
+                distinct_id=cloud_user_id,  # ty: ignore[possibly-unresolved-reference]
                properties=cloud_props,
            )
    except Exception as e:
@@ -105,7 +105,7 @@ def get_anon_id_from_request(request: Any) -> str | None:
    if (cookie_value := request.cookies.get(cookie_name)) and (
        parsed := parse_posthog_cookie(cookie_value)
    ):
-        return parsed.get("distinct_id")
+        return parsed.get("distinct_id")  # ty: ignore[possibly-unresolved-reference]

    return None

--- a/backend/model_server/legacy/custom_models.py
+++ b/backend/model_server/legacy/custom_models.py
@@ -23,7 +23,7 @@
 # from shared_configs.model_server_models import IntentResponse

 # if TYPE_CHECKING:
-#     from setfit import SetFitModel  # type: ignore[import-untyped]
+#     from setfit import SetFitModel
 #     from transformers import PreTrainedTokenizer, BatchEncoding


@@ -423,7 +423,7 @@
 # def map_keywords(
 #     input_ids: torch.Tensor, tokenizer: "PreTrainedTokenizer", is_keyword: list[bool]
 # ) -> list[str]:
-#     tokens = tokenizer.convert_ids_to_tokens(input_ids)  # type: ignore
+#     tokens = tokenizer.convert_ids_to_tokens(input_ids)

 #     if not len(tokens) == len(is_keyword):
 #         raise ValueError("Length of tokens and keyword predictions must match")
--- a/backend/model_server/legacy/onyx_torch_model.py
+++ b/backend/model_server/legacy/onyx_torch_model.py
@@ -18,7 +18,7 @@
 #         super().__init__()
 #         config = DistilBertConfig()
 #         self.distilbert = DistilBertModel(config)
-#         config = self.distilbert.config  # type: ignore
+#         config = self.distilbert.config

 #         # Keyword tokenwise binary classification layer
 #         self.keyword_classifier = nn.Linear(config.dim, 2)
@@ -85,7 +85,7 @@

 #         self.config = config
 #         self.distilbert = DistilBertModel(config)
-#         config = self.distilbert.config  # type: ignore
+#         config = self.distilbert.config
 #         self.connector_global_classifier = nn.Linear(config.dim, 1)
 #         self.connector_match_classifier = nn.Linear(config.dim, 1)
 #         self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
--- a/backend/model_server/main.py
+++ b/backend/model_server/main.py
@@ -96,11 +96,14 @@ def get_model_app() -> FastAPI:
        title="Onyx Model Server", version=__version__, lifespan=lifespan
    )
    if SENTRY_DSN:
+        from onyx.configs.sentry import _add_instance_tags
+
        sentry_sdk.init(
            dsn=SENTRY_DSN,
            integrations=[StarletteIntegration(), FastApiIntegration()],
            traces_sample_rate=0.1,
            release=__version__,
+            before_send=_add_instance_tags,
        )
        logger.info("Sentry initialized")
    else:
--- a/backend/onyx/auth/email_utils.py
+++ b/backend/onyx/auth/email_utils.py
@@ -7,8 +7,8 @@ from email.mime.text import MIMEText
 from email.utils import formatdate
 from email.utils import make_msgid

-import sendgrid  # type: ignore
-from sendgrid.helpers.mail import Attachment  # type: ignore
+import sendgrid
+from sendgrid.helpers.mail import Attachment
 from sendgrid.helpers.mail import Content
 from sendgrid.helpers.mail import ContentId
 from sendgrid.helpers.mail import Disposition
--- a/backend/onyx/auth/jwt.py
+++ b/backend/onyx/auth/jwt.py
@@ -10,7 +10,7 @@ from cryptography.hazmat.primitives.asymmetric.rsa import RSAPublicKey
 from jwt import decode as jwt_decode
 from jwt import InvalidTokenError
 from jwt import PyJWTError
-from jwt.algorithms import RSAAlgorithm
+from jwt.algorithms import RSAAlgorithm  # ty: ignore[possibly-missing-import]

 from onyx.configs.app_configs import JWT_PUBLIC_KEY_URL
 from onyx.utils.logger import setup_logger
--- a/backend/onyx/auth/oauth_refresher.py
+++ b/backend/onyx/auth/oauth_refresher.py
@@ -46,8 +46,10 @@ async def _test_expire_oauth_token(

        updated_data: Dict[str, Any] = {"expires_at": new_expires_at}

-        await user_manager.user_db.update_oauth_account(
-            user, cast(Any, oauth_account), updated_data
+        await user_manager.user_db.update_oauth_account(  # ty: ignore[invalid-argument-type]
+            user,  # ty: ignore[invalid-argument-type]
+            cast(Any, oauth_account),
+            updated_data,
        )

        return True
@@ -132,8 +134,10 @@ async def refresh_oauth_token(
                    )

            # Update the OAuth account
-            await user_manager.user_db.update_oauth_account(
-                user, cast(Any, oauth_account), updated_data
+            await user_manager.user_db.update_oauth_account(  # ty: ignore[invalid-argument-type]
+                user,  # ty: ignore[invalid-argument-type]
+                cast(Any, oauth_account),
+                updated_data,
            )

            logger.info(f"Successfully refreshed OAuth token for {user.email}")
--- a/backend/onyx/auth/oauth_token_manager.py
+++ b/backend/onyx/auth/oauth_token_manager.py
@@ -191,7 +191,7 @@ class OAuthTokenManager:
    @staticmethod
    def _unwrap_sensitive_str(value: SensitiveValue[str] | str) -> str:
        if isinstance(value, SensitiveValue):
-            return value.get_value(apply_mask=False)
+            return value.get_value(apply_mask=False)  # ty: ignore[invalid-return-type]
        return value

    @staticmethod
@@ -199,5 +199,7 @@ class OAuthTokenManager:
        token_data: SensitiveValue[dict[str, Any]] | dict[str, Any],
    ) -> dict[str, Any]:
        if isinstance(token_data, SensitiveValue):
-            return token_data.get_value(apply_mask=False)
+            return token_data.get_value(  # ty: ignore[invalid-return-type]
+                apply_mask=False
+            )
        return token_data
--- a/backend/onyx/auth/permissions.py
+++ b/backend/onyx/auth/permissions.py
@@ -121,5 +121,7 @@ def require_permission(

        return user

-    dependency._is_require_permission = True  # type: ignore[attr-defined]  # sentinel for auth_check detection
+    dependency._is_require_permission = (  # ty: ignore[unresolved-attribute]
+        True  # sentinel for auth_check detection
+    )
    return dependency
--- a/backend/onyx/auth/users.py
+++ b/backend/onyx/auth/users.py
@@ -45,7 +45,9 @@ from fastapi_users import UUIDIDMixin
 from fastapi_users.authentication import AuthenticationBackend
 from fastapi_users.authentication import CookieTransport
 from fastapi_users.authentication import JWTStrategy
-from fastapi_users.authentication import RedisStrategy
+from fastapi_users.authentication import (
+    RedisStrategy,  # ty: ignore[possibly-missing-import]
+)
 from fastapi_users.authentication import Strategy
 from fastapi_users.authentication.strategy.db import AccessTokenDatabase
 from fastapi_users.authentication.strategy.db import DatabaseStrategy
@@ -462,14 +464,16 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                    self.user_db = tenant_user_db

                if hasattr(user_create, "role"):
-                    user_create.role = UserRole.BASIC
+                    user_create.role = UserRole.BASIC  # ty: ignore[invalid-assignment]

                    user_count = await get_user_count()
                    if (
                        user_count == 0
                        or user_create.email in get_default_admin_user_emails()
                    ):
-                        user_create.role = UserRole.ADMIN
+                        user_create.role = (  # ty: ignore[invalid-assignment]
+                            UserRole.ADMIN
+                        )

                # Check seat availability for new users (single-tenant only)
                with get_session_with_current_tenant() as sync_db:
@@ -516,7 +520,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                    # Expire so the async session re-fetches the row updated by
                    # the sync session above.
                    self.user_db.session.expire(user)
-                    user = await self.user_db.get(user_id)  # type: ignore[assignment]
+                    user = await self.user_db.get(  # ty: ignore[invalid-assignment]
+                        user_id
+                    )
                except exceptions.UserAlreadyExists:
                    user = await self.get_by_email(user_create.email)

@@ -544,7 +550,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                    # Expire so the async session re-fetches the row updated by
                    # the sync session above.
                    self.user_db.session.expire(user)
-                    user = await self.user_db.get(user_id)  # type: ignore[assignment]
+                    user = await self.user_db.get(  # ty: ignore[invalid-assignment]
+                        user_id
+                    )
                if user_created:
                    await self._assign_default_pinned_assistants(user, db_session)
                remove_user_from_invited_users(user_create.email)
@@ -592,7 +600,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
        update nor the group assignment is visible without the other.
        """
        with get_session_with_current_tenant() as sync_db:
-            sync_user = sync_db.query(User).filter(User.id == user_id).first()  # type: ignore[arg-type]
+            sync_user = (
+                sync_db.query(User)
+                .filter(User.id == user_id)  # ty: ignore[invalid-argument-type]
+                .first()
+            )
            if sync_user:
                sync_user.hashed_password = self.password_helper.hash(
                    user_create.password
@@ -613,7 +625,9 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                    user_id,
                )

-    async def validate_password(self, password: str, _: schemas.UC | models.UP) -> None:
+    async def validate_password(  # ty: ignore[invalid-method-override]
+        self, password: str, _: schemas.UC | models.UP
+    ) -> None:
        # Validate password according to configurable security policy (defined via environment variables)
        if len(password) < PASSWORD_MIN_LENGTH:
            raise exceptions.InvalidPasswordException(
@@ -644,7 +658,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
        return

    @log_function_time(print_only=True)
-    async def oauth_callback(
+    async def oauth_callback(  # ty: ignore[invalid-method-override]
        self,
        oauth_name: str,
        access_token: str,
@@ -754,7 +768,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                                user,
                                # NOTE: OAuthAccount DOES implement the OAuthAccountProtocol
                                # but the type checker doesn't know that :(
-                                existing_oauth_account,  # type: ignore
+                                existing_oauth_account,  # ty: ignore[invalid-argument-type]
                                oauth_account_dict,
                            )

@@ -788,7 +802,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
                # transaction so neither change is visible without the other.
                was_inactive = not user.is_active
                with get_session_with_current_tenant() as sync_db:
-                    sync_user = sync_db.query(User).filter(User.id == user.id).first()  # type: ignore[arg-type]
+                    sync_user = (
+                        sync_db.query(User)
+                        .filter(User.id == user.id)  # ty: ignore[invalid-argument-type]
+                        .first()
+                    )
                    if sync_user:
                        sync_user.is_verified = is_verified_by_default
                        sync_user.role = UserRole.BASIC
@@ -808,7 +826,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
            # otherwise, the oidc expiry will always be old, and the user will never be able to login
            if user.oidc_expiry is not None and not TRACK_EXTERNAL_IDP_EXPIRY:
                await self.user_db.update(user, {"oidc_expiry": None})
-                user.oidc_expiry = None  # type: ignore
+                user.oidc_expiry = None  # ty: ignore[invalid-assignment]
            remove_user_from_invited_users(user.email)
            if token:
                CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
@@ -925,7 +943,11 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
            and (marketing_cookie_value := request.cookies.get(marketing_cookie_name))
            and (parsed_cookie := parse_posthog_cookie(marketing_cookie_value))
        ):
-            marketing_anonymous_id = parsed_cookie["distinct_id"]
+            marketing_anonymous_id = (
+                parsed_cookie[  # ty: ignore[possibly-unresolved-reference]
+                    "distinct_id"
+                ]
+            )

            # Technically, USER_SIGNED_UP is only fired from the cloud site when
            # it is the first user in a tenant. However, it is semantically correct
@@ -942,7 +964,10 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
            }

            # Add all other values from the marketing cookie (featureFlags, etc.)
-            for key, value in parsed_cookie.items():
+            for (
+                key,
+                value,
+            ) in parsed_cookie.items():  # ty: ignore[possibly-unresolved-reference]
                if key != "distinct_id":
                    properties.setdefault(key, value)

@@ -1504,7 +1529,7 @@ async def _sync_jwt_oidc_expiry(

    if user.oidc_expiry is not None:
        await user_manager.user_db.update(user, {"oidc_expiry": None})
-        user.oidc_expiry = None  # type: ignore
+        user.oidc_expiry = None  # ty: ignore[invalid-assignment]


 async def _get_or_create_user_from_jwt(
@@ -2232,7 +2257,7 @@ def get_oauth_router(

            # Proceed to authenticate or create the user
            try:
-                user = await user_manager.oauth_callback(
+                user = await user_manager.oauth_callback(  # ty: ignore[invalid-argument-type]
                    oauth_client.name,
                    token["access_token"],
                    account_id,
--- a/backend/onyx/background/celery/README.md
+++ b/backend/onyx/background/celery/README.md
@@ -1,37 +0,0 @@
-# Celery Development Notes
-
-This document is the local reference for Celery worker structure and task-writing rules in Onyx.
-
-## Worker Types
-
-Onyx uses multiple specialized workers:
-
-1. `primary`: coordinates core background tasks and system-wide operations.
-2. `docfetching`: fetches documents from connectors and schedules downstream work.
-3. `docprocessing`: runs the indexing pipeline for fetched documents.
-4. `light`: handles lightweight and fast operations.
-5. `heavy`: handles more resource-intensive operations.
-6. `kg_processing`: runs knowledge-graph processing and clustering.
-7. `monitoring`: collects health and system metrics.
-8. `user_file_processing`: processes user-uploaded files.
-9. `beat`: schedules periodic work.
-
-For actual implementation details, inspect:
-
- `backend/onyx/background/celery/apps/`
- `backend/onyx/background/celery/configs/`
- `backend/onyx/background/celery/tasks/`
-
-## Task Rules
-
- Always use `@shared_task` rather than `@celery_app`.
- Put tasks under `background/celery/tasks/` or `ee/background/celery/tasks/`.
- Never enqueue a task without `expires=`. This is a hard requirement because stale queued work can
-  accumulate without bound.
- Do not rely on Celery time-limit enforcement. These workers run in thread pools, so timeout logic
-  must be implemented inside the task itself.
-
-## Testing Note
-
-If you change Celery worker code and want to validate it against a running local worker, the worker
-usually needs to be restarted manually. There is no general auto-restart on code change.
--- a/backend/onyx/background/celery/apps/app_base.py
+++ b/backend/onyx/background/celery/apps/app_base.py
@@ -6,15 +6,16 @@ from typing import Any
 from typing import cast

 import sentry_sdk
-from celery import bootsteps  # type: ignore
+from celery import bootsteps  # ty: ignore[unresolved-import]
 from celery import Task
-from celery.app import trace
+from celery.app import trace  # ty: ignore[unresolved-import]
 from celery.exceptions import WorkerShutdown
+from celery.signals import before_task_publish
 from celery.signals import task_postrun
 from celery.signals import task_prerun
 from celery.states import READY_STATES
 from celery.utils.log import get_task_logger
-from celery.worker import strategy  # type: ignore
+from celery.worker import strategy  # ty: ignore[unresolved-import]
 from redis.lock import Lock as RedisLock
 from sentry_sdk.integrations.celery import CeleryIntegration
 from sqlalchemy import text
@@ -62,11 +63,14 @@ logger = setup_logger()
 task_logger = get_task_logger(__name__)

 if SENTRY_DSN:
+    from onyx.configs.sentry import _add_instance_tags
+
    sentry_sdk.init(
        dsn=SENTRY_DSN,
        integrations=[CeleryIntegration()],
        traces_sample_rate=0.1,
        release=__version__,
+        before_send=_add_instance_tags,
    )
    logger.info("Sentry initialized")
 else:
@@ -94,6 +98,17 @@ class TenantAwareTask(Task):
            CURRENT_TENANT_ID_CONTEXTVAR.set(None)


+@before_task_publish.connect
+def on_before_task_publish(
+    headers: dict[str, Any] | None = None,
+    **kwargs: Any,  # noqa: ARG001
+) -> None:
+    """Stamp the current wall-clock time into the task message headers so that
+    workers can compute queue wait time (time between publish and execution)."""
+    if headers is not None:
+        headers["enqueued_at"] = time.time()
+
+
@task_prerun.connect
 def on_task_prerun(
    sender: Any | None = None,  # noqa: ARG001
--- a/backend/onyx/background/celery/apps/beat.py
+++ b/backend/onyx/background/celery/apps/beat.py
@@ -3,7 +3,7 @@ from typing import Any

 from celery import Celery
 from celery import signals
-from celery.beat import PersistentScheduler  # type: ignore
+from celery.beat import PersistentScheduler  # ty: ignore[unresolved-import]
 from celery.signals import beat_init
 from celery.utils.log import get_task_logger

--- a/backend/onyx/background/celery/apps/client.py
+++ b/backend/onyx/background/celery/apps/client.py
@@ -4,4 +4,4 @@ import onyx.background.celery.apps.app_base as app_base

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.client")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]
--- a/backend/onyx/background/celery/apps/docfetching.py
+++ b/backend/onyx/background/celery/apps/docfetching.py
@@ -29,7 +29,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.docfetching")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
@@ -100,7 +100,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
    logger.info("worker_init signal received.")

    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_DOCFETCHING_APP_NAME)
-    pool_size = cast(int, sender.concurrency)  # type: ignore
+    pool_size = cast(int, sender.concurrency)  # ty: ignore[unresolved-attribute]
    SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)

    app_base.wait_for_redis(sender, **kwargs)
--- a/backend/onyx/background/celery/apps/docprocessing.py
+++ b/backend/onyx/background/celery/apps/docprocessing.py
@@ -30,7 +30,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.docprocessing")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
@@ -106,7 +106,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
    # "SSL connection has been closed unexpectedly"
    # actually setting the spawn method in the cloud fixes 95% of these.
    # setting pre ping might help even more, but not worrying about that yet
-    pool_size = cast(int, sender.concurrency)  # type: ignore
+    pool_size = cast(int, sender.concurrency)  # ty: ignore[unresolved-attribute]
    SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)

    app_base.wait_for_redis(sender, **kwargs)
--- a/backend/onyx/background/celery/apps/heavy.py
+++ b/backend/onyx/background/celery/apps/heavy.py
@@ -27,7 +27,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.heavy")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
@@ -92,7 +92,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
    logger.info("worker_init signal received.")

    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
-    pool_size = cast(int, sender.concurrency)  # type: ignore
+    pool_size = cast(int, sender.concurrency)  # ty: ignore[unresolved-attribute]
    SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)

    app_base.wait_for_redis(sender, **kwargs)
--- a/backend/onyx/background/celery/apps/light.py
+++ b/backend/onyx/background/celery/apps/light.py
@@ -16,6 +16,12 @@ from onyx.configs.app_configs import VESPA_CLOUD_CERT_PATH
 from onyx.configs.app_configs import VESPA_CLOUD_KEY_PATH
 from onyx.configs.constants import POSTGRES_CELERY_WORKER_LIGHT_APP_NAME
 from onyx.db.engine.sql_engine import SqlEngine
+from onyx.server.metrics.celery_task_metrics import on_celery_task_postrun
+from onyx.server.metrics.celery_task_metrics import on_celery_task_prerun
+from onyx.server.metrics.celery_task_metrics import on_celery_task_rejected
+from onyx.server.metrics.celery_task_metrics import on_celery_task_retry
+from onyx.server.metrics.celery_task_metrics import on_celery_task_revoked
+from onyx.server.metrics.metrics_server import start_metrics_server
 from onyx.utils.logger import setup_logger
 from shared_configs.configs import MULTI_TENANT

@@ -23,7 +29,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.light")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
@@ -36,6 +42,7 @@ def on_task_prerun(
    **kwds: Any,
 ) -> None:
    app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds)
+    on_celery_task_prerun(task_id, task)


@signals.task_postrun.connect
@@ -50,6 +57,31 @@ def on_task_postrun(
    **kwds: Any,
 ) -> None:
    app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds)
+    on_celery_task_postrun(task_id, task, state)
+
+
+@signals.task_retry.connect
+def on_task_retry(sender: Any | None = None, **kwargs: Any) -> None:  # noqa: ARG001
+    task_id = getattr(getattr(sender, "request", None), "id", None)
+    on_celery_task_retry(task_id, sender)
+
+
+@signals.task_revoked.connect
+def on_task_revoked(sender: Any | None = None, **kwargs: Any) -> None:
+    task_name = getattr(sender, "name", None) or str(sender)
+    on_celery_task_revoked(kwargs.get("task_id"), task_name)
+
+
+@signals.task_rejected.connect
+def on_task_rejected(sender: Any | None = None, **kwargs: Any) -> None:  # noqa: ARG001
+    message = kwargs.get("message")
+    task_name: str | None = None
+    if message is not None:
+        headers = getattr(message, "headers", None) or {}
+        task_name = headers.get("task")
+    if task_name is None:
+        task_name = "unknown"
+    on_celery_task_rejected(None, task_name)


@celeryd_init.connect
@@ -63,19 +95,26 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:

    logger.info("worker_init signal received.")

-    logger.info(f"Concurrency: {sender.concurrency}")  # type: ignore
+    logger.info(
+        f"Concurrency: {sender.concurrency}"  # ty: ignore[unresolved-attribute]
+    )

    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
-    SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=EXTRA_CONCURRENCY)  # type: ignore
+    SqlEngine.init_engine(
+        pool_size=sender.concurrency,  # ty: ignore[unresolved-attribute]
+        max_overflow=EXTRA_CONCURRENCY,
+    )

    if MANAGED_VESPA:
        httpx_init_vespa_pool(
-            sender.concurrency + EXTRA_CONCURRENCY,  # type: ignore
+            sender.concurrency + EXTRA_CONCURRENCY,  # ty: ignore[unresolved-attribute]
            ssl_cert=VESPA_CLOUD_CERT_PATH,
            ssl_key=VESPA_CLOUD_KEY_PATH,
        )
    else:
-        httpx_init_vespa_pool(sender.concurrency + EXTRA_CONCURRENCY)  # type: ignore
+        httpx_init_vespa_pool(
+            sender.concurrency + EXTRA_CONCURRENCY  # ty: ignore[unresolved-attribute]
+        )

    app_base.wait_for_redis(sender, **kwargs)
    app_base.wait_for_db(sender, **kwargs)
@@ -90,6 +129,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:

@worker_ready.connect
 def on_worker_ready(sender: Any, **kwargs: Any) -> None:
+    start_metrics_server("light")
    app_base.on_worker_ready(sender, **kwargs)


--- a/backend/onyx/background/celery/apps/monitoring.py
+++ b/backend/onyx/background/celery/apps/monitoring.py
@@ -20,7 +20,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.monitoring")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
--- a/backend/onyx/background/celery/apps/primary.py
+++ b/backend/onyx/background/celery/apps/primary.py
@@ -3,7 +3,7 @@ import os
 from typing import Any
 from typing import cast

-from celery import bootsteps  # type: ignore
+from celery import bootsteps  # ty: ignore[unresolved-import]
 from celery import Celery
 from celery import signals
 from celery import Task
@@ -38,6 +38,12 @@ from onyx.redis.redis_connector_stop import RedisConnectorStop
 from onyx.redis.redis_document_set import RedisDocumentSet
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_usergroup import RedisUserGroup
+from onyx.server.metrics.celery_task_metrics import on_celery_task_postrun
+from onyx.server.metrics.celery_task_metrics import on_celery_task_prerun
+from onyx.server.metrics.celery_task_metrics import on_celery_task_rejected
+from onyx.server.metrics.celery_task_metrics import on_celery_task_retry
+from onyx.server.metrics.celery_task_metrics import on_celery_task_revoked
+from onyx.server.metrics.metrics_server import start_metrics_server
 from onyx.utils.logger import setup_logger
 from shared_configs.configs import MULTI_TENANT
 from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
@@ -46,7 +52,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.primary")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
@@ -59,6 +65,7 @@ def on_task_prerun(
    **kwds: Any,
 ) -> None:
    app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds)
+    on_celery_task_prerun(task_id, task)


@signals.task_postrun.connect
@@ -73,6 +80,31 @@ def on_task_postrun(
    **kwds: Any,
 ) -> None:
    app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds)
+    on_celery_task_postrun(task_id, task, state)
+
+
+@signals.task_retry.connect
+def on_task_retry(sender: Any | None = None, **kwargs: Any) -> None:  # noqa: ARG001
+    task_id = getattr(getattr(sender, "request", None), "id", None)
+    on_celery_task_retry(task_id, sender)
+
+
+@signals.task_revoked.connect
+def on_task_revoked(sender: Any | None = None, **kwargs: Any) -> None:
+    task_name = getattr(sender, "name", None) or str(sender)
+    on_celery_task_revoked(kwargs.get("task_id"), task_name)
+
+
+@signals.task_rejected.connect
+def on_task_rejected(sender: Any | None = None, **kwargs: Any) -> None:  # noqa: ARG001
+    message = kwargs.get("message")
+    task_name: str | None = None
+    if message is not None:
+        headers = getattr(message, "headers", None) or {}
+        task_name = headers.get("task")
+    if task_name is None:
+        task_name = "unknown"
+    on_celery_task_rejected(None, task_name)


@celeryd_init.connect
@@ -85,7 +117,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
    logger.info("worker_init signal received.")

    SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
-    pool_size = cast(int, sender.concurrency)  # type: ignore
+    pool_size = cast(int, sender.concurrency)  # ty: ignore[unresolved-attribute]
    SqlEngine.init_engine(
        pool_size=pool_size, max_overflow=CELERY_WORKER_PRIMARY_POOL_OVERFLOW
    )
@@ -145,7 +177,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
        raise WorkerShutdown("Primary worker lock could not be acquired!")

    # tacking on our own user data to the sender
-    sender.primary_worker_lock = lock  # type: ignore
+    sender.primary_worker_lock = lock  # ty: ignore[unresolved-attribute]

    # As currently designed, when this worker starts as "primary", we reinitialize redis
    # to a clean state (for our purposes, anyway)
@@ -212,6 +244,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:

@worker_ready.connect
 def on_worker_ready(sender: Any, **kwargs: Any) -> None:
+    start_metrics_server("primary")
    app_base.on_worker_ready(sender, **kwargs)


--- a/backend/onyx/background/celery/apps/user_file_processing.py
+++ b/backend/onyx/background/celery/apps/user_file_processing.py
@@ -22,7 +22,7 @@ logger = setup_logger()

 celery_app = Celery(__name__)
 celery_app.config_from_object("onyx.background.celery.configs.user_file_processing")
-celery_app.Task = app_base.TenantAwareTask  # type: ignore [misc]
+celery_app.Task = app_base.TenantAwareTask  # ty: ignore[invalid-assignment]


@signals.task_prerun.connect
@@ -66,7 +66,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
    # "SSL connection has been closed unexpectedly"
    # actually setting the spawn method in the cloud fixes 95% of these.
    # setting pre ping might help even more, but not worrying about that yet
-    pool_size = cast(int, sender.concurrency)  # type: ignore
+    pool_size = cast(int, sender.concurrency)  # ty: ignore[unresolved-attribute]
    SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)

    app_base.wait_for_redis(sender, **kwargs)
--- a/backend/onyx/background/celery/celery_redis.py
+++ b/backend/onyx/background/celery/celery_redis.py
@@ -179,7 +179,7 @@ def celery_inspect_get_workers(name_filter: str | None, app: Celery) -> list[str

    # filter for and create an indexing specific inspect object
    inspect = app.control.inspect()
-    workers: dict[str, Any] = inspect.ping()  # type: ignore
+    workers: dict[str, Any] = inspect.ping()  # ty: ignore[invalid-assignment]
    if workers:
        for worker_name in list(workers.keys()):
            # if the name filter not set, return all worker names
@@ -208,7 +208,9 @@ def celery_inspect_get_reserved(worker_names: list[str], app: Celery) -> set[str
    inspect = app.control.inspect(destination=worker_names)

    # get the list of reserved tasks
-    reserved_tasks: dict[str, list] | None = inspect.reserved()  # type: ignore
+    reserved_tasks: dict[str, list] | None = (  # ty: ignore[invalid-assignment]
+        inspect.reserved()
+    )
    if reserved_tasks:
        for _, task_list in reserved_tasks.items():
            for task in task_list:
@@ -229,7 +231,9 @@ def celery_inspect_get_active(worker_names: list[str], app: Celery) -> set[str]:
    inspect = app.control.inspect(destination=worker_names)

    # get the list of reserved tasks
-    active_tasks: dict[str, list] | None = inspect.active()  # type: ignore
+    active_tasks: dict[str, list] | None = (  # ty: ignore[invalid-assignment]
+        inspect.active()
+    )
    if active_tasks:
        for _, task_list in active_tasks.items():
            for task in task_list:
--- a/backend/onyx/background/celery/tasks/beat_schedule.py
+++ b/backend/onyx/background/celery/tasks/beat_schedule.py
@@ -6,6 +6,7 @@ from celery.schedules import crontab

 from onyx.configs.app_configs import AUTO_LLM_CONFIG_URL
 from onyx.configs.app_configs import AUTO_LLM_UPDATE_INTERVAL_SECONDS
+from onyx.configs.app_configs import DISABLE_OPENSEARCH_MIGRATION_TASK
 from onyx.configs.app_configs import DISABLE_VECTOR_DB
 from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
 from onyx.configs.app_configs import ENTERPRISE_EDITION_ENABLED
@@ -226,7 +227,7 @@ if SCHEDULED_EVAL_DATASET_NAMES:
    )

 # Add OpenSearch migration task if enabled.
-if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
+if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX and not DISABLE_OPENSEARCH_MIGRATION_TASK:
    beat_task_templates.append(
        {
            "name": "migrate-chunks-from-vespa-to-opensearch",
--- a/backend/onyx/background/celery/tasks/connector_deletion/tasks.py
+++ b/backend/onyx/background/celery/tasks/connector_deletion/tasks.py
@@ -59,6 +59,12 @@ from onyx.redis.redis_connector_delete import RedisConnectorDelete
 from onyx.redis.redis_connector_delete import RedisConnectorDeletePayload
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import get_redis_replica_client
+from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
+from onyx.server.metrics.deletion_metrics import inc_deletion_blocked
+from onyx.server.metrics.deletion_metrics import inc_deletion_completed
+from onyx.server.metrics.deletion_metrics import inc_deletion_fence_reset
+from onyx.server.metrics.deletion_metrics import inc_deletion_started
+from onyx.server.metrics.deletion_metrics import observe_deletion_taskset_duration
 from onyx.utils.variable_functionality import (
    fetch_versioned_implementation_with_fallback,
 )
@@ -102,7 +108,7 @@ def revoke_tasks_blocking_deletion(
                f"Revoked permissions sync task {permissions_sync_payload.celery_task_id}."
            )
    except Exception:
-        task_logger.exception("Exception while revoking pruning task")
+        task_logger.exception("Exception while revoking permissions sync task")

    try:
        prune_payload = redis_connector.prune.payload
@@ -110,7 +116,7 @@ def revoke_tasks_blocking_deletion(
            app.control.revoke(prune_payload.celery_task_id)
            task_logger.info(f"Revoked pruning task {prune_payload.celery_task_id}.")
    except Exception:
-        task_logger.exception("Exception while revoking permissions sync task")
+        task_logger.exception("Exception while revoking pruning task")

    try:
        external_group_sync_payload = redis_connector.external_group_sync.payload
@@ -160,12 +166,22 @@ def check_for_connector_deletion_task(self: Task, *, tenant_id: str) -> bool | N

            r.set(OnyxRedisSignals.BLOCK_VALIDATE_CONNECTOR_DELETION_FENCES, 1, ex=300)

-        # collect cc_pair_ids
+        # collect cc_pair_ids and note whether any are in DELETING status
        cc_pair_ids: list[int] = []
+        has_deleting_cc_pair = False
        with get_session_with_current_tenant() as db_session:
            cc_pairs = get_connector_credential_pairs(db_session)
            for cc_pair in cc_pairs:
                cc_pair_ids.append(cc_pair.id)
+                if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
+                    has_deleting_cc_pair = True
+
+        # Tenant-work-gating hook: mark only when at least one cc_pair is in
+        # DELETING status. Marking on bare cc_pair existence would keep
+        # nearly every tenant in the active set since most have cc_pairs
+        # but almost none are actively being deleted on any given cycle.
+        if has_deleting_cc_pair:
+            maybe_mark_tenant_active(tenant_id)

        # try running cleanup on the cc_pair_ids
        for cc_pair_id in cc_pair_ids:
@@ -300,6 +316,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
                recent_index_attempts
                and recent_index_attempts[0].status == IndexingStatus.IN_PROGRESS
            ):
+                inc_deletion_blocked(tenant_id, "indexing")
                raise TaskDependencyError(
                    "Connector deletion - Delayed (indexing in progress): "
                    f"cc_pair={cc_pair_id} "
@@ -307,11 +324,13 @@ def try_generate_document_cc_pair_cleanup_tasks(
                )

        if redis_connector.prune.fenced:
+            inc_deletion_blocked(tenant_id, "pruning")
            raise TaskDependencyError(
                f"Connector deletion - Delayed (pruning in progress): cc_pair={cc_pair_id}"
            )

        if redis_connector.permissions.fenced:
+            inc_deletion_blocked(tenant_id, "permissions")
            raise TaskDependencyError(
                f"Connector deletion - Delayed (permissions in progress): cc_pair={cc_pair_id}"
            )
@@ -359,6 +378,7 @@ def try_generate_document_cc_pair_cleanup_tasks(
        # set this only after all tasks have been added
        fence_payload.num_tasks = tasks_generated
        redis_connector.delete.set_fence(fence_payload)
+        inc_deletion_started(tenant_id)

    return tasks_generated

@@ -508,7 +528,11 @@ def monitor_connector_deletion_taskset(
                db_session=db_session,
                connector_id=connector_id_to_delete,
            )
-            if not connector or not len(connector.credentials):
+            if not connector:
+                task_logger.info(
+                    "Connector deletion - Connector already deleted, skipping connector cleanup"
+                )
+            elif not len(connector.credentials):
                task_logger.info(
                    "Connector deletion - Found no credentials left for connector, deleting connector"
                )
@@ -523,6 +547,12 @@ def monitor_connector_deletion_taskset(
                num_docs_synced=fence_data.num_tasks,
            )

+            duration = (
+                datetime.now(timezone.utc) - fence_data.submitted
+            ).total_seconds()
+            observe_deletion_taskset_duration(tenant_id, "success", duration)
+            inc_deletion_completed(tenant_id, "success")
+
        except Exception as e:
            db_session.rollback()
            stack_trace = traceback.format_exc()
@@ -541,6 +571,11 @@ def monitor_connector_deletion_taskset(
                f"Connector deletion exceptioned: "
                f"cc_pair={cc_pair_id} connector={connector_id_to_delete} credential={credential_id_to_delete}"
            )
+            duration = (
+                datetime.now(timezone.utc) - fence_data.submitted
+            ).total_seconds()
+            observe_deletion_taskset_duration(tenant_id, "failure", duration)
+            inc_deletion_completed(tenant_id, "failure")
            raise e

    task_logger.info(
@@ -717,5 +752,6 @@ def validate_connector_deletion_fence(
        f"fence={fence_key}"
    )

+    inc_deletion_fence_reset(tenant_id)
    redis_connector.delete.reset()
    return
--- a/backend/onyx/background/celery/tasks/docfetching/tasks.py
+++ b/backend/onyx/background/celery/tasks/docfetching/tasks.py
@@ -34,6 +34,7 @@ from onyx.db.index_attempt import mark_attempt_canceled
 from onyx.db.index_attempt import mark_attempt_failed
 from onyx.db.indexing_coordination import IndexingCoordination
 from onyx.redis.redis_connector import RedisConnector
+from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
 from onyx.utils.logger import setup_logger
 from onyx.utils.variable_functionality import global_version
 from shared_configs.configs import SENTRY_DSN
@@ -135,10 +136,13 @@ def _docfetching_task(
    # Since connector_indexing_proxy_task spawns a new process using this function as
    # the entrypoint, we init Sentry here.
    if SENTRY_DSN:
+        from onyx.configs.sentry import _add_instance_tags
+
        sentry_sdk.init(
            dsn=SENTRY_DSN,
            traces_sample_rate=0.1,
            release=__version__,
+            before_send=_add_instance_tags,
        )
        logger.info("Sentry initialized")
    else:
@@ -467,6 +471,15 @@ def docfetching_proxy_task(
                index_attempt.connector_credential_pair.connector.source.value
            )

+            cc_pair = index_attempt.connector_credential_pair
+            on_index_attempt_status_change(
+                tenant_id=tenant_id,
+                source=result.connector_source,
+                cc_pair_id=cc_pair_id,
+                connector_name=cc_pair.connector.name or f"cc_pair_{cc_pair_id}",
+                status="in_progress",
+            )
+
        while True:
            sleep(5)

--- a/backend/onyx/background/celery/tasks/docprocessing/tasks.py
+++ b/backend/onyx/background/celery/tasks/docprocessing/tasks.py
@@ -3,6 +3,7 @@ import os
 import time
 import traceback
 from collections import defaultdict
+from dataclasses import dataclass
 from datetime import datetime
 from datetime import timedelta
 from datetime import timezone
@@ -50,6 +51,7 @@ from onyx.configs.constants import AuthType
 from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
 from onyx.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
 from onyx.configs.constants import MilestoneRecordType
+from onyx.configs.constants import NotificationType
 from onyx.configs.constants import OnyxCeleryPriority
 from onyx.configs.constants import OnyxCeleryQueues
 from onyx.configs.constants import OnyxCeleryTask
@@ -85,6 +87,8 @@ from onyx.db.indexing_coordination import INDEXING_PROGRESS_TIMEOUT_HOURS
 from onyx.db.indexing_coordination import IndexingCoordination
 from onyx.db.models import IndexAttempt
 from onyx.db.models import SearchSettings
+from onyx.db.notification import create_notification
+from onyx.db.notification import get_notifications
 from onyx.db.search_settings import get_current_search_settings
 from onyx.db.search_settings import get_secondary_search_settings
 from onyx.db.swap_index import check_and_perform_index_swap
@@ -104,7 +108,11 @@ from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import get_redis_replica_client
 from onyx.redis.redis_pool import redis_lock_dump
 from onyx.redis.redis_pool import SCAN_ITER_COUNT_DEFAULT
+from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
 from onyx.redis.redis_utils import is_fence
+from onyx.server.metrics.connector_health_metrics import on_connector_error_state_change
+from onyx.server.metrics.connector_health_metrics import on_connector_indexing_success
+from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
 from onyx.server.runtime.onyx_runtime import OnyxRuntime
 from onyx.utils.logger import setup_logger
 from onyx.utils.middleware import make_randomized_onyx_request_id
@@ -400,7 +408,6 @@ def check_indexing_completion(
    tenant_id: str,
    task: Task,
 ) -> None:
-
    logger.info(
        f"Checking for indexing completion: attempt={index_attempt_id} tenant={tenant_id}"
    )
@@ -521,13 +528,25 @@ def check_indexing_completion(

        # Update CC pair status if successful
        cc_pair = get_connector_credential_pair_from_id(
-            db_session, attempt.connector_credential_pair_id
+            db_session,
+            attempt.connector_credential_pair_id,
+            eager_load_connector=True,
        )
        if cc_pair is None:
            raise RuntimeError(
                f"CC pair {attempt.connector_credential_pair_id} not found in database"
            )

+        source = cc_pair.connector.source.value
+        connector_name = cc_pair.connector.name or f"cc_pair_{cc_pair.id}"
+        on_index_attempt_status_change(
+            tenant_id=tenant_id,
+            source=source,
+            cc_pair_id=cc_pair.id,
+            connector_name=connector_name,
+            status=attempt.status.value,
+        )
+
        if attempt.status.is_successful():
            # NOTE: we define the last successful index time as the time the last successful
            # attempt finished. This is distinct from the poll_range_end of the last successful
@@ -548,10 +567,41 @@ def check_indexing_completion(
                event=MilestoneRecordType.CONNECTOR_SUCCEEDED,
            )

+            on_connector_indexing_success(
+                tenant_id=tenant_id,
+                source=source,
+                cc_pair_id=cc_pair.id,
+                connector_name=connector_name,
+                docs_indexed=attempt.new_docs_indexed or 0,
+                success_timestamp=attempt.time_updated.timestamp(),
+            )
+
            # Clear repeated error state on success
            if cc_pair.in_repeated_error_state:
                cc_pair.in_repeated_error_state = False
+
+                # Delete any existing error notification for this CC pair so a
+                # fresh one is created if the connector fails again later.
+                for notif in get_notifications(
+                    user=None,
+                    db_session=db_session,
+                    notif_type=NotificationType.CONNECTOR_REPEATED_ERRORS,
+                    include_dismissed=True,
+                ):
+                    if (
+                        notif.additional_data
+                        and notif.additional_data.get("cc_pair_id") == cc_pair.id
+                    ):
+                        db_session.delete(notif)
+
                db_session.commit()
+                on_connector_error_state_change(
+                    tenant_id=tenant_id,
+                    source=source,
+                    cc_pair_id=cc_pair.id,
+                    connector_name=connector_name,
+                    in_error=False,
+                )

            if attempt.status == IndexingStatus.SUCCESS:
                logger.info(
@@ -608,6 +658,27 @@ def active_indexing_attempt(
    return bool(active_indexing_attempt)


+@dataclass
+class _KickoffResult:
+    """Tracks diagnostic counts from a _kickoff_indexing_tasks run."""
+
+    created: int = 0
+    skipped_active: int = 0
+    skipped_not_found: int = 0
+    skipped_not_indexable: int = 0
+    failed_to_create: int = 0
+
+    @property
+    def evaluated(self) -> int:
+        return (
+            self.created
+            + self.skipped_active
+            + self.skipped_not_found
+            + self.skipped_not_indexable
+            + self.failed_to_create
+        )
+
+
 def _kickoff_indexing_tasks(
    celery_app: Celery,
    db_session: Session,
@@ -617,12 +688,12 @@ def _kickoff_indexing_tasks(
    redis_client: Redis,
    lock_beat: RedisLock,
    tenant_id: str,
-) -> int:
+) -> _KickoffResult:
    """Kick off indexing tasks for the given cc_pair_ids and search_settings.

-    Returns the number of tasks successfully created.
+    Returns a _KickoffResult with diagnostic counts.
    """
-    tasks_created = 0
+    result = _KickoffResult()

    for cc_pair_id in cc_pair_ids:
        lock_beat.reacquire()
@@ -633,6 +704,7 @@ def _kickoff_indexing_tasks(
            search_settings_id=search_settings.id,
            db_session=db_session,
        ):
+            result.skipped_active += 1
            continue

        cc_pair = get_connector_credential_pair_from_id(
@@ -643,6 +715,7 @@ def _kickoff_indexing_tasks(
            task_logger.warning(
                f"_kickoff_indexing_tasks - CC pair not found: cc_pair={cc_pair_id}"
            )
+            result.skipped_not_found += 1
            continue

        # Heavyweight check after fetching cc pair
@@ -657,6 +730,7 @@ def _kickoff_indexing_tasks(
                f"search_settings={search_settings.id}, "
                f"secondary_index_building={secondary_index_building}"
            )
+            result.skipped_not_indexable += 1
            continue

        task_logger.debug(
@@ -696,13 +770,14 @@ def _kickoff_indexing_tasks(
            task_logger.info(
                f"Connector indexing queued: index_attempt={attempt_id} cc_pair={cc_pair.id} search_settings={search_settings.id}"
            )
-            tasks_created += 1
+            result.created += 1
        else:
            task_logger.error(
                f"Failed to create indexing task: cc_pair={cc_pair.id} search_settings={search_settings.id}"
            )
+            result.failed_to_create += 1

-    return tasks_created
+    return result


@shared_task(
@@ -728,13 +803,15 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
    task_logger.warning("check_for_indexing - Starting")

    tasks_created = 0
+    primary_result = _KickoffResult()
+    secondary_result: _KickoffResult | None = None
    locked = False
    redis_client = get_redis_client()
    redis_client_replica = get_redis_replica_client()

    # we need to use celery's redis client to access its redis data
    # (which lives on a different db number)
-    # redis_client_celery: Redis = self.app.broker_connection().channel().client  # type: ignore
+    # redis_client_celery: Redis = self.app.broker_connection().channel().client

    lock_beat: RedisLock = redis_client.lock(
        OnyxRedisLocks.CHECK_INDEXING_BEAT_LOCK,
@@ -848,6 +925,43 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
                        cc_pair_id=cc_pair_id,
                        in_repeated_error_state=True,
                    )
+                    error_connector_name = (
+                        cc_pair.connector.name or f"cc_pair_{cc_pair.id}"
+                    )
+                    on_connector_error_state_change(
+                        tenant_id=tenant_id,
+                        source=cc_pair.connector.source.value,
+                        cc_pair_id=cc_pair_id,
+                        connector_name=error_connector_name,
+                        in_error=True,
+                    )
+
+                    connector_name = (
+                        cc_pair.name
+                        or cc_pair.connector.name
+                        or f"CC pair {cc_pair.id}"
+                    )
+                    source = cc_pair.connector.source.value
+                    connector_url = f"/admin/connector/{cc_pair.id}"
+                    create_notification(
+                        user_id=None,
+                        notif_type=NotificationType.CONNECTOR_REPEATED_ERRORS,
+                        db_session=db_session,
+                        title=f"Connector '{connector_name}' has entered repeated error state",
+                        description=(
+                            f"The {source} connector has failed repeatedly and "
+                            f"has been flagged. View indexing history in the "
+                            f"Advanced section: {connector_url}"
+                        ),
+                        additional_data={"cc_pair_id": cc_pair.id},
+                    )
+
+                    task_logger.error(
+                        f"Connector entered repeated error state: "
+                        f"cc_pair={cc_pair.id} "
+                        f"connector={cc_pair.connector.name} "
+                        f"source={source}"
+                    )
                    # When entering repeated error state, also pause the connector
                    # to prevent continued indexing retry attempts burning through embedding credits.
                    # NOTE: only for Cloud, since most self-hosted users use self-hosted embedding
@@ -863,7 +977,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
        # Heavy check, should_index(), is called in _kickoff_indexing_tasks
        with get_session_with_current_tenant() as db_session:
            # Primary first
-            tasks_created += _kickoff_indexing_tasks(
+            primary_result = _kickoff_indexing_tasks(
                celery_app=self.app,
                db_session=db_session,
                search_settings=current_search_settings,
@@ -873,6 +987,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
                lock_beat=lock_beat,
                tenant_id=tenant_id,
            )
+            tasks_created += primary_result.created

            # Secondary indexing (only if secondary search settings exist and switchover_type is not INSTANT)
            if (
@@ -880,7 +995,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
                and secondary_search_settings.switchover_type != SwitchoverType.INSTANT
                and secondary_cc_pair_ids
            ):
-                tasks_created += _kickoff_indexing_tasks(
+                secondary_result = _kickoff_indexing_tasks(
                    celery_app=self.app,
                    db_session=db_session,
                    search_settings=secondary_search_settings,
@@ -890,6 +1005,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
                    lock_beat=lock_beat,
                    tenant_id=tenant_id,
                )
+                tasks_created += secondary_result.created
            elif (
                secondary_search_settings
                and secondary_search_settings.switchover_type == SwitchoverType.INSTANT
@@ -898,6 +1014,14 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
                    f"Skipping secondary indexing: switchover_type=INSTANT for search_settings={secondary_search_settings.id}"
                )

+        # Tenant-work-gating hook: refresh membership only when indexing
+        # actually dispatched at least one docfetching task. `_kickoff_indexing_tasks`
+        # internally calls `should_index()` to decide per-cc_pair; using
+        # `tasks_created > 0` here gives us a "real work was done" signal
+        # rather than just "tenant has a cc_pair somewhere."
+        if tasks_created > 0:
+            maybe_mark_tenant_active(tenant_id)
+
        # 2/3: VALIDATE
        # Check for inconsistent index attempts - active attempts without task IDs
        # This can happen if attempt creation fails partway through
@@ -1002,7 +1126,26 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
                redis_lock_dump(lock_beat, redis_client)

    time_elapsed = time.monotonic() - time_start
-    task_logger.info(f"check_for_indexing finished: elapsed={time_elapsed:.2f}")
+    task_logger.info(
+        f"check_for_indexing finished: "
+        f"elapsed={time_elapsed:.2f}s "
+        f"primary=[evaluated={primary_result.evaluated} "
+        f"created={primary_result.created} "
+        f"skipped_active={primary_result.skipped_active} "
+        f"skipped_not_found={primary_result.skipped_not_found} "
+        f"skipped_not_indexable={primary_result.skipped_not_indexable} "
+        f"failed={primary_result.failed_to_create}]"
+        + (
+            f" secondary=[evaluated={secondary_result.evaluated} "
+            f"created={secondary_result.created} "
+            f"skipped_active={secondary_result.skipped_active} "
+            f"skipped_not_found={secondary_result.skipped_not_found} "
+            f"skipped_not_indexable={secondary_result.skipped_not_indexable} "
+            f"failed={secondary_result.failed_to_create}]"
+            if secondary_result
+            else ""
+        )
+    )
    return tasks_created


--- a/backend/onyx/background/celery/tasks/opensearch_migration/tasks.py
+++ b/backend/onyx/background/celery/tasks/opensearch_migration/tasks.py
@@ -172,6 +172,10 @@ def migrate_chunks_from_vespa_to_opensearch_task(
            search_settings = get_current_search_settings(db_session)
            indexing_setting = IndexingSetting.from_db_model(search_settings)

+            task_logger.debug(
+                "Verified tenant info, migration record, and search settings."
+            )
+
            # 2.e. Build sanitized to original doc ID mapping to check for
            # conflicts in the event we sanitize a doc ID to an
            # already-existing doc ID.
@@ -325,6 +329,7 @@ def migrate_chunks_from_vespa_to_opensearch_task(
    finally:
        if lock.owned():
            lock.release()
+            task_logger.debug("Released the OpenSearch migration lock.")
        else:
            task_logger.warning(
                "The OpenSearch migration lock was not owned on completion of the migration task."
--- a/backend/onyx/background/celery/tasks/pruning/tasks.py
+++ b/backend/onyx/background/celery/tasks/pruning/tasks.py
@@ -38,6 +38,7 @@ from onyx.configs.constants import OnyxRedisConstants
 from onyx.configs.constants import OnyxRedisLocks
 from onyx.configs.constants import OnyxRedisSignals
 from onyx.connectors.factory import instantiate_connector
+from onyx.connectors.interfaces import BaseConnector
 from onyx.connectors.models import InputType
 from onyx.db.connector import mark_ccpair_as_pruned
 from onyx.db.connector_credential_pair import get_connector_credential_pair
@@ -50,7 +51,6 @@ from onyx.db.enums import ConnectorCredentialPairStatus
 from onyx.db.enums import SyncStatus
 from onyx.db.enums import SyncType
 from onyx.db.hierarchy import delete_orphaned_hierarchy_nodes
-from onyx.db.hierarchy import link_hierarchy_nodes_to_documents
 from onyx.db.hierarchy import remove_stale_hierarchy_node_cc_pair_entries
 from onyx.db.hierarchy import reparent_orphaned_hierarchy_nodes
 from onyx.db.hierarchy import update_document_parent_hierarchy_nodes
@@ -72,6 +72,7 @@ from onyx.redis.redis_hierarchy import get_source_node_id_from_cache
 from onyx.redis.redis_hierarchy import HierarchyNodeCacheEntry
 from onyx.redis.redis_pool import get_redis_client
 from onyx.redis.redis_pool import get_redis_replica_client
+from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
 from onyx.server.metrics.pruning_metrics import observe_pruning_diff_duration
 from onyx.server.runtime.onyx_runtime import OnyxRuntime
 from onyx.server.utils import make_short_id
@@ -228,6 +229,7 @@ def check_for_pruning(self: Task, *, tenant_id: str) -> bool | None:
                for cc_pair_entry in cc_pairs:
                    cc_pair_ids.append(cc_pair_entry.id)

+            prune_dispatched = False
            for cc_pair_id in cc_pair_ids:
                lock_beat.reacquire()
                with get_session_with_current_tenant() as db_session:
@@ -250,9 +252,18 @@ def check_for_pruning(self: Task, *, tenant_id: str) -> bool | None:
                        logger.info(f"Pruning not created: {cc_pair_id}")
                        continue

+                    prune_dispatched = True
                    task_logger.info(
                        f"Pruning queued: cc_pair={cc_pair.id} id={payload_id}"
                    )
+
+            # Tenant-work-gating hook: mark only when at least one cc_pair
+            # was actually due for pruning AND a prune task was dispatched.
+            # Marking on bare cc_pair existence over-counts the population
+            # since most tenants have cc_pairs but almost none are due on
+            # any given cycle.
+            if prune_dispatched:
+                maybe_mark_tenant_active(tenant_id)
            r.set(OnyxRedisSignals.BLOCK_PRUNING, 1, ex=_get_pruning_block_expiration())

        # we want to run this less frequently than the overall task
@@ -525,6 +536,14 @@ def connector_pruning_generator_task(
        return None

    try:
+        # Session 1: pre-enumeration — load cc_pair and instantiate the connector.
+        # The session is closed before enumeration so the DB connection is not held
+        # open during the 10–30+ minute connector crawl.
+        connector_source: DocumentSource | None = None
+        connector_type: str = ""
+        is_connector_public: bool = False
+        runnable_connector: BaseConnector | None = None
+
        with get_session_with_current_tenant() as db_session:
            cc_pair = get_connector_credential_pair(
                db_session=db_session,
@@ -550,49 +569,51 @@ def connector_pruning_generator_task(
            )
            redis_connector.prune.set_fence(new_payload)

+            connector_source = cc_pair.connector.source
+            connector_type = connector_source.value
+            is_connector_public = cc_pair.access_type == AccessType.PUBLIC
+
            task_logger.info(
-                f"Pruning generator running connector: cc_pair={cc_pair_id} connector_source={cc_pair.connector.source}"
+                f"Pruning generator running connector: cc_pair={cc_pair_id} connector_source={connector_source}"
            )

            runnable_connector = instantiate_connector(
                db_session,
-                cc_pair.connector.source,
+                connector_source,
                InputType.SLIM_RETRIEVAL,
                cc_pair.connector.connector_specific_config,
                cc_pair.credential,
            )
+        # Session 1 closed here — connection released before enumeration.

-            callback = PruneCallback(
-                0,
-                redis_connector,
-                lock,
-                r,
-                timeout_seconds=JOB_TIMEOUT,
-            )
+        callback = PruneCallback(
+            0,
+            redis_connector,
+            lock,
+            r,
+            timeout_seconds=JOB_TIMEOUT,
+        )

-            # Extract docs and hierarchy nodes from the source
-            connector_type = cc_pair.connector.source.value
-            extraction_result = extract_ids_from_runnable_connector(
-                runnable_connector, callback, connector_type=connector_type
-            )
-            all_connector_doc_ids = extraction_result.raw_id_to_parent
+        # Extract docs and hierarchy nodes from the source (no DB session held).
+        extraction_result = extract_ids_from_runnable_connector(
+            runnable_connector, callback, connector_type=connector_type
+        )
+        all_connector_doc_ids = extraction_result.raw_id_to_parent

-            # Process hierarchy nodes (same as docfetching):
-            # upsert to Postgres and cache in Redis
-            source = cc_pair.connector.source
+        # Session 2: post-enumeration — hierarchy upserts, diff computation, task dispatch.
+        with get_session_with_current_tenant() as db_session:
+            source = connector_source
            redis_client = get_redis_client(tenant_id=tenant_id)

            ensure_source_node_exists(redis_client, db_session, source)

            upserted_nodes: list[DBHierarchyNode] = []
            if extraction_result.hierarchy_nodes:
-                is_connector_public = cc_pair.access_type == AccessType.PUBLIC
-
                upserted_nodes = upsert_hierarchy_nodes_batch(
                    db_session=db_session,
                    nodes=extraction_result.hierarchy_nodes,
                    source=source,
-                    commit=True,
+                    commit=False,
                    is_connector_public=is_connector_public,
                )

@@ -601,9 +622,13 @@ def connector_pruning_generator_task(
                    hierarchy_node_ids=[n.id for n in upserted_nodes],
                    connector_id=connector_id,
                    credential_id=credential_id,
-                    commit=True,
+                    commit=False,
                )

+                # Single commit so the FK reference in the join table can never
+                # outrun the parent hierarchy_node insert.
+                db_session.commit()
+
                cache_entries = [
                    HierarchyNodeCacheEntry.from_db_model(node)
                    for node in upserted_nodes
@@ -628,16 +653,6 @@ def connector_pruning_generator_task(
                raw_id_to_parent=all_connector_doc_ids,
            )

-            # Link hierarchy nodes to documents for sources where pages can be
-            # both hierarchy nodes AND documents (e.g. Notion, Confluence)
-            all_doc_id_list = list(all_connector_doc_ids.keys())
-            link_hierarchy_nodes_to_documents(
-                db_session=db_session,
-                document_ids=all_doc_id_list,
-                source=source,
-                commit=True,
-            )
-
            diff_start = time.monotonic()
            try:
                # a list of docs in our local index
@@ -658,7 +673,7 @@ def connector_pruning_generator_task(
                task_logger.info(
                    "Pruning set collected: "
                    f"cc_pair={cc_pair_id} "
-                    f"connector_source={cc_pair.connector.source} "
+                    f"connector_source={connector_source} "
                    f"docs_to_remove={len(doc_ids_to_remove)}"
                )

--- a/backend/onyx/background/celery/tasks/shared/tasks.py
+++ b/backend/onyx/background/celery/tasks/shared/tasks.py
@@ -248,6 +248,7 @@ def document_by_cc_pair_cleanup_task(
                        ),
                    )
                    mark_document_as_modified(document_id, db_session)
+                    db_session.commit()
                completion_status = (
                    OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
                )
--- a/backend/onyx/background/celery/tasks/vespa/document_sync.py
+++ b/backend/onyx/background/celery/tasks/vespa/document_sync.py
@@ -15,6 +15,7 @@ from onyx.configs.constants import OnyxCeleryTask
 from onyx.configs.constants import OnyxRedisConstants
 from onyx.db.document import construct_document_id_select_by_needs_sync
 from onyx.db.document import count_documents_by_needs_sync
+from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
 from onyx.utils.logger import setup_logger

 # Redis keys for document sync tracking
@@ -150,6 +151,10 @@ def try_generate_stale_document_sync_tasks(
        logger.info("No stale documents found. Skipping sync tasks generation.")
        return None

+    # Tenant-work-gating hook: refresh this tenant's active-set membership
+    # whenever vespa sync actually has stale docs to dispatch.
+    maybe_mark_tenant_active(tenant_id)
+
    logger.info(
        f"Stale documents found (at least {stale_doc_count}). Generating sync tasks in one batch."
    )
--- a/backend/onyx/background/indexing/checkpointing_utils.py
+++ b/backend/onyx/background/indexing/checkpointing_utils.py
@@ -61,7 +61,9 @@ def load_checkpoint(
    checkpoint_io = file_store.read_file(checkpoint_pointer, mode="rb")
    checkpoint_data = checkpoint_io.read().decode("utf-8")
    if isinstance(connector, CheckpointedConnector):
-        return connector.validate_checkpoint_json(checkpoint_data)
+        return connector.validate_checkpoint_json(  # ty: ignore[invalid-return-type]
+            checkpoint_data
+        )
    return ConnectorCheckpoint.model_validate_json(checkpoint_data)


--- a/backend/onyx/background/indexing/models.py
+++ b/backend/onyx/background/indexing/models.py
@@ -23,6 +23,8 @@ class IndexAttemptErrorPydantic(BaseModel):

    index_attempt_id: int

+    error_type: str | None = None
+
    @classmethod
    def from_model(cls, model: IndexAttemptError) -> "IndexAttemptErrorPydantic":
        return cls(
@@ -37,4 +39,5 @@ class IndexAttemptErrorPydantic(BaseModel):
            is_resolved=model.is_resolved,
            time_created=model.time_created,
            index_attempt_id=model.index_attempt_id,
+            error_type=model.error_type,
        )
--- a/backend/onyx/background/indexing/run_docfetching.py
+++ b/backend/onyx/background/indexing/run_docfetching.py
@@ -5,6 +5,7 @@ from datetime import datetime
 from datetime import timedelta
 from datetime import timezone

+import sentry_sdk
 from celery import Celery
 from sqlalchemy.orm import Session

@@ -556,6 +557,27 @@ def connector_document_extraction(

                # save record of any failures at the connector level
                if failure is not None:
+                    if failure.exception is not None:
+                        with sentry_sdk.new_scope() as scope:
+                            scope.set_tag("stage", "connector_fetch")
+                            scope.set_tag("connector_source", db_connector.source.value)
+                            scope.set_tag("cc_pair_id", str(cc_pair_id))
+                            scope.set_tag("index_attempt_id", str(index_attempt_id))
+                            scope.set_tag("tenant_id", tenant_id)
+                            if failure.failed_document:
+                                scope.set_tag(
+                                    "doc_id", failure.failed_document.document_id
+                                )
+                            if failure.failed_entity:
+                                scope.set_tag(
+                                    "entity_id", failure.failed_entity.entity_id
+                                )
+                            scope.fingerprint = [
+                                "connector-fetch-failure",
+                                db_connector.source.value,
+                                type(failure.exception).__name__,
+                            ]
+                            sentry_sdk.capture_exception(failure.exception)
                    total_failures += 1
                    with get_session_with_current_tenant() as db_session:
                        create_index_attempt_error(
--- a/backend/onyx/chat/chat_utils.py
+++ b/backend/onyx/chat/chat_utils.py
@@ -364,7 +364,7 @@ def _get_or_extract_plaintext(
        plaintext_io = file_store.read_file(plaintext_key, mode="b")
        return plaintext_io.read().decode("utf-8")
    except Exception:
-        logger.exception(f"Error when reading file, id={file_id}")
+        logger.info(f"Cache miss for file with id={file_id}")

    # Cache miss — extract and store.
    content_text = extract_fn()
--- a/backend/onyx/chat/llm_loop.py
+++ b/backend/onyx/chat/llm_loop.py
@@ -4,8 +4,6 @@ from collections.abc import Callable
 from typing import Any
 from typing import Literal

-from sqlalchemy.orm import Session
-
 from onyx.chat.chat_state import ChatStateContainer
 from onyx.chat.chat_utils import create_tool_call_failure_messages
 from onyx.chat.citation_processor import CitationMapping
@@ -635,7 +633,6 @@ def run_llm_loop(
    user_memory_context: UserMemoryContext | None,
    llm: LLM,
    token_counter: Callable[[str], int],
-    db_session: Session,
    forced_tool_id: int | None = None,
    user_identity: LLMUserIdentity | None = None,
    chat_session_id: str | None = None,
@@ -1020,20 +1017,16 @@ def run_llm_loop(
                    persisted_memory_id: int | None = None
                    if user_memory_context and user_memory_context.user_id:
                        if tool_response.rich_response.index_to_replace is not None:
-                            memory = update_memory_at_index(
+                            persisted_memory_id = update_memory_at_index(
                                user_id=user_memory_context.user_id,
                                index=tool_response.rich_response.index_to_replace,
                                new_text=tool_response.rich_response.memory_text,
-                                db_session=db_session,
                            )
-                            persisted_memory_id = memory.id if memory else None
                        else:
-                            memory = add_memory(
+                            persisted_memory_id = add_memory(
                                user_id=user_memory_context.user_id,
                                memory_text=tool_response.rich_response.memory_text,
-                                db_session=db_session,
                            )
-                            persisted_memory_id = memory.id
                    operation: Literal["add", "update"] = (
                        "update"
                        if tool_response.rich_response.index_to_replace is not None
@@ -1171,7 +1164,10 @@ def run_llm_loop(

        emitter.emit(
            Packet(
-                placement=Placement(turn_index=llm_cycle_count + reasoning_cycles),
+                placement=Placement(
+                    turn_index=llm_cycle_count  # ty: ignore[possibly-unresolved-reference]
+                    + reasoning_cycles
+                ),
                obj=OverallStop(type="stop"),
            )
        )
--- a/backend/onyx/chat/llm_step.py
+++ b/backend/onyx/chat/llm_step.py
@@ -826,6 +826,12 @@ def translate_history_to_llm_format(
                            base64_data = img_file.to_base64()
                            image_url = f"data:{image_type};base64,{base64_data}"

+                            content_parts.append(
+                                TextContentPart(
+                                    type="text",
+                                    text=f"[attached image — file_id: {img_file.file_id}]",
+                                )
+                            )
                            image_part = ImageContentPart(
                                type="image_url",
                                image_url=ImageUrlDetail(
--- a/backend/onyx/chat/process_message.py
+++ b/backend/onyx/chat/process_message.py
@@ -67,7 +67,6 @@ from onyx.db.chat import get_chat_session_by_id
 from onyx.db.chat import get_or_create_root_message
 from onyx.db.chat import reserve_message_id
 from onyx.db.chat import reserve_multi_model_message_ids
-from onyx.db.engine.sql_engine import get_session_with_current_tenant
 from onyx.db.enums import HookPoint
 from onyx.db.memory import get_memories
 from onyx.db.models import ChatMessage
@@ -1006,93 +1005,86 @@ def _run_models(
        model_llm = setup.llms[model_idx]

        try:
-            # Each worker opens its own session — SQLAlchemy sessions are not thread-safe.
-            # Do NOT write to the outer db_session (or any shared DB state) from here;
-            # all DB writes in this thread must go through thread_db_session.
-            with get_session_with_current_tenant() as thread_db_session:
-                thread_tool_dict = construct_tools(
-                    persona=setup.persona,
-                    db_session=thread_db_session,
-                    emitter=model_emitter,
-                    user=user,
-                    llm=model_llm,
-                    search_tool_config=SearchToolConfig(
-                        user_selected_filters=setup.new_msg_req.internal_search_filters,
-                        project_id_filter=setup.search_params.project_id_filter,
-                        persona_id_filter=setup.search_params.persona_id_filter,
-                        bypass_acl=setup.bypass_acl,
-                        slack_context=setup.slack_context,
-                        enable_slack_search=_should_enable_slack_search(
-                            setup.persona, setup.new_msg_req.internal_search_filters
-                        ),
+            # Each function opens short-lived DB sessions on demand.
+            # Do NOT pass a long-lived session here — it would hold a
+            # connection for the entire LLM loop (minutes), and cloud
+            # infrastructure may drop idle connections.
+            thread_tool_dict = construct_tools(
+                persona=setup.persona,
+                emitter=model_emitter,
+                user=user,
+                llm=model_llm,
+                search_tool_config=SearchToolConfig(
+                    user_selected_filters=setup.new_msg_req.internal_search_filters,
+                    project_id_filter=setup.search_params.project_id_filter,
+                    persona_id_filter=setup.search_params.persona_id_filter,
+                    bypass_acl=setup.bypass_acl,
+                    slack_context=setup.slack_context,
+                    enable_slack_search=_should_enable_slack_search(
+                        setup.persona, setup.new_msg_req.internal_search_filters
                    ),
-                    custom_tool_config=CustomToolConfig(
-                        chat_session_id=setup.chat_session.id,
-                        message_id=setup.user_message.id,
-                        additional_headers=setup.custom_tool_additional_headers,
-                        mcp_headers=setup.mcp_headers,
-                    ),
-                    file_reader_tool_config=FileReaderToolConfig(
-                        user_file_ids=setup.available_files.user_file_ids,
-                        chat_file_ids=setup.available_files.chat_file_ids,
-                    ),
-                    allowed_tool_ids=setup.new_msg_req.allowed_tool_ids,
-                    search_usage_forcing_setting=setup.search_params.search_usage,
+                ),
+                custom_tool_config=CustomToolConfig(
+                    chat_session_id=setup.chat_session.id,
+                    message_id=setup.user_message.id,
+                    additional_headers=setup.custom_tool_additional_headers,
+                    mcp_headers=setup.mcp_headers,
+                ),
+                file_reader_tool_config=FileReaderToolConfig(
+                    user_file_ids=setup.available_files.user_file_ids,
+                    chat_file_ids=setup.available_files.chat_file_ids,
+                ),
+                allowed_tool_ids=setup.new_msg_req.allowed_tool_ids,
+                search_usage_forcing_setting=setup.search_params.search_usage,
+            )
+            model_tools = [
+                tool for tool_list in thread_tool_dict.values() for tool in tool_list
+            ]
+
+            if setup.forced_tool_id and setup.forced_tool_id not in {
+                tool.id for tool in model_tools
+            }:
+                raise ValueError(
+                    f"Forced tool {setup.forced_tool_id} not found in tools"
                )
-                model_tools = [
-                    tool
-                    for tool_list in thread_tool_dict.values()
-                    for tool in tool_list
-                ]

-                if setup.forced_tool_id and setup.forced_tool_id not in {
-                    tool.id for tool in model_tools
-                }:
-                    raise ValueError(
-                        f"Forced tool {setup.forced_tool_id} not found in tools"
-                    )
-
-                # Per-thread copy: run_llm_loop mutates simple_chat_history in-place.
-                if n_models == 1 and setup.new_msg_req.deep_research:
-                    if setup.chat_session.project_id:
-                        raise RuntimeError(
-                            "Deep research is not supported for projects"
-                        )
-                    run_deep_research_llm_loop(
-                        emitter=model_emitter,
-                        state_container=sc,
-                        simple_chat_history=list(setup.simple_chat_history),
-                        tools=model_tools,
-                        custom_agent_prompt=setup.custom_agent_prompt,
-                        llm=model_llm,
-                        token_counter=get_llm_token_counter(model_llm),
-                        db_session=thread_db_session,
-                        skip_clarification=setup.skip_clarification,
-                        user_identity=setup.user_identity,
-                        chat_session_id=str(setup.chat_session.id),
-                        all_injected_file_metadata=setup.all_injected_file_metadata,
-                    )
-                else:
-                    run_llm_loop(
-                        emitter=model_emitter,
-                        state_container=sc,
-                        simple_chat_history=list(setup.simple_chat_history),
-                        tools=model_tools,
-                        custom_agent_prompt=setup.custom_agent_prompt,
-                        context_files=setup.extracted_context_files,
-                        persona=setup.persona,
-                        user_memory_context=setup.user_memory_context,
-                        llm=model_llm,
-                        token_counter=get_llm_token_counter(model_llm),
-                        db_session=thread_db_session,
-                        forced_tool_id=setup.forced_tool_id,
-                        user_identity=setup.user_identity,
-                        chat_session_id=str(setup.chat_session.id),
-                        chat_files=setup.chat_files_for_tools,
-                        include_citations=setup.new_msg_req.include_citations,
-                        all_injected_file_metadata=setup.all_injected_file_metadata,
-                        inject_memories_in_prompt=user.use_memories,
-                    )
+            # Per-thread copy: run_llm_loop mutates simple_chat_history in-place.
+            if n_models == 1 and setup.new_msg_req.deep_research:
+                if setup.chat_session.project_id:
+                    raise RuntimeError("Deep research is not supported for projects")
+                run_deep_research_llm_loop(
+                    emitter=model_emitter,
+                    state_container=sc,
+                    simple_chat_history=list(setup.simple_chat_history),
+                    tools=model_tools,
+                    custom_agent_prompt=setup.custom_agent_prompt,
+                    llm=model_llm,
+                    token_counter=get_llm_token_counter(model_llm),
+                    skip_clarification=setup.skip_clarification,
+                    user_identity=setup.user_identity,
+                    chat_session_id=str(setup.chat_session.id),
+                    all_injected_file_metadata=setup.all_injected_file_metadata,
+                )
+            else:
+                run_llm_loop(
+                    emitter=model_emitter,
+                    state_container=sc,
+                    simple_chat_history=list(setup.simple_chat_history),
+                    tools=model_tools,
+                    custom_agent_prompt=setup.custom_agent_prompt,
+                    context_files=setup.extracted_context_files,
+                    persona=setup.persona,
+                    user_memory_context=setup.user_memory_context,
+                    llm=model_llm,
+                    token_counter=get_llm_token_counter(model_llm),
+                    forced_tool_id=setup.forced_tool_id,
+                    user_identity=setup.user_identity,
+                    chat_session_id=str(setup.chat_session.id),
+                    chat_files=setup.chat_files_for_tools,
+                    include_citations=setup.new_msg_req.include_citations,
+                    all_injected_file_metadata=setup.all_injected_file_metadata,
+                    inject_memories_in_prompt=user.use_memories,
+                )

            model_succeeded[model_idx] = True

--- a/backend/onyx/configs/app_configs.py
+++ b/backend/onyx/configs/app_configs.py
@@ -324,6 +324,9 @@ ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX = (
    ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
    and os.environ.get("ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX", "").lower() == "true"
 )
+DISABLE_OPENSEARCH_MIGRATION_TASK = (
+    os.environ.get("DISABLE_OPENSEARCH_MIGRATION_TASK", "").lower() == "true"
+)
 # Whether we should check for and create an index if necessary every time we
 # instantiate an OpenSearchDocumentIndex on multitenant cloud. Defaults to True.
 VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
@@ -1125,6 +1128,32 @@ DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB = 20
 # Number of pre-provisioned tenants to maintain
 TARGET_AVAILABLE_TENANTS = int(os.environ.get("TARGET_AVAILABLE_TENANTS", "5"))

+# Master switch for the tenant work-gating feature. Controls the `enabled`
+# axis only — flipping this True puts the feature in shadow mode (compute
+# the gate, log skip counts, but do not actually skip). The `enforce` axis
+# is Redis-only with a hard-coded default of False, so this env flag alone
+# cannot cause real tenants to be skipped. Default off.
+ENABLE_TENANT_WORK_GATING = (
+    os.environ.get("ENABLE_TENANT_WORK_GATING", "").lower() == "true"
+)
+
+# Membership TTL for the `active_tenants` sorted set. Members older than this
+# are treated as inactive by the gate read path. Must be > the full-fanout
+# interval so self-healing re-adds a genuinely-working tenant before their
+# membership expires. Default 30 min.
+TENANT_WORK_GATING_TTL_SECONDS = int(
+    os.environ.get("TENANT_WORK_GATING_TTL_SECONDS", 30 * 60)
+)
+
+# Minimum wall-clock interval between full-fanout cycles. When this many
+# seconds have elapsed since the last bypass, the generator ignores the gate
+# on the next invocation and dispatches to every non-gated tenant, letting
+# consumers re-populate the active set. Schedule-independent so beat drift
+# or backlog can't make the self-heal bursty or sparse. Default 20 min.
+TENANT_WORK_GATING_FULL_FANOUT_INTERVAL_SECONDS = int(
+    os.environ.get("TENANT_WORK_GATING_FULL_FANOUT_INTERVAL_SECONDS", 20 * 60)
+)
+

 # Image summarization configuration
 IMAGE_SUMMARIZATION_SYSTEM_PROMPT = os.environ.get(
--- a/backend/onyx/configs/constants.py
+++ b/backend/onyx/configs/constants.py
@@ -283,6 +283,7 @@ class NotificationType(str, Enum):
    RELEASE_NOTES = "release_notes"
    ASSISTANT_FILES_READY = "assistant_files_ready"
    FEATURE_ANNOUNCEMENT = "feature_announcement"
+    CONNECTOR_REPEATED_ERRORS = "connector_repeated_errors"


 class BlobType(str, Enum):
@@ -638,9 +639,11 @@ REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
 REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3

 if platform.system() == "Darwin":
-    REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPALIVE] = 60  # type: ignore[attr-defined,unused-ignore]
+    REDIS_SOCKET_KEEPALIVE_OPTIONS[
+        socket.TCP_KEEPALIVE  # ty: ignore[unresolved-attribute]
+    ] = 60
 else:
-    REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPIDLE] = 60  # type: ignore[attr-defined,unused-ignore]
+    REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPIDLE] = 60


 class OnyxCallTypes(str, Enum):
--- a/backend/onyx/configs/sentry.py
+++ b/backend/onyx/configs/sentry.py
@@ -0,0 +1,48 @@
+from typing import Any
+
+from sentry_sdk.types import Event
+
+from onyx.utils.logger import setup_logger
+
+logger = setup_logger()
+
+_instance_id_resolved = False
+
+
+def _add_instance_tags(
+    event: Event,
+    hint: dict[str, Any],  # noqa: ARG001
+) -> Event | None:
+    """Sentry before_send hook that lazily attaches instance identification tags.
+
+    On the first event, resolves the instance UUID from the KV store (requires DB)
+    and sets it as a global Sentry tag. Subsequent events pick it up automatically.
+    """
+    global _instance_id_resolved
+
+    if _instance_id_resolved:
+        return event
+
+    try:
+        import sentry_sdk
+
+        from shared_configs.configs import MULTI_TENANT
+
+        if MULTI_TENANT:
+            instance_id = "multi-tenant-cloud"
+        else:
+            from onyx.utils.telemetry import get_or_generate_uuid
+
+            instance_id = get_or_generate_uuid()
+
+        sentry_sdk.set_tag("instance_id", instance_id)
+
+        # Also set on this event since set_tag won't retroactively apply
+        event.setdefault("tags", {})["instance_id"] = instance_id
+
+        # Only mark resolved after success — if DB wasn't ready, retry next event
+        _instance_id_resolved = True
+    except Exception:
+        logger.debug("Failed to resolve instance_id for Sentry tagging")
+
+    return event
--- a/Show More
+++ b/Show More