mirror of
https://github.com/onyx-dot-app/onyx.git
synced 2026-04-18 07:56:48 +00:00
Compare commits
11 Commits
bo/pruning
...
argus
| Author | SHA1 | Date | |
|---|---|---|---|
|
|
4feca94f08 | ||
|
|
e9538c2a8f | ||
|
|
bc39465a5c | ||
|
|
48463a353d | ||
|
|
0c61b3bb97 | ||
|
|
0948f58fa0 | ||
|
|
ac448cf3c5 | ||
|
|
f7771847fb | ||
|
|
c61adc6560 | ||
|
|
4b0cb5b9c3 | ||
|
|
21293f6621 |
@@ -1,9 +1,7 @@
|
||||
FROM ubuntu:26.04@sha256:cc925e589b7543b910fea57a240468940003fbfc0515245a495dd0ad8fe7cef1
|
||||
|
||||
RUN apt-get update && apt-get install -y --no-install-recommends \
|
||||
build-essential \
|
||||
curl \
|
||||
default-jre \
|
||||
fd-find \
|
||||
fzf \
|
||||
git \
|
||||
@@ -62,11 +60,3 @@ RUN chsh -s /bin/zsh root && \
|
||||
echo '[ -f /workspace/.devcontainer/zshrc ] && . /workspace/.devcontainer/zshrc' >> "$rc"; \
|
||||
done && \
|
||||
chown dev:dev /home/dev/.zshrc
|
||||
|
||||
# Pre-seed GitHub's SSH host keys so git-over-SSH never prompts. Keys are
|
||||
# pinned in-repo (verified against the fingerprints GitHub publishes at
|
||||
# https://docs.github.com/en/authentication/keeping-your-account-and-data-secure/githubs-ssh-key-fingerprints)
|
||||
# rather than fetched at build time, so a compromised build-time network can't
|
||||
# inject a rogue key.
|
||||
COPY github_known_hosts /etc/ssh/ssh_known_hosts
|
||||
RUN chmod 644 /etc/ssh/ssh_known_hosts
|
||||
|
||||
@@ -1,11 +1,7 @@
|
||||
{
|
||||
"name": "Onyx Dev Sandbox",
|
||||
"image": "onyxdotapp/onyx-devcontainer@sha256:4986c9252289b660ce772b45f0488b938fe425d8114245e96ef64b273b3fcee4",
|
||||
"runArgs": [
|
||||
"--cap-add=NET_ADMIN",
|
||||
"--cap-add=NET_RAW",
|
||||
"--network=onyx_default"
|
||||
],
|
||||
"image": "onyxdotapp/onyx-devcontainer@sha256:12184169c5bcc9cca0388286d5ffe504b569bc9c37bfa631b76ee8eee2064055",
|
||||
"runArgs": ["--cap-add=NET_ADMIN", "--cap-add=NET_RAW"],
|
||||
"mounts": [
|
||||
"source=${localEnv:HOME}/.claude,target=/home/dev/.claude,type=bind",
|
||||
"source=${localEnv:HOME}/.claude.json,target=/home/dev/.claude.json,type=bind",
|
||||
@@ -16,13 +12,10 @@
|
||||
"source=onyx-devcontainer-local,target=/home/dev/.local,type=volume"
|
||||
],
|
||||
"containerEnv": {
|
||||
"SSH_AUTH_SOCK": "/tmp/ssh-agent.sock",
|
||||
"POSTGRES_HOST": "relational_db",
|
||||
"REDIS_HOST": "cache"
|
||||
"SSH_AUTH_SOCK": "/tmp/ssh-agent.sock"
|
||||
},
|
||||
"remoteUser": "${localEnv:DEVCONTAINER_REMOTE_USER:dev}",
|
||||
"updateRemoteUserUID": false,
|
||||
"initializeCommand": "docker network create onyx_default 2>/dev/null || true",
|
||||
"workspaceMount": "source=${localWorkspaceFolder},target=/workspace,type=bind,consistency=delegated",
|
||||
"workspaceFolder": "/workspace",
|
||||
"postStartCommand": "sudo bash /workspace/.devcontainer/init-dev-user.sh && sudo bash /workspace/.devcontainer/init-firewall.sh",
|
||||
|
||||
@@ -1,3 +0,0 @@
|
||||
github.com ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABgQCj7ndNxQowgcQnjshcLrqPEiiphnt+VTTvDP6mHBL9j1aNUkY4Ue1gvwnGLVlOhGeYrnZaMgRK6+PKCUXaDbC7qtbW8gIkhL7aGCsOr/C56SJMy/BCZfxd1nWzAOxSDPgVsmerOBYfNqltV9/hWCqBywINIR+5dIg6JTJ72pcEpEjcYgXkE2YEFXV1JHnsKgbLWNlhScqb2UmyRkQyytRLtL+38TGxkxCflmO+5Z8CSSNY7GidjMIZ7Q4zMjA2n1nGrlTDkzwDCsw+wqFPGQA179cnfGWOWRVruj16z6XyvxvjJwbz0wQZ75XK5tKSb7FNyeIEs4TT4jk+S4dhPeAUC5y+bDYirYgM4GC7uEnztnZyaVWQ7B381AK4Qdrwt51ZqExKbQpTUNn+EjqoTwvqNj4kqx5QUCI0ThS/YkOxJCXmPUWZbhjpCg56i+2aB6CmK2JGhn57K5mj0MNdBXA4/WnwH6XoPWJzK5Nyu2zB3nAZp+S5hpQs+p1vN1/wsjk=
|
||||
github.com ecdsa-sha2-nistp256 AAAAE2VjZHNhLXNoYTItbmlzdHAyNTYAAAAIbmlzdHAyNTYAAABBBEmKSENjQEezOmxkZMy7opKgwFB9nkt5YRrYMjNuG5N87uRgg6CLrbo5wAdT/y6v0mKV0U2w0WZ2YB/++Tpockg=
|
||||
github.com ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIOMqqnkVzrm0SdG6UOoqKLsabgH5C9okWi0dh2l9GKJl
|
||||
@@ -45,7 +45,7 @@ if [ "$ACTIVE_HOME" != "$MOUNT_HOME" ]; then
|
||||
[ -d "$MOUNT_HOME/$item" ] || continue
|
||||
if [ -e "$ACTIVE_HOME/$item" ] && [ ! -L "$ACTIVE_HOME/$item" ]; then
|
||||
echo "warning: replacing $ACTIVE_HOME/$item with symlink to $MOUNT_HOME/$item" >&2
|
||||
rm -rf "${ACTIVE_HOME:?}/$item"
|
||||
rm -rf "$ACTIVE_HOME/$item"
|
||||
fi
|
||||
ln -sfn "$MOUNT_HOME/$item" "$ACTIVE_HOME/$item"
|
||||
done
|
||||
|
||||
@@ -4,23 +4,22 @@ set -euo pipefail
|
||||
|
||||
echo "Setting up firewall..."
|
||||
|
||||
# Reset default policies to ACCEPT before flushing rules. On re-runs the
|
||||
# previous invocation's DROP policies are still in effect; flushing rules while
|
||||
# the default is DROP would block the DNS lookups below. Register a trap so
|
||||
# that if the script exits before the DROP policies are re-applied at the end,
|
||||
# we fail closed instead of leaving the container with an unrestricted
|
||||
# firewall.
|
||||
trap 'iptables -P INPUT DROP; iptables -P OUTPUT DROP; iptables -P FORWARD DROP' EXIT
|
||||
iptables -P INPUT ACCEPT
|
||||
iptables -P OUTPUT ACCEPT
|
||||
iptables -P FORWARD ACCEPT
|
||||
# Preserve docker dns resolution
|
||||
DOCKER_DNS_RULES=$(iptables-save | grep -E "^-A.*-d 127.0.0.11/32" || true)
|
||||
|
||||
# Only flush the filter table. The nat and mangle tables are managed by Docker
|
||||
# (DNS DNAT to 127.0.0.11, container networking, etc.) and must not be touched —
|
||||
# flushing them breaks Docker's embedded DNS resolver.
|
||||
# Flush all rules
|
||||
iptables -t nat -F
|
||||
iptables -t nat -X
|
||||
iptables -t mangle -F
|
||||
iptables -t mangle -X
|
||||
iptables -F
|
||||
iptables -X
|
||||
|
||||
# Restore docker dns rules
|
||||
if [ -n "$DOCKER_DNS_RULES" ]; then
|
||||
echo "$DOCKER_DNS_RULES" | iptables-restore -n
|
||||
fi
|
||||
|
||||
# Create ipset for allowed destinations
|
||||
ipset create allowed-domains hash:net || true
|
||||
ipset flush allowed-domains
|
||||
@@ -35,7 +34,6 @@ done
|
||||
|
||||
# Resolve allowed domains
|
||||
ALLOWED_DOMAINS=(
|
||||
"github.com"
|
||||
"registry.npmjs.org"
|
||||
"api.anthropic.com"
|
||||
"api-staging.anthropic.com"
|
||||
@@ -45,16 +43,8 @@ ALLOWED_DOMAINS=(
|
||||
"pypi.org"
|
||||
"files.pythonhosted.org"
|
||||
"go.dev"
|
||||
"proxy.golang.org"
|
||||
"sum.golang.org"
|
||||
"storage.googleapis.com"
|
||||
"dl.google.com"
|
||||
"static.rust-lang.org"
|
||||
"index.crates.io"
|
||||
"static.crates.io"
|
||||
"archive.ubuntu.com"
|
||||
"security.ubuntu.com"
|
||||
"deb.nodesource.com"
|
||||
)
|
||||
|
||||
for domain in "${ALLOWED_DOMAINS[@]}"; do
|
||||
@@ -75,14 +65,6 @@ if [ -n "$DOCKER_GATEWAY" ]; then
|
||||
fi
|
||||
fi
|
||||
|
||||
# Allow traffic to all attached Docker network subnets so the container can
|
||||
# reach sibling services (e.g. relational_db, cache) on shared compose networks.
|
||||
for subnet in $(ip -4 -o addr show scope global | awk '{print $4}'); do
|
||||
if ! ipset add allowed-domains "$subnet" -exist 2>&1; then
|
||||
echo "warning: failed to add Docker subnet $subnet to allowlist" >&2
|
||||
fi
|
||||
done
|
||||
|
||||
# Set default policies to DROP
|
||||
iptables -P FORWARD DROP
|
||||
iptables -P INPUT DROP
|
||||
|
||||
50
.github/workflows/deployment.yml
vendored
50
.github/workflows/deployment.yml
vendored
@@ -462,7 +462,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -472,7 +472,7 @@ jobs:
|
||||
|
||||
- name: Build and push AMD64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -536,7 +536,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -546,7 +546,7 @@ jobs:
|
||||
|
||||
- name: Build and push ARM64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -597,7 +597,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -676,7 +676,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -686,7 +686,7 @@ jobs:
|
||||
|
||||
- name: Build and push AMD64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -761,7 +761,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -771,7 +771,7 @@ jobs:
|
||||
|
||||
- name: Build and push ARM64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -833,7 +833,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -908,7 +908,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -918,7 +918,7 @@ jobs:
|
||||
|
||||
- name: Build and push AMD64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
@@ -981,7 +981,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -991,7 +991,7 @@ jobs:
|
||||
|
||||
- name: Build and push ARM64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
@@ -1041,7 +1041,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -1119,7 +1119,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -1129,7 +1129,7 @@ jobs:
|
||||
|
||||
- name: Build and push AMD64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
@@ -1192,7 +1192,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -1202,7 +1202,7 @@ jobs:
|
||||
|
||||
- name: Build and push ARM64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
@@ -1253,7 +1253,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -1329,7 +1329,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
with:
|
||||
buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}
|
||||
|
||||
@@ -1341,7 +1341,7 @@ jobs:
|
||||
|
||||
- name: Build and push AMD64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
env:
|
||||
DEBUG: ${{ vars.DOCKER_DEBUG == 'true' && 1 || 0 }}
|
||||
with:
|
||||
@@ -1409,7 +1409,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
with:
|
||||
buildkitd-flags: ${{ vars.DOCKER_DEBUG == 'true' && '--debug' || '' }}
|
||||
|
||||
@@ -1421,7 +1421,7 @@ jobs:
|
||||
|
||||
- name: Build and push ARM64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
env:
|
||||
DEBUG: ${{ vars.DOCKER_DEBUG == 'true' && 1 || 0 }}
|
||||
with:
|
||||
@@ -1475,7 +1475,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
|
||||
2
.github/workflows/docker-tag-beta.yml
vendored
2
.github/workflows/docker-tag-beta.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
|
||||
2
.github/workflows/docker-tag-latest.yml
vendored
2
.github/workflows/docker-tag-latest.yml
vendored
@@ -21,7 +21,7 @@ jobs:
|
||||
timeout-minutes: 45
|
||||
steps:
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
|
||||
10
.github/workflows/pr-integration-tests.yml
vendored
10
.github/workflows/pr-integration-tests.yml
vendored
@@ -115,7 +115,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
@@ -127,7 +127,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Backend Docker image
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
@@ -175,7 +175,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling Vespa, Redis, Postgres, and Minio images
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
@@ -187,7 +187,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Model Server Docker image
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
@@ -220,7 +220,7 @@ jobs:
|
||||
persist-credentials: false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling openapitools/openapi-generator-cli
|
||||
# otherwise, we hit the "Unauthenticated users" limit
|
||||
|
||||
12
.github/workflows/pr-playwright-tests.yml
vendored
12
.github/workflows/pr-playwright-tests.yml
vendored
@@ -94,7 +94,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
@@ -105,7 +105,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Web Docker image
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./web
|
||||
file: ./web/Dockerfile
|
||||
@@ -155,7 +155,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
@@ -166,7 +166,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Backend Docker image
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile
|
||||
@@ -216,7 +216,7 @@ jobs:
|
||||
echo "cache-suffix=${CACHE_SUFFIX}" >> $GITHUB_OUTPUT
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
# needed for pulling external images otherwise, we hit the "Unauthenticated users" limit
|
||||
# https://docs.docker.com/docker-hub/usage/
|
||||
@@ -227,7 +227,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Build and push Model Server Docker image
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend
|
||||
file: ./backend/Dockerfile.model_server
|
||||
|
||||
31
.github/workflows/pr-python-checks.yml
vendored
31
.github/workflows/pr-python-checks.yml
vendored
@@ -19,16 +19,16 @@ permissions:
|
||||
jobs:
|
||||
mypy-check:
|
||||
# See https://runs-on.com/runners/linux/
|
||||
# NOTE: This job is named mypy-check for branch protection compatibility,
|
||||
# but it actually runs ty (astral-sh's Rust type checker).
|
||||
# Note: Mypy seems quite optimized for x64 compared to arm64.
|
||||
# Similarly, mypy is single-threaded and incremental, so 2cpu is sufficient.
|
||||
runs-on:
|
||||
[
|
||||
runs-on,
|
||||
runner=2cpu-linux-arm64,
|
||||
runner=2cpu-linux-x64,
|
||||
"run-id=${{ github.run_id }}-mypy-check",
|
||||
"extras=s3-cache",
|
||||
]
|
||||
timeout-minutes: 15
|
||||
timeout-minutes: 45
|
||||
|
||||
steps:
|
||||
- uses: runs-on/action@cd2b598b0515d39d78c38a02d529db87d2196d1e # ratchet:runs-on/action@v2
|
||||
@@ -46,7 +46,26 @@ jobs:
|
||||
backend/requirements/model_server.txt
|
||||
backend/requirements/ee.txt
|
||||
|
||||
- name: Run ty
|
||||
- name: Generate OpenAPI schema and Python client
|
||||
shell: bash
|
||||
# TODO(Nik): https://linear.app/onyx-app/issue/ENG-1/update-test-infra-to-use-test-license
|
||||
env:
|
||||
LICENSE_ENFORCEMENT_ENABLED: "false"
|
||||
run: |
|
||||
ods openapi all
|
||||
|
||||
- name: Cache mypy cache
|
||||
if: ${{ vars.DISABLE_MYPY_CACHE != 'true' }}
|
||||
uses: runs-on/cache@a5f51d6f3fece787d03b7b4e981c82538a0654ed # ratchet:runs-on/cache@v4
|
||||
with:
|
||||
path: .mypy_cache
|
||||
key: mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-${{ hashFiles('**/*.py', '**/*.pyi', 'pyproject.toml') }}
|
||||
restore-keys: |
|
||||
mypy-${{ runner.os }}-${{ github.base_ref || github.event.merge_group.base_ref || 'main' }}-
|
||||
mypy-${{ runner.os }}-
|
||||
|
||||
- name: Run MyPy
|
||||
env:
|
||||
MYPY_FORCE_COLOR: 1
|
||||
TERM: xterm-256color
|
||||
run: ty check --output-format github
|
||||
run: mypy .
|
||||
|
||||
4
.github/workflows/pr-python-model-tests.yml
vendored
4
.github/workflows/pr-python-model-tests.yml
vendored
@@ -17,6 +17,8 @@ env:
|
||||
|
||||
# API keys for testing
|
||||
COHERE_API_KEY: ${{ secrets.COHERE_API_KEY }}
|
||||
LITELLM_API_KEY: ${{ secrets.LITELLM_API_KEY }}
|
||||
LITELLM_API_URL: ${{ secrets.LITELLM_API_URL }}
|
||||
OPENAI_API_KEY: ${{ secrets.OPENAI_API_KEY }}
|
||||
AZURE_API_KEY: ${{ secrets.AZURE_API_KEY }}
|
||||
AZURE_API_URL: ${{ vars.AZURE_API_URL }}
|
||||
@@ -69,7 +71,7 @@ jobs:
|
||||
password: ${{ secrets.DOCKER_TOKEN }}
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f
|
||||
|
||||
- name: Build and load
|
||||
uses: docker/bake-action@82490499d2e5613fcead7e128237ef0b0ea210f7 # ratchet:docker/bake-action@v7.0.0
|
||||
|
||||
2
.github/workflows/pr-quality-checks.yml
vendored
2
.github/workflows/pr-quality-checks.yml
vendored
@@ -39,8 +39,6 @@ jobs:
|
||||
working-directory: ./web
|
||||
run: npm ci
|
||||
- uses: j178/prek-action@cbc2f23eb5539cf20d82d1aabd0d0ecbcc56f4e3
|
||||
env:
|
||||
SKIP: ty
|
||||
with:
|
||||
prek-version: '0.3.4'
|
||||
extra-args: ${{ github.event_name == 'pull_request' && format('--from-ref {0} --to-ref {1}', github.event.pull_request.base.sha, github.event.pull_request.head.sha) || github.event_name == 'merge_group' && format('--from-ref {0} --to-ref {1}', github.event.merge_group.base_sha, github.event.merge_group.head_sha) || github.ref_name == 'main' && '--all-files' || '' }}
|
||||
|
||||
10
.github/workflows/sandbox-deployment.yml
vendored
10
.github/workflows/sandbox-deployment.yml
vendored
@@ -132,7 +132,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -142,7 +142,7 @@ jobs:
|
||||
|
||||
- name: Build and push AMD64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend/onyx/server/features/build/sandbox/kubernetes/docker
|
||||
file: ./backend/onyx/server/features/build/sandbox/kubernetes/docker/Dockerfile
|
||||
@@ -202,7 +202,7 @@ jobs:
|
||||
latest=false
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
@@ -212,7 +212,7 @@ jobs:
|
||||
|
||||
- name: Build and push ARM64
|
||||
id: build
|
||||
uses: docker/build-push-action@bcafcacb16a39f128d818304e6c9c0c18556b85f # ratchet:docker/build-push-action@v7
|
||||
uses: docker/build-push-action@263435318d21b8e681c14492fe198d362a7d2c83 # ratchet:docker/build-push-action@v6
|
||||
with:
|
||||
context: ./backend/onyx/server/features/build/sandbox/kubernetes/docker
|
||||
file: ./backend/onyx/server/features/build/sandbox/kubernetes/docker/Dockerfile
|
||||
@@ -258,7 +258,7 @@ jobs:
|
||||
parse-json-secrets: true
|
||||
|
||||
- name: Set up Docker Buildx
|
||||
uses: docker/setup-buildx-action@4d04d5d9486b7bd6fa91e7baf45bbb4f8b9deedd # ratchet:docker/setup-buildx-action@v4
|
||||
uses: docker/setup-buildx-action@8d2750c68a42422c14e847fe6c8ac0403b4cbd6f # ratchet:docker/setup-buildx-action@v3
|
||||
|
||||
- name: Login to Docker Hub
|
||||
uses: docker/login-action@c94ce9fb468520275223c153574b00df6fe4bcc9 # ratchet:docker/login-action@v3
|
||||
|
||||
@@ -67,12 +67,12 @@ repos:
|
||||
args: ["--active", "--with=onyx-devtools", "ods", "check-lazy-imports"]
|
||||
pass_filenames: true
|
||||
files: ^backend/(?!\.venv/|scripts/).*\.py$
|
||||
- id: uv-run
|
||||
alias: ty
|
||||
name: ty
|
||||
args: ["ty", "check"]
|
||||
pass_filenames: true
|
||||
types_or: [python]
|
||||
# NOTE: This takes ~6s on a single, large module which is prohibitively slow.
|
||||
# - id: uv-run
|
||||
# name: mypy
|
||||
# args: ["--all-extras", "mypy"]
|
||||
# pass_filenames: true
|
||||
# files: ^backend/.*\.py$
|
||||
|
||||
- repo: https://github.com/pre-commit/pre-commit-hooks
|
||||
rev: 3e8a8703264a2f4a69428a0aa4dcb512790b2c8c # frozen: v6.0.0
|
||||
@@ -86,17 +86,6 @@ repos:
|
||||
hooks:
|
||||
- id: actionlint
|
||||
|
||||
- repo: https://github.com/shellcheck-py/shellcheck-py
|
||||
rev: 745eface02aef23e168a8afb6b5737818efbea95 # frozen: v0.11.0.1
|
||||
hooks:
|
||||
- id: shellcheck
|
||||
exclude: >-
|
||||
(?x)^(
|
||||
backend/scripts/setup_craft_templates\.sh|
|
||||
deployment/docker_compose/init-letsencrypt\.sh|
|
||||
deployment/docker_compose/install\.sh
|
||||
)$
|
||||
|
||||
- repo: https://github.com/psf/black
|
||||
rev: 8a737e727ac5ab2f1d4cf5876720ed276dc8dc4b # frozen: 25.1.0
|
||||
hooks:
|
||||
@@ -153,7 +142,6 @@ repos:
|
||||
hooks:
|
||||
- id: ripsecrets
|
||||
args:
|
||||
- --strict-ignore
|
||||
- --additional-pattern
|
||||
- ^sk-[A-Za-z0-9_\-]{20,}$
|
||||
|
||||
|
||||
@@ -1 +0,0 @@
|
||||
.devcontainer/github_known_hosts
|
||||
12
.vscode/launch.json
vendored
12
.vscode/launch.json
vendored
@@ -475,18 +475,6 @@
|
||||
"order": 0
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Start Monitoring Stack (Prometheus + Grafana)",
|
||||
"type": "node",
|
||||
"request": "launch",
|
||||
"runtimeExecutable": "docker",
|
||||
"runtimeArgs": ["compose", "up", "-d"],
|
||||
"cwd": "${workspaceFolder}/profiling",
|
||||
"console": "integratedTerminal",
|
||||
"presentation": {
|
||||
"group": "3"
|
||||
}
|
||||
},
|
||||
{
|
||||
"name": "Clear and Restart External Volumes and Containers",
|
||||
"type": "node",
|
||||
|
||||
@@ -63,13 +63,11 @@ Your features must pass all tests and all comments must be addressed prior to me
|
||||
### Implicit agreements
|
||||
|
||||
If we approve an issue, we are promising you the following:
|
||||
|
||||
- Your work will receive timely attention and we will put aside other important items to ensure you are not blocked.
|
||||
- You will receive necessary coaching on eng quality, system design, etc. to ensure the feature is completed well.
|
||||
- The Onyx team will pull resources and bandwidth from design, PM, and engineering to ensure that you have all the resources to build the feature to the quality required for merging.
|
||||
|
||||
Because this is a large investment from our team, we ask that you:
|
||||
|
||||
- Thoroughly read all the requirements of the design docs, engineering best practices, and try to minimize overhead for the Onyx team.
|
||||
- Complete the feature in a timely manner to reduce context switching and an ongoing resource pull from the Onyx team.
|
||||
|
||||
@@ -151,10 +149,10 @@ Set up pre-commit hooks (black / reorder-python-imports):
|
||||
uv run pre-commit install
|
||||
```
|
||||
|
||||
We also use `ty` for static type checking. Onyx is fully type-annotated, and we want to keep it that way! To run the ty checks manually:
|
||||
We also use `mypy` for static type checking. Onyx is fully type-annotated, and we want to keep it that way! To run the mypy checks manually:
|
||||
|
||||
```bash
|
||||
uv run ty check
|
||||
uv run mypy . # from onyx/backend
|
||||
```
|
||||
|
||||
#### Frontend
|
||||
@@ -194,7 +192,6 @@ Before starting, make sure the Docker Daemon is running.
|
||||
> **Note:** "Clear and Restart External Volumes and Containers" will reset your Postgres and OpenSearch (relational-db and index). Only run this if you are okay with wiping your data.
|
||||
|
||||
**Features:**
|
||||
|
||||
- Hot reload is enabled for the web server and API servers
|
||||
- Python debugging is configured with debugpy
|
||||
- Environment variables are loaded from `.vscode/.env`
|
||||
@@ -347,16 +344,13 @@ sudo xattr -r -d com.apple.quarantine ~/.cache/pre-commit
|
||||
### Style and Maintainability
|
||||
|
||||
#### Comments and readability
|
||||
|
||||
Add clear comments:
|
||||
|
||||
- At logical boundaries (e.g., interfaces) so the reader doesn't need to dig 10 layers deeper.
|
||||
- Wherever assumptions are made or something non-obvious/unexpected is done.
|
||||
- For complicated flows/functions.
|
||||
- Wherever it saves time (e.g., nontrivial regex patterns).
|
||||
|
||||
#### Errors and exceptions
|
||||
|
||||
- **Fail loudly** rather than silently skipping work.
|
||||
- Example: raise and let exceptions propagate instead of silently dropping a document.
|
||||
- **Don't overuse `try/except`.**
|
||||
@@ -364,7 +358,6 @@ Add clear comments:
|
||||
- Do not mask exceptions unless it is clearly appropriate.
|
||||
|
||||
#### Typing
|
||||
|
||||
- Everything should be **as strictly typed as possible**.
|
||||
- Use `cast` for annoying/loose-typed interfaces (e.g., results of `run_functions_tuples_in_parallel`).
|
||||
- Only `cast` when the type checker sees `Any` or types are too loose.
|
||||
@@ -375,7 +368,6 @@ Add clear comments:
|
||||
- `dict[EmbeddingModel, list[EmbeddingVector]]`
|
||||
|
||||
#### State, objects, and boundaries
|
||||
|
||||
- Keep **clear logical boundaries** for state containers and objects.
|
||||
- A **config** object should never contain things like a `db_session`.
|
||||
- Avoid state containers that are overly nested, or huge + flat (use judgment).
|
||||
@@ -388,7 +380,6 @@ Add clear comments:
|
||||
- Prefer **hash maps (dicts)** over tree structures unless there's a strong reason.
|
||||
|
||||
#### Naming
|
||||
|
||||
- Name variables carefully and intentionally.
|
||||
- Prefer long, explicit names when undecided.
|
||||
- Avoid single-character variables except for small, self-contained utilities (or not at all).
|
||||
@@ -399,7 +390,6 @@ Add clear comments:
|
||||
- IntelliSense can miss call sites; search works best with unique names.
|
||||
|
||||
#### Correctness by construction
|
||||
|
||||
- Prefer self-contained correctness — don't rely on callers to "use it right" if you can make misuse hard.
|
||||
- Avoid redundancies: if a function takes an arg, it shouldn't also take a state object that contains that same arg.
|
||||
- No dead code (unless there's a very good reason).
|
||||
@@ -427,35 +417,29 @@ Add clear comments:
|
||||
### Repository Conventions
|
||||
|
||||
#### Where code lives
|
||||
|
||||
- Pydantic + data models: `models.py` files.
|
||||
- DB interface functions (excluding lazy loading): `db/` directory.
|
||||
- LLM prompts: `prompts/` directory, roughly mirroring the code layout that uses them.
|
||||
- API routes: `server/` directory.
|
||||
|
||||
#### Pydantic and modeling
|
||||
|
||||
- Prefer **Pydantic** over dataclasses.
|
||||
- If absolutely required, use `allow_arbitrary_types`.
|
||||
|
||||
#### Data conventions
|
||||
|
||||
- Prefer explicit `None` over sentinel empty strings (usually; depends on intent).
|
||||
- Prefer explicit identifiers: use string enums instead of integer codes.
|
||||
- Avoid magic numbers (co-location is good when necessary). **Always avoid magic strings.**
|
||||
|
||||
#### Logging
|
||||
|
||||
- Log messages where they are created.
|
||||
- Don't propagate log messages around just to log them elsewhere.
|
||||
|
||||
#### Encapsulation
|
||||
|
||||
- Don't use private attributes/methods/properties from other classes/modules.
|
||||
- "Private" is private — respect that boundary.
|
||||
|
||||
#### SQLAlchemy guidance
|
||||
|
||||
- Lazy loading is often bad at scale, especially across multiple list relationships.
|
||||
- Be careful when accessing SQLAlchemy object attributes:
|
||||
- It can help avoid redundant DB queries,
|
||||
@@ -464,7 +448,6 @@ Add clear comments:
|
||||
- Reference: https://www.reddit.com/r/SQLAlchemy/comments/138f248/joinedload_vs_selectinload/
|
||||
|
||||
#### Trunk-based development and feature flags
|
||||
|
||||
- **PRs should contain no more than 500 lines of real change.**
|
||||
- **Merge to main frequently.** Avoid long-lived feature branches — they create merge conflicts and integration pain.
|
||||
- **Use feature flags for incremental rollout.**
|
||||
@@ -475,7 +458,6 @@ Add clear comments:
|
||||
- **Test both flag states.** Ensure the codebase works correctly with the flag on and off.
|
||||
|
||||
#### Miscellaneous
|
||||
|
||||
- Any TODOs you add in the code must be accompanied by either the name/username of the owner of that TODO, or an issue number for an issue referencing that piece of work.
|
||||
- Avoid module-level logic that runs on import, which leads to import-time side effects. Essentially every piece of meaningful logic should exist within some function that has to be explicitly invoked. Acceptable exceptions may include loading environment variables or setting up loggers.
|
||||
- If you find yourself needing something like this, you may want that logic to exist in a file dedicated for manual execution (contains `if __name__ == "__main__":`) which should not be imported by anything else.
|
||||
|
||||
@@ -1,4 +1,4 @@
|
||||
FROM python:3.11-slim-bookworm@sha256:9c6f90801e6b68e772b7c0ca74260cbf7af9f320acec894e26fccdaccfbe3b47
|
||||
FROM python:3.11.7-slim-bookworm
|
||||
|
||||
LABEL com.danswer.maintainer="founders@onyx.app"
|
||||
LABEL com.danswer.description="This image is the web/frontend container of Onyx which \
|
||||
|
||||
@@ -1,5 +1,5 @@
|
||||
# Base stage with dependencies
|
||||
FROM python:3.11-slim-bookworm@sha256:9c6f90801e6b68e772b7c0ca74260cbf7af9f320acec894e26fccdaccfbe3b47 AS base
|
||||
FROM python:3.11.7-slim-bookworm AS base
|
||||
|
||||
ENV DANSWER_RUNNING_IN_DOCKER="true" \
|
||||
HF_HOME=/app/.cache/huggingface
|
||||
@@ -50,10 +50,6 @@ COPY ./onyx/utils/logger.py /app/onyx/utils/logger.py
|
||||
COPY ./onyx/utils/middleware.py /app/onyx/utils/middleware.py
|
||||
COPY ./onyx/utils/tenant.py /app/onyx/utils/tenant.py
|
||||
|
||||
# Sentry configuration (used when SENTRY_DSN is set)
|
||||
COPY ./onyx/configs/__init__.py /app/onyx/configs/__init__.py
|
||||
COPY ./onyx/configs/sentry.py /app/onyx/configs/sentry.py
|
||||
|
||||
# Place to fetch version information
|
||||
COPY ./onyx/__init__.py /app/onyx/__init__.py
|
||||
|
||||
|
||||
@@ -26,9 +26,7 @@ from shared_configs.configs import (
|
||||
TENANT_ID_PREFIX,
|
||||
)
|
||||
from onyx.db.models import Base
|
||||
from celery.backends.database.session import ( # ty: ignore[unresolved-import]
|
||||
ResultModelBase,
|
||||
)
|
||||
from celery.backends.database.session import ResultModelBase # type: ignore
|
||||
from onyx.db.engine.sql_engine import SqlEngine
|
||||
|
||||
# Make sure in alembic.ini [logger_root] level=INFO is set or most logging will be
|
||||
|
||||
@@ -49,7 +49,7 @@ def upgrade() -> None:
|
||||
"time_updated",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
server_onupdate=sa.text("now()"), # ty: ignore[invalid-argument-type]
|
||||
server_onupdate=sa.text("now()"), # type: ignore
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
|
||||
@@ -68,7 +68,7 @@ def upgrade() -> None:
|
||||
sa.text("SELECT id FROM tool WHERE in_code_tool_id = :in_code_tool_id"),
|
||||
{"in_code_tool_id": OPEN_URL_TOOL["in_code_tool_id"]},
|
||||
).fetchone()
|
||||
tool_id = result[0] # ty: ignore[not-subscriptable]
|
||||
tool_id = result[0] # type: ignore
|
||||
|
||||
# Associate the tool with all existing personas
|
||||
# Get all persona IDs
|
||||
|
||||
@@ -0,0 +1,499 @@
|
||||
"""add proposal review tables
|
||||
|
||||
Revision ID: 61ea78857c97
|
||||
Revises: d129f37b3d87
|
||||
Create Date: 2026-04-09 10:00:00.000000
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.dialects import postgresql
|
||||
|
||||
import fastapi_users_db_sqlalchemy
|
||||
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "61ea78857c97"
|
||||
down_revision = "d129f37b3d87"
|
||||
branch_labels: str | None = None
|
||||
depends_on: str | None = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
# -- proposal_review_ruleset --
|
||||
op.create_table(
|
||||
"proposal_review_ruleset",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("tenant_id", sa.Text(), nullable=False),
|
||||
sa.Column("name", sa.Text(), nullable=False),
|
||||
sa.Column("description", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"is_default",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("false"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"is_active",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("true"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"created_by",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(["created_by"], ["user.id"]),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_ruleset_tenant_id",
|
||||
"proposal_review_ruleset",
|
||||
["tenant_id"],
|
||||
)
|
||||
|
||||
# -- proposal_review_rule --
|
||||
op.create_table(
|
||||
"proposal_review_rule",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"ruleset_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("name", sa.Text(), nullable=False),
|
||||
sa.Column("description", sa.Text(), nullable=True),
|
||||
sa.Column("category", sa.Text(), nullable=True),
|
||||
sa.Column("rule_type", sa.Text(), nullable=False),
|
||||
sa.Column(
|
||||
"rule_intent",
|
||||
sa.Text(),
|
||||
server_default=sa.text("'CHECK'"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("prompt_template", sa.Text(), nullable=False),
|
||||
sa.Column(
|
||||
"source",
|
||||
sa.Text(),
|
||||
server_default=sa.text("'MANUAL'"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("authority", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"is_hard_stop",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("false"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"priority",
|
||||
sa.Integer(),
|
||||
server_default=sa.text("0"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"is_active",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("true"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"refinement_needed",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("false"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("refinement_question", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["ruleset_id"],
|
||||
["proposal_review_ruleset.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_rule_ruleset_id",
|
||||
"proposal_review_rule",
|
||||
["ruleset_id"],
|
||||
)
|
||||
|
||||
# -- proposal_review_proposal --
|
||||
# Includes inline proposal-level decision fields (no separate decision table).
|
||||
op.create_table(
|
||||
"proposal_review_proposal",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("document_id", sa.Text(), nullable=False),
|
||||
sa.Column("tenant_id", sa.Text(), nullable=False),
|
||||
sa.Column(
|
||||
"status",
|
||||
sa.Text(),
|
||||
server_default=sa.text("'PENDING'"),
|
||||
nullable=False,
|
||||
),
|
||||
# Inline proposal-level decision fields
|
||||
sa.Column("decision_notes", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"decision_officer_id",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("decision_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"jira_synced",
|
||||
sa.Boolean(),
|
||||
server_default=sa.text("false"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("jira_synced_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(["decision_officer_id"], ["user.id"]),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
sa.UniqueConstraint("document_id", "tenant_id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_proposal_tenant_id",
|
||||
"proposal_review_proposal",
|
||||
["tenant_id"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_proposal_document_id",
|
||||
"proposal_review_proposal",
|
||||
["document_id"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_proposal_status",
|
||||
"proposal_review_proposal",
|
||||
["status"],
|
||||
)
|
||||
|
||||
# -- proposal_review_run --
|
||||
op.create_table(
|
||||
"proposal_review_run",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"proposal_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"ruleset_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"triggered_by",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"status",
|
||||
sa.Text(),
|
||||
server_default=sa.text("'PENDING'"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("total_rules", sa.Integer(), nullable=False),
|
||||
sa.Column(
|
||||
"completed_rules",
|
||||
sa.Integer(),
|
||||
server_default=sa.text("0"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("started_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["proposal_id"],
|
||||
["proposal_review_proposal.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["ruleset_id"],
|
||||
["proposal_review_ruleset.id"],
|
||||
),
|
||||
sa.ForeignKeyConstraint(["triggered_by"], ["user.id"]),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_run_proposal_id",
|
||||
"proposal_review_run",
|
||||
["proposal_id"],
|
||||
)
|
||||
|
||||
# -- proposal_review_finding --
|
||||
# Includes inline per-finding decision fields (no separate decision table).
|
||||
op.create_table(
|
||||
"proposal_review_finding",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"proposal_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"rule_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"review_run_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("verdict", sa.Text(), nullable=False),
|
||||
sa.Column("confidence", sa.Text(), nullable=True),
|
||||
sa.Column("evidence", sa.Text(), nullable=True),
|
||||
sa.Column("explanation", sa.Text(), nullable=True),
|
||||
sa.Column("suggested_action", sa.Text(), nullable=True),
|
||||
sa.Column("llm_model", sa.Text(), nullable=True),
|
||||
sa.Column("llm_tokens_used", sa.Integer(), nullable=True),
|
||||
# Inline per-finding decision fields
|
||||
sa.Column("decision_action", sa.Text(), nullable=True),
|
||||
sa.Column("decision_notes", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"decision_officer_id",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column("decided_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["proposal_id"],
|
||||
["proposal_review_proposal.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["rule_id"],
|
||||
["proposal_review_rule.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["review_run_id"],
|
||||
["proposal_review_run.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.ForeignKeyConstraint(["decision_officer_id"], ["user.id"]),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_finding_proposal_id",
|
||||
"proposal_review_finding",
|
||||
["proposal_id"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_finding_review_run_id",
|
||||
"proposal_review_finding",
|
||||
["review_run_id"],
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_finding_rule_id",
|
||||
"proposal_review_finding",
|
||||
["rule_id"],
|
||||
)
|
||||
|
||||
# -- proposal_review_document --
|
||||
op.create_table(
|
||||
"proposal_review_document",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"proposal_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("file_name", sa.Text(), nullable=False),
|
||||
sa.Column("file_type", sa.Text(), nullable=True),
|
||||
sa.Column("file_store_id", sa.Text(), nullable=True),
|
||||
sa.Column("extracted_text", sa.Text(), nullable=True),
|
||||
sa.Column("document_role", sa.Text(), nullable=False),
|
||||
sa.Column(
|
||||
"uploaded_by",
|
||||
fastapi_users_db_sqlalchemy.generics.GUID(),
|
||||
nullable=True,
|
||||
),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.ForeignKeyConstraint(
|
||||
["proposal_id"],
|
||||
["proposal_review_proposal.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.ForeignKeyConstraint(["uploaded_by"], ["user.id"]),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_document_proposal_id",
|
||||
"proposal_review_document",
|
||||
["proposal_id"],
|
||||
)
|
||||
|
||||
# -- proposal_review_import_job --
|
||||
op.create_table(
|
||||
"proposal_review_import_job",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"ruleset_id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("tenant_id", sa.Text(), nullable=False),
|
||||
sa.Column(
|
||||
"status",
|
||||
sa.Text(),
|
||||
server_default=sa.text("'PENDING'"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("source_filename", sa.Text(), nullable=False),
|
||||
sa.Column("extracted_text", sa.Text(), nullable=False),
|
||||
sa.Column(
|
||||
"rules_created",
|
||||
sa.Integer(),
|
||||
server_default=sa.text("0"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("error_message", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("completed_at", sa.DateTime(timezone=True), nullable=True),
|
||||
sa.ForeignKeyConstraint(
|
||||
["ruleset_id"],
|
||||
["proposal_review_ruleset.id"],
|
||||
ondelete="CASCADE",
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
op.create_index(
|
||||
"ix_proposal_review_import_job_ruleset_id",
|
||||
"proposal_review_import_job",
|
||||
["ruleset_id"],
|
||||
)
|
||||
|
||||
# -- proposal_review_config --
|
||||
op.create_table(
|
||||
"proposal_review_config",
|
||||
sa.Column(
|
||||
"id",
|
||||
postgresql.UUID(as_uuid=True),
|
||||
server_default=sa.text("gen_random_uuid()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column("tenant_id", sa.Text(), nullable=False, unique=True),
|
||||
sa.Column("jira_connector_id", sa.Integer(), nullable=True),
|
||||
sa.Column("jira_project_key", sa.Text(), nullable=True),
|
||||
sa.Column("field_mapping", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("jira_writeback", postgresql.JSONB(), nullable=True),
|
||||
sa.Column("review_model", sa.Text(), nullable=True),
|
||||
sa.Column("import_model", sa.Text(), nullable=True),
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.Column(
|
||||
"updated_at",
|
||||
sa.DateTime(timezone=True),
|
||||
server_default=sa.text("now()"),
|
||||
nullable=False,
|
||||
),
|
||||
sa.PrimaryKeyConstraint("id"),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_table("proposal_review_import_job")
|
||||
op.drop_table("proposal_review_config")
|
||||
op.drop_table("proposal_review_document")
|
||||
op.drop_table("proposal_review_finding")
|
||||
op.drop_table("proposal_review_run")
|
||||
op.drop_table("proposal_review_proposal")
|
||||
op.drop_table("proposal_review_rule")
|
||||
op.drop_table("proposal_review_ruleset")
|
||||
@@ -52,7 +52,7 @@ def upgrade() -> None:
|
||||
sa.Column(
|
||||
"created_at",
|
||||
sa.DateTime(),
|
||||
default=lambda: datetime.datetime.now(datetime.timezone.utc),
|
||||
default=datetime.datetime.utcnow,
|
||||
),
|
||||
sa.Column(
|
||||
"cc_pair_id",
|
||||
|
||||
@@ -10,7 +10,7 @@ from alembic import op
|
||||
import sqlalchemy as sa
|
||||
from sqlalchemy.orm import Session
|
||||
from sqlalchemy import text
|
||||
from typing import cast
|
||||
from typing import cast, Any
|
||||
|
||||
from botocore.exceptions import ClientError
|
||||
|
||||
@@ -255,7 +255,7 @@ def _migrate_files_to_external_storage() -> None:
|
||||
continue
|
||||
|
||||
lobj_id = cast(int, file_record.lobj_oid)
|
||||
file_metadata = file_record.file_metadata
|
||||
file_metadata = cast(Any, file_record.file_metadata)
|
||||
|
||||
# Read file content from PostgreSQL
|
||||
try:
|
||||
|
||||
@@ -0,0 +1,32 @@
|
||||
"""add failed_rules to proposal_review_run
|
||||
|
||||
Revision ID: ce2aa573d445
|
||||
Revises: 61ea78857c97
|
||||
Create Date: 2026-04-14 16:34:57.276707
|
||||
|
||||
"""
|
||||
|
||||
from alembic import op
|
||||
import sqlalchemy as sa
|
||||
|
||||
# revision identifiers, used by Alembic.
|
||||
revision = "ce2aa573d445"
|
||||
down_revision = "61ea78857c97"
|
||||
branch_labels = None
|
||||
depends_on = None
|
||||
|
||||
|
||||
def upgrade() -> None:
|
||||
op.add_column(
|
||||
"proposal_review_run",
|
||||
sa.Column(
|
||||
"failed_rules",
|
||||
sa.Integer(),
|
||||
nullable=False,
|
||||
server_default=sa.text("0"),
|
||||
),
|
||||
)
|
||||
|
||||
|
||||
def downgrade() -> None:
|
||||
op.drop_column("proposal_review_run", "failed_rules")
|
||||
@@ -112,7 +112,7 @@ def _get_access_for_documents(
|
||||
access_map[document_id] = DocumentAccess.build(
|
||||
user_emails=list(non_ee_access.user_emails),
|
||||
user_groups=user_group_info.get(document_id, []),
|
||||
is_public=is_public_anywhere, # ty: ignore[invalid-argument-type]
|
||||
is_public=is_public_anywhere,
|
||||
external_user_emails=list(ext_u_emails),
|
||||
external_user_group_ids=list(ext_u_groups),
|
||||
)
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
import os
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
|
||||
import jwt
|
||||
from fastapi import Depends
|
||||
@@ -59,7 +58,7 @@ def generate_anonymous_user_jwt_token(tenant_id: str) -> str:
|
||||
payload = {
|
||||
"tenant_id": tenant_id,
|
||||
# Token does not expire
|
||||
"iat": datetime.now(timezone.utc), # Issued at time
|
||||
"iat": datetime.utcnow(), # Issued at time
|
||||
}
|
||||
|
||||
return jwt.encode(payload, USER_AUTH_SECRET, algorithm="HS256")
|
||||
|
||||
@@ -1,10 +1,8 @@
|
||||
import time
|
||||
from typing import cast
|
||||
|
||||
from celery import shared_task
|
||||
from celery import Task
|
||||
from celery.exceptions import SoftTimeLimitExceeded
|
||||
from redis.client import Redis
|
||||
from redis.lock import Lock as RedisLock
|
||||
|
||||
from ee.onyx.server.tenants.product_gating import get_gated_tenants
|
||||
@@ -18,56 +16,9 @@ from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine.tenant_utils import get_all_tenant_ids
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from onyx.redis.redis_tenant_work_gating import cleanup_expired
|
||||
from onyx.redis.redis_tenant_work_gating import get_active_tenants
|
||||
from onyx.redis.redis_tenant_work_gating import observe_active_set_size
|
||||
from onyx.redis.redis_tenant_work_gating import record_full_fanout_cycle
|
||||
from onyx.redis.redis_tenant_work_gating import record_gate_decision
|
||||
from onyx.server.runtime.onyx_runtime import OnyxRuntime
|
||||
from shared_configs.configs import IGNORED_SYNCING_TENANT_LIST
|
||||
|
||||
|
||||
_FULL_FANOUT_TIMESTAMP_KEY_PREFIX = "tenant_work_gating_last_full_fanout_ms"
|
||||
|
||||
|
||||
def _should_bypass_gate_for_full_fanout(
|
||||
redis_client: Redis, task_name: str, interval_seconds: int
|
||||
) -> bool:
|
||||
"""True if at least `interval_seconds` have elapsed since the last
|
||||
full-fanout bypass for this task. On True, updates the stored timestamp
|
||||
atomically-enough (it's a best-effort counter, not a lock)."""
|
||||
key = f"{_FULL_FANOUT_TIMESTAMP_KEY_PREFIX}:{task_name}"
|
||||
now_ms = int(time.time() * 1000)
|
||||
threshold_ms = now_ms - (interval_seconds * 1000)
|
||||
|
||||
try:
|
||||
raw = cast(bytes | None, redis_client.get(key))
|
||||
except Exception:
|
||||
task_logger.exception(f"full-fanout timestamp read failed: task={task_name}")
|
||||
# Fail open: treat as "interval elapsed" so we don't skip every
|
||||
# tenant during a Redis hiccup.
|
||||
return True
|
||||
|
||||
if raw is None:
|
||||
# First invocation — bypass so the set seeds cleanly.
|
||||
elapsed = True
|
||||
else:
|
||||
try:
|
||||
last_ms = int(raw.decode())
|
||||
elapsed = last_ms <= threshold_ms
|
||||
except ValueError:
|
||||
elapsed = True
|
||||
|
||||
if elapsed:
|
||||
try:
|
||||
redis_client.set(key, str(now_ms))
|
||||
except Exception:
|
||||
task_logger.exception(
|
||||
f"full-fanout timestamp write failed: task={task_name}"
|
||||
)
|
||||
return elapsed
|
||||
|
||||
|
||||
@shared_task(
|
||||
name=OnyxCeleryTask.CLOUD_BEAT_TASK_GENERATOR,
|
||||
ignore_result=True,
|
||||
@@ -81,7 +32,6 @@ def cloud_beat_task_generator(
|
||||
priority: int = OnyxCeleryPriority.MEDIUM,
|
||||
expires: int = BEAT_EXPIRES_DEFAULT,
|
||||
skip_gated: bool = True,
|
||||
work_gated: bool = False,
|
||||
) -> bool | None:
|
||||
"""a lightweight task used to kick off individual beat tasks per tenant."""
|
||||
time_start = time.monotonic()
|
||||
@@ -101,56 +51,8 @@ def cloud_beat_task_generator(
|
||||
tenant_ids: list[str] = []
|
||||
num_processed_tenants = 0
|
||||
num_skipped_gated = 0
|
||||
num_would_skip_work_gate = 0
|
||||
num_skipped_work_gate = 0
|
||||
|
||||
# Tenant-work-gating read path. Resolve once per invocation.
|
||||
gate_enabled = False
|
||||
gate_enforce = False
|
||||
full_fanout_cycle = False
|
||||
active_tenants: set[str] | None = None
|
||||
|
||||
try:
|
||||
# Gating setup is inside the try block so any exception still
|
||||
# reaches the finally that releases the beat lock.
|
||||
if work_gated:
|
||||
try:
|
||||
gate_enabled = OnyxRuntime.get_tenant_work_gating_enabled()
|
||||
gate_enforce = OnyxRuntime.get_tenant_work_gating_enforce()
|
||||
except Exception:
|
||||
task_logger.exception("tenant work gating: runtime flag read failed")
|
||||
gate_enabled = False
|
||||
|
||||
if gate_enabled:
|
||||
redis_failed = False
|
||||
interval_s = (
|
||||
OnyxRuntime.get_tenant_work_gating_full_fanout_interval_seconds()
|
||||
)
|
||||
full_fanout_cycle = _should_bypass_gate_for_full_fanout(
|
||||
redis_client, task_name, interval_s
|
||||
)
|
||||
if full_fanout_cycle:
|
||||
record_full_fanout_cycle(task_name)
|
||||
try:
|
||||
ttl_s = OnyxRuntime.get_tenant_work_gating_ttl_seconds()
|
||||
cleanup_expired(ttl_s)
|
||||
except Exception:
|
||||
task_logger.exception(
|
||||
"tenant work gating: cleanup_expired failed"
|
||||
)
|
||||
else:
|
||||
ttl_s = OnyxRuntime.get_tenant_work_gating_ttl_seconds()
|
||||
active_tenants = get_active_tenants(ttl_s)
|
||||
if active_tenants is None:
|
||||
full_fanout_cycle = True
|
||||
record_full_fanout_cycle(task_name)
|
||||
redis_failed = True
|
||||
|
||||
# Only refresh the gauge when Redis is known-reachable —
|
||||
# skip the ZCARD if we just failed open due to a Redis error.
|
||||
if not redis_failed:
|
||||
observe_active_set_size()
|
||||
|
||||
tenant_ids = get_all_tenant_ids()
|
||||
|
||||
# Per-task control over whether gated tenants are included. Most periodic tasks
|
||||
@@ -174,21 +76,6 @@ def cloud_beat_task_generator(
|
||||
if IGNORED_SYNCING_TENANT_LIST and tenant_id in IGNORED_SYNCING_TENANT_LIST:
|
||||
continue
|
||||
|
||||
# Tenant work gate: if the feature is on, check membership. Skip
|
||||
# unmarked tenants when enforce=True AND we're not in a full-
|
||||
# fanout cycle. Always log/emit the shadow counter.
|
||||
if work_gated and gate_enabled and not full_fanout_cycle:
|
||||
would_skip = (
|
||||
active_tenants is not None and tenant_id not in active_tenants
|
||||
)
|
||||
if would_skip:
|
||||
num_would_skip_work_gate += 1
|
||||
if gate_enforce:
|
||||
num_skipped_work_gate += 1
|
||||
record_gate_decision(task_name, skipped=True)
|
||||
continue
|
||||
record_gate_decision(task_name, skipped=False)
|
||||
|
||||
self.app.send_task(
|
||||
task_name,
|
||||
kwargs=dict(
|
||||
@@ -222,12 +109,6 @@ def cloud_beat_task_generator(
|
||||
f"task={task_name} "
|
||||
f"num_processed_tenants={num_processed_tenants} "
|
||||
f"num_skipped_gated={num_skipped_gated} "
|
||||
f"num_would_skip_work_gate={num_would_skip_work_gate} "
|
||||
f"num_skipped_work_gate={num_skipped_work_gate} "
|
||||
f"full_fanout_cycle={full_fanout_cycle} "
|
||||
f"work_gated={work_gated} "
|
||||
f"gate_enabled={gate_enabled} "
|
||||
f"gate_enforce={gate_enforce} "
|
||||
f"num_tenants={len(tenant_ids)} "
|
||||
f"elapsed={time_elapsed:.2f}"
|
||||
)
|
||||
|
||||
@@ -80,7 +80,6 @@ from onyx.redis.redis_connector_doc_perm_sync import RedisConnectorPermissionSyn
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import get_redis_replica_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
|
||||
from onyx.server.runtime.onyx_runtime import OnyxRuntime
|
||||
from onyx.server.utils import make_short_id
|
||||
from onyx.utils.logger import doc_permission_sync_ctx
|
||||
@@ -209,11 +208,6 @@ def check_for_doc_permissions_sync(self: Task, *, tenant_id: str) -> bool | None
|
||||
if _is_external_doc_permissions_sync_due(cc_pair):
|
||||
cc_pair_ids_to_sync.append(cc_pair.id)
|
||||
|
||||
# Tenant-work-gating hook: refresh this tenant's active-set membership
|
||||
# whenever doc-permission sync has any due cc_pairs to dispatch.
|
||||
if cc_pair_ids_to_sync:
|
||||
maybe_mark_tenant_active(tenant_id)
|
||||
|
||||
lock_beat.reacquire()
|
||||
for cc_pair_id in cc_pair_ids_to_sync:
|
||||
payload_id = try_creating_permissions_sync_task(
|
||||
|
||||
@@ -69,7 +69,6 @@ from onyx.redis.redis_connector_ext_group_sync import (
|
||||
)
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import get_redis_replica_client
|
||||
from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
|
||||
from onyx.server.runtime.onyx_runtime import OnyxRuntime
|
||||
from onyx.server.utils import make_short_id
|
||||
from onyx.utils.logger import format_error_for_logging
|
||||
@@ -203,11 +202,6 @@ def check_for_external_group_sync(self: Task, *, tenant_id: str) -> bool | None:
|
||||
if _is_external_group_sync_due(cc_pair):
|
||||
cc_pair_ids_to_sync.append(cc_pair.id)
|
||||
|
||||
# Tenant-work-gating hook: refresh this tenant's active-set membership
|
||||
# whenever external-group sync has any due cc_pairs to dispatch.
|
||||
if cc_pair_ids_to_sync:
|
||||
maybe_mark_tenant_active(tenant_id)
|
||||
|
||||
lock_beat.reacquire()
|
||||
for cc_pair_id in cc_pair_ids_to_sync:
|
||||
payload_id = try_creating_external_group_sync_task(
|
||||
|
||||
@@ -53,7 +53,7 @@ def fetch_query_analytics(
|
||||
.order_by(cast(ChatMessage.time_sent, Date))
|
||||
)
|
||||
|
||||
return db_session.execute(stmt).all() # ty: ignore[invalid-return-type]
|
||||
return db_session.execute(stmt).all() # type: ignore
|
||||
|
||||
|
||||
def fetch_per_user_query_analytics(
|
||||
@@ -92,7 +92,7 @@ def fetch_per_user_query_analytics(
|
||||
.order_by(cast(ChatMessage.time_sent, Date), ChatSession.user_id)
|
||||
)
|
||||
|
||||
return db_session.execute(stmt).all() # ty: ignore[invalid-return-type]
|
||||
return db_session.execute(stmt).all() # type: ignore
|
||||
|
||||
|
||||
def fetch_onyxbot_analytics(
|
||||
|
||||
@@ -9,7 +9,7 @@ logger = setup_logger()
|
||||
|
||||
|
||||
def fetch_sources_with_connectors(db_session: Session) -> list[DocumentSource]:
|
||||
sources = db_session.query(distinct(Connector.source)).all()
|
||||
sources = db_session.query(distinct(Connector.source)).all() # type: ignore
|
||||
|
||||
document_sources = [source[0] for source in sources]
|
||||
|
||||
|
||||
@@ -128,9 +128,9 @@ def get_used_seats(tenant_id: str | None = None) -> int:
|
||||
select(func.count())
|
||||
.select_from(User)
|
||||
.where(
|
||||
User.is_active == True, # noqa: E712
|
||||
User.is_active == True, # type: ignore # noqa: E712
|
||||
User.role != UserRole.EXT_PERM_USER,
|
||||
User.email != ANONYMOUS_USER_EMAIL,
|
||||
User.email != ANONYMOUS_USER_EMAIL, # type: ignore
|
||||
User.account_type != AccountType.SERVICE_ACCOUNT,
|
||||
)
|
||||
)
|
||||
|
||||
@@ -121,7 +121,7 @@ class ScimDAL(DAL):
|
||||
"""Update the last_used_at timestamp for a token."""
|
||||
token = self._session.get(ScimToken, token_id)
|
||||
if token:
|
||||
token.last_used_at = func.now()
|
||||
token.last_used_at = func.now() # type: ignore[assignment]
|
||||
|
||||
# ------------------------------------------------------------------
|
||||
# User mapping operations
|
||||
@@ -229,7 +229,7 @@ class ScimDAL(DAL):
|
||||
def get_user(self, user_id: UUID) -> User | None:
|
||||
"""Fetch a user by ID."""
|
||||
return self._session.scalar(
|
||||
select(User).where(User.id == user_id) # ty: ignore[invalid-argument-type]
|
||||
select(User).where(User.id == user_id) # type: ignore[arg-type]
|
||||
)
|
||||
|
||||
def get_user_by_email(self, email: str) -> User | None:
|
||||
@@ -293,22 +293,16 @@ class ScimDAL(DAL):
|
||||
if attr == "username":
|
||||
# arg-type: fastapi-users types User.email as str, not a column expression
|
||||
# assignment: union return type widens but query is still Select[tuple[User]]
|
||||
query = _apply_scim_string_op(
|
||||
query, User.email, scim_filter # ty: ignore[invalid-argument-type]
|
||||
)
|
||||
query = _apply_scim_string_op(query, User.email, scim_filter) # type: ignore[arg-type, assignment]
|
||||
elif attr == "active":
|
||||
query = query.where(
|
||||
User.is_active.is_( # ty: ignore[unresolved-attribute]
|
||||
scim_filter.value.lower() == "true"
|
||||
)
|
||||
User.is_active.is_(scim_filter.value.lower() == "true") # type: ignore[attr-defined]
|
||||
)
|
||||
elif attr == "externalid":
|
||||
mapping = self.get_user_mapping_by_external_id(scim_filter.value)
|
||||
if not mapping:
|
||||
return [], 0
|
||||
query = query.where(
|
||||
User.id == mapping.user_id # ty: ignore[invalid-argument-type]
|
||||
)
|
||||
query = query.where(User.id == mapping.user_id) # type: ignore[arg-type]
|
||||
else:
|
||||
raise ValueError(
|
||||
f"Unsupported filter attribute: {scim_filter.attribute}"
|
||||
@@ -324,9 +318,7 @@ class ScimDAL(DAL):
|
||||
offset = max(start_index - 1, 0)
|
||||
users = list(
|
||||
self._session.scalars(
|
||||
query.order_by(User.id) # ty: ignore[invalid-argument-type]
|
||||
.offset(offset)
|
||||
.limit(count)
|
||||
query.order_by(User.id).offset(offset).limit(count) # type: ignore[arg-type]
|
||||
)
|
||||
.unique()
|
||||
.all()
|
||||
@@ -585,7 +577,7 @@ class ScimDAL(DAL):
|
||||
attr = scim_filter.attribute.lower()
|
||||
if attr == "displayname":
|
||||
# assignment: union return type widens but query is still Select[tuple[UserGroup]]
|
||||
query = _apply_scim_string_op(query, UserGroup.name, scim_filter)
|
||||
query = _apply_scim_string_op(query, UserGroup.name, scim_filter) # type: ignore[assignment]
|
||||
elif attr == "externalid":
|
||||
mapping = self.get_group_mapping_by_external_id(scim_filter.value)
|
||||
if not mapping:
|
||||
@@ -623,9 +615,7 @@ class ScimDAL(DAL):
|
||||
|
||||
users = (
|
||||
self._session.scalars(
|
||||
select(User).where(
|
||||
User.id.in_(user_ids) # ty: ignore[unresolved-attribute]
|
||||
)
|
||||
select(User).where(User.id.in_(user_ids)) # type: ignore[attr-defined]
|
||||
)
|
||||
.unique()
|
||||
.all()
|
||||
@@ -650,9 +640,7 @@ class ScimDAL(DAL):
|
||||
return []
|
||||
existing_users = (
|
||||
self._session.scalars(
|
||||
select(User).where(
|
||||
User.id.in_(uuids) # ty: ignore[unresolved-attribute]
|
||||
)
|
||||
select(User).where(User.id.in_(uuids)) # type: ignore[attr-defined]
|
||||
)
|
||||
.unique()
|
||||
.all()
|
||||
|
||||
@@ -300,11 +300,8 @@ def fetch_user_groups_for_user(
|
||||
stmt = (
|
||||
select(UserGroup)
|
||||
.join(User__UserGroup, User__UserGroup.user_group_id == UserGroup.id)
|
||||
.join(
|
||||
User,
|
||||
User.id == User__UserGroup.user_id, # ty: ignore[invalid-argument-type]
|
||||
)
|
||||
.where(User.id == user_id) # ty: ignore[invalid-argument-type]
|
||||
.join(User, User.id == User__UserGroup.user_id) # type: ignore
|
||||
.where(User.id == user_id) # type: ignore
|
||||
)
|
||||
if only_curator_groups:
|
||||
stmt = stmt.where(User__UserGroup.is_curator == True) # noqa: E712
|
||||
@@ -433,7 +430,7 @@ def fetch_user_groups_for_documents(
|
||||
.group_by(Document.id)
|
||||
)
|
||||
|
||||
return db_session.execute(stmt).all() # ty: ignore[invalid-return-type]
|
||||
return db_session.execute(stmt).all() # type: ignore
|
||||
|
||||
|
||||
def _check_user_group_is_modifiable(user_group: UserGroup) -> None:
|
||||
@@ -807,9 +804,7 @@ def update_user_group(
|
||||
db_user_group.is_up_to_date = False
|
||||
|
||||
removed_users = db_session.scalars(
|
||||
select(User).where(
|
||||
User.id.in_(removed_user_ids) # ty: ignore[unresolved-attribute]
|
||||
)
|
||||
select(User).where(User.id.in_(removed_user_ids)) # type: ignore
|
||||
).unique()
|
||||
|
||||
# Filter out admin and global curator users before validating curator status
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from collections.abc import Iterator
|
||||
|
||||
from googleapiclient.discovery import Resource
|
||||
from googleapiclient.discovery import Resource # type: ignore
|
||||
|
||||
from ee.onyx.external_permissions.google_drive.models import GoogleDrivePermission
|
||||
from ee.onyx.external_permissions.google_drive.permission_retrieval import (
|
||||
@@ -38,7 +38,7 @@ def get_folder_permissions_by_ids(
|
||||
A list of permissions matching the provided permission IDs
|
||||
"""
|
||||
return get_permissions_by_ids(
|
||||
drive_service=service, # ty: ignore[invalid-argument-type]
|
||||
drive_service=service,
|
||||
doc_id=folder_id,
|
||||
permission_ids=permission_ids,
|
||||
)
|
||||
@@ -68,7 +68,7 @@ def get_modified_folders(
|
||||
|
||||
# Retrieve and yield folders
|
||||
for folder in execute_paginated_retrieval(
|
||||
retrieval_function=service.files().list, # ty: ignore[unresolved-attribute]
|
||||
retrieval_function=service.files().list,
|
||||
list_key="files",
|
||||
continue_on_404_or_403=True,
|
||||
corpora="allDrives",
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from googleapiclient.errors import HttpError
|
||||
from googleapiclient.errors import HttpError # type: ignore
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
@@ -183,7 +183,7 @@ def _get_drive_members(
|
||||
)
|
||||
|
||||
admin_user_info = (
|
||||
admin_service.users() # ty: ignore[unresolved-attribute]
|
||||
admin_service.users()
|
||||
.get(userKey=google_drive_connector.primary_admin_email)
|
||||
.execute()
|
||||
)
|
||||
@@ -197,7 +197,7 @@ def _get_drive_members(
|
||||
|
||||
try:
|
||||
for permission in execute_paginated_retrieval(
|
||||
drive_service.permissions().list, # ty: ignore[unresolved-attribute]
|
||||
drive_service.permissions().list,
|
||||
list_key="permissions",
|
||||
fileId=drive_id,
|
||||
fields="permissions(emailAddress, type),nextPageToken",
|
||||
@@ -256,7 +256,7 @@ def _get_all_google_groups(
|
||||
"""
|
||||
group_emails: set[str] = set()
|
||||
for group in execute_paginated_retrieval(
|
||||
admin_service.groups().list, # ty: ignore[unresolved-attribute]
|
||||
admin_service.groups().list,
|
||||
list_key="groups",
|
||||
domain=google_domain,
|
||||
fields="groups(email),nextPageToken",
|
||||
@@ -274,7 +274,7 @@ def _google_group_to_onyx_group(
|
||||
"""
|
||||
group_member_emails: set[str] = set()
|
||||
for member in execute_paginated_retrieval(
|
||||
admin_service.members().list, # ty: ignore[unresolved-attribute]
|
||||
admin_service.members().list,
|
||||
list_key="members",
|
||||
groupKey=group_email,
|
||||
fields="members(email),nextPageToken",
|
||||
@@ -298,7 +298,7 @@ def _map_group_email_to_member_emails(
|
||||
for group_email in group_emails:
|
||||
group_member_emails: set[str] = set()
|
||||
for member in execute_paginated_retrieval(
|
||||
admin_service.members().list, # ty: ignore[unresolved-attribute]
|
||||
admin_service.members().list,
|
||||
list_key="members",
|
||||
groupKey=group_email,
|
||||
fields="members(email),nextPageToken",
|
||||
|
||||
@@ -33,7 +33,7 @@ def get_permissions_by_ids(
|
||||
|
||||
# Fetch all permissions for the document
|
||||
fetched_permissions = execute_paginated_retrieval(
|
||||
retrieval_function=drive_service.permissions().list, # ty: ignore[unresolved-attribute]
|
||||
retrieval_function=drive_service.permissions().list,
|
||||
list_key="permissions",
|
||||
fileId=doc_id,
|
||||
fields="permissions(id, emailAddress, type, domain, allowFileDiscovery, permissionDetails),nextPageToken",
|
||||
|
||||
@@ -68,7 +68,7 @@ def _build_holder_map(permissions: list[dict]) -> dict[str, list[Holder]]:
|
||||
logger.warning(f"Expected a 'raw' field, but none was found: {raw_perm=}")
|
||||
continue
|
||||
|
||||
permission = Permission(**raw_perm.raw) # ty: ignore[invalid-argument-type]
|
||||
permission = Permission(**raw_perm.raw)
|
||||
|
||||
# We only care about ability to browse through projects + issues (not other permissions such as read/write).
|
||||
if permission.permission != "BROWSE_PROJECTS":
|
||||
|
||||
@@ -1,6 +1,6 @@
|
||||
from collections.abc import Generator
|
||||
|
||||
from office365.sharepoint.client_context import ClientContext
|
||||
from office365.sharepoint.client_context import ClientContext # type: ignore[import-untyped]
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
from ee.onyx.external_permissions.sharepoint.permission_utils import (
|
||||
|
||||
@@ -7,11 +7,11 @@ from typing import Any
|
||||
from urllib.parse import urlparse
|
||||
|
||||
import requests as _requests
|
||||
from office365.graph_client import GraphClient
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem
|
||||
from office365.runtime.client_request import ClientRequestException
|
||||
from office365.sharepoint.client_context import ClientContext
|
||||
from office365.sharepoint.permissions.securable_object import RoleAssignmentCollection
|
||||
from office365.graph_client import GraphClient # type: ignore[import-untyped]
|
||||
from office365.onedrive.driveitems.driveItem import DriveItem # type: ignore[import-untyped]
|
||||
from office365.runtime.client_request import ClientRequestException # type: ignore
|
||||
from office365.sharepoint.client_context import ClientContext # type: ignore[import-untyped]
|
||||
from office365.sharepoint.permissions.securable_object import RoleAssignmentCollection # type: ignore[import-untyped]
|
||||
from pydantic import BaseModel
|
||||
|
||||
from ee.onyx.db.external_perm import ExternalUserGroup
|
||||
|
||||
@@ -46,10 +46,9 @@ def get_query_analytics(
|
||||
daily_query_usage_info = fetch_query_analytics(
|
||||
start=start
|
||||
or (
|
||||
datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
- datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
), # default is 30d lookback
|
||||
end=end or datetime.datetime.now(tz=datetime.timezone.utc),
|
||||
end=end or datetime.datetime.utcnow(),
|
||||
db_session=db_session,
|
||||
)
|
||||
return [
|
||||
@@ -78,10 +77,9 @@ def get_user_analytics(
|
||||
daily_query_usage_info_per_user = fetch_per_user_query_analytics(
|
||||
start=start
|
||||
or (
|
||||
datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
- datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
), # default is 30d lookback
|
||||
end=end or datetime.datetime.now(tz=datetime.timezone.utc),
|
||||
end=end or datetime.datetime.utcnow(),
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
@@ -113,10 +111,9 @@ def get_onyxbot_analytics(
|
||||
daily_onyxbot_info = fetch_onyxbot_analytics(
|
||||
start=start
|
||||
or (
|
||||
datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
- datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
), # default is 30d lookback
|
||||
end=end or datetime.datetime.now(tz=datetime.timezone.utc),
|
||||
end=end or datetime.datetime.utcnow(),
|
||||
db_session=db_session,
|
||||
)
|
||||
|
||||
@@ -149,10 +146,9 @@ def get_persona_messages(
|
||||
) -> list[PersonaMessageAnalyticsResponse]:
|
||||
"""Fetch daily message counts for a single persona within the given time range."""
|
||||
start = start or (
|
||||
datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
- datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
)
|
||||
end = end or datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
end = end or datetime.datetime.utcnow()
|
||||
|
||||
persona_message_counts = []
|
||||
for count, date in fetch_persona_message_analytics(
|
||||
@@ -230,10 +226,9 @@ def get_assistant_stats(
|
||||
along with the overall total messages and total distinct users.
|
||||
"""
|
||||
start = start or (
|
||||
datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
- datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
datetime.datetime.utcnow() - datetime.timedelta(days=_DEFAULT_LOOKBACK_DAYS)
|
||||
)
|
||||
end = end or datetime.datetime.now(tz=datetime.timezone.utc)
|
||||
end = end or datetime.datetime.utcnow()
|
||||
|
||||
if not user_can_view_assistant_stats(db_session, user, assistant_id):
|
||||
raise HTTPException(
|
||||
|
||||
@@ -287,10 +287,8 @@ def update_hook(
|
||||
validated_is_reachable: bool | None = None
|
||||
if endpoint_url_changing or api_key_changing or timeout_changing:
|
||||
existing = _get_hook_or_404(db_session, hook_id)
|
||||
effective_url: str = ( # ty: ignore[invalid-assignment]
|
||||
req.endpoint_url
|
||||
if endpoint_url_changing
|
||||
else existing.endpoint_url # endpoint_url is required on create and cannot be cleared on update
|
||||
effective_url: str = (
|
||||
req.endpoint_url if endpoint_url_changing else existing.endpoint_url # type: ignore[assignment] # endpoint_url is required on create and cannot be cleared on update
|
||||
)
|
||||
effective_api_key: str | None = (
|
||||
(api_key if not isinstance(api_key, UnsetType) else None)
|
||||
@@ -301,10 +299,8 @@ def update_hook(
|
||||
else None
|
||||
)
|
||||
)
|
||||
effective_timeout: float = ( # ty: ignore[invalid-assignment]
|
||||
req.timeout_seconds
|
||||
if timeout_changing
|
||||
else existing.timeout_seconds # req.timeout_seconds is non-None when timeout_changing (validated by HookUpdateRequest)
|
||||
effective_timeout: float = (
|
||||
req.timeout_seconds if timeout_changing else existing.timeout_seconds # type: ignore[assignment] # req.timeout_seconds is non-None when timeout_changing (validated by HookUpdateRequest)
|
||||
)
|
||||
validation = _validate_endpoint(
|
||||
endpoint_url=effective_url,
|
||||
|
||||
@@ -97,7 +97,7 @@ def fetch_and_process_chat_session_history(
|
||||
break
|
||||
|
||||
paged_snapshots = parallel_yield(
|
||||
[ # ty: ignore[invalid-argument-type]
|
||||
[
|
||||
yield_snapshot_from_chat_session(
|
||||
db_session=db_session,
|
||||
chat_session=chat_session,
|
||||
|
||||
@@ -1,6 +1,5 @@
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
|
||||
import jwt
|
||||
from fastapi import HTTPException
|
||||
@@ -20,8 +19,8 @@ def generate_data_plane_token() -> str:
|
||||
|
||||
payload = {
|
||||
"iss": "data_plane",
|
||||
"exp": datetime.now(tz=timezone.utc) + timedelta(minutes=5),
|
||||
"iat": datetime.now(tz=timezone.utc),
|
||||
"exp": datetime.utcnow() + timedelta(minutes=5),
|
||||
"iat": datetime.utcnow(),
|
||||
"scope": "api_access",
|
||||
}
|
||||
|
||||
|
||||
@@ -55,10 +55,8 @@ def run_alembic_migrations(schema_name: str) -> None:
|
||||
alembic_cfg.attributes["configure_logger"] = False
|
||||
|
||||
# Mimic command-line options by adding 'cmd_opts' to the config
|
||||
alembic_cfg.cmd_opts = SimpleNamespace() # ty: ignore[invalid-assignment]
|
||||
alembic_cfg.cmd_opts.x = [ # ty: ignore[invalid-assignment]
|
||||
f"schemas={schema_name}"
|
||||
]
|
||||
alembic_cfg.cmd_opts = SimpleNamespace() # type: ignore
|
||||
alembic_cfg.cmd_opts.x = [f"schemas={schema_name}"] # type: ignore
|
||||
|
||||
# Run migrations programmatically
|
||||
command.upgrade(alembic_cfg, "head")
|
||||
|
||||
@@ -349,9 +349,8 @@ def get_tenant_count(tenant_id: str) -> int:
|
||||
user_count = (
|
||||
db_session.query(User)
|
||||
.filter(
|
||||
User.email.in_(emails), # ty: ignore[unresolved-attribute]
|
||||
User.is_active # noqa: E712 # ty: ignore[invalid-argument-type]
|
||||
== True,
|
||||
User.email.in_(emails), # type: ignore
|
||||
User.is_active == True, # type: ignore # noqa: E712
|
||||
)
|
||||
.count()
|
||||
)
|
||||
|
||||
@@ -73,7 +73,7 @@ def capture_and_sync_with_alternate_posthog(
|
||||
cloud_props.pop("onyx_cloud_user_id", None)
|
||||
|
||||
posthog.identify(
|
||||
distinct_id=cloud_user_id, # ty: ignore[possibly-unresolved-reference]
|
||||
distinct_id=cloud_user_id,
|
||||
properties=cloud_props,
|
||||
)
|
||||
except Exception as e:
|
||||
@@ -105,7 +105,7 @@ def get_anon_id_from_request(request: Any) -> str | None:
|
||||
if (cookie_value := request.cookies.get(cookie_name)) and (
|
||||
parsed := parse_posthog_cookie(cookie_value)
|
||||
):
|
||||
return parsed.get("distinct_id") # ty: ignore[possibly-unresolved-reference]
|
||||
return parsed.get("distinct_id")
|
||||
|
||||
return None
|
||||
|
||||
|
||||
@@ -23,7 +23,7 @@
|
||||
# from shared_configs.model_server_models import IntentResponse
|
||||
|
||||
# if TYPE_CHECKING:
|
||||
# from setfit import SetFitModel
|
||||
# from setfit import SetFitModel # type: ignore[import-untyped]
|
||||
# from transformers import PreTrainedTokenizer, BatchEncoding
|
||||
|
||||
|
||||
@@ -423,7 +423,7 @@
|
||||
# def map_keywords(
|
||||
# input_ids: torch.Tensor, tokenizer: "PreTrainedTokenizer", is_keyword: list[bool]
|
||||
# ) -> list[str]:
|
||||
# tokens = tokenizer.convert_ids_to_tokens(input_ids)
|
||||
# tokens = tokenizer.convert_ids_to_tokens(input_ids) # type: ignore
|
||||
|
||||
# if not len(tokens) == len(is_keyword):
|
||||
# raise ValueError("Length of tokens and keyword predictions must match")
|
||||
|
||||
@@ -18,7 +18,7 @@
|
||||
# super().__init__()
|
||||
# config = DistilBertConfig()
|
||||
# self.distilbert = DistilBertModel(config)
|
||||
# config = self.distilbert.config
|
||||
# config = self.distilbert.config # type: ignore
|
||||
|
||||
# # Keyword tokenwise binary classification layer
|
||||
# self.keyword_classifier = nn.Linear(config.dim, 2)
|
||||
@@ -85,7 +85,7 @@
|
||||
|
||||
# self.config = config
|
||||
# self.distilbert = DistilBertModel(config)
|
||||
# config = self.distilbert.config
|
||||
# config = self.distilbert.config # type: ignore
|
||||
# self.connector_global_classifier = nn.Linear(config.dim, 1)
|
||||
# self.connector_match_classifier = nn.Linear(config.dim, 1)
|
||||
# self.tokenizer = DistilBertTokenizer.from_pretrained("distilbert-base-uncased")
|
||||
|
||||
@@ -96,14 +96,11 @@ def get_model_app() -> FastAPI:
|
||||
title="Onyx Model Server", version=__version__, lifespan=lifespan
|
||||
)
|
||||
if SENTRY_DSN:
|
||||
from onyx.configs.sentry import _add_instance_tags
|
||||
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
integrations=[StarletteIntegration(), FastApiIntegration()],
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
logger.info("Sentry initialized")
|
||||
else:
|
||||
|
||||
@@ -7,8 +7,8 @@ from email.mime.text import MIMEText
|
||||
from email.utils import formatdate
|
||||
from email.utils import make_msgid
|
||||
|
||||
import sendgrid
|
||||
from sendgrid.helpers.mail import Attachment
|
||||
import sendgrid # type: ignore
|
||||
from sendgrid.helpers.mail import Attachment # type: ignore
|
||||
from sendgrid.helpers.mail import Content
|
||||
from sendgrid.helpers.mail import ContentId
|
||||
from sendgrid.helpers.mail import Disposition
|
||||
|
||||
@@ -10,7 +10,7 @@ from cryptography.hazmat.primitives.asymmetric.rsa import RSAPublicKey
|
||||
from jwt import decode as jwt_decode
|
||||
from jwt import InvalidTokenError
|
||||
from jwt import PyJWTError
|
||||
from jwt.algorithms import RSAAlgorithm # ty: ignore[possibly-missing-import]
|
||||
from jwt.algorithms import RSAAlgorithm
|
||||
|
||||
from onyx.configs.app_configs import JWT_PUBLIC_KEY_URL
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
@@ -46,10 +46,8 @@ async def _test_expire_oauth_token(
|
||||
|
||||
updated_data: Dict[str, Any] = {"expires_at": new_expires_at}
|
||||
|
||||
await user_manager.user_db.update_oauth_account( # ty: ignore[invalid-argument-type]
|
||||
user, # ty: ignore[invalid-argument-type]
|
||||
cast(Any, oauth_account),
|
||||
updated_data,
|
||||
await user_manager.user_db.update_oauth_account(
|
||||
user, cast(Any, oauth_account), updated_data
|
||||
)
|
||||
|
||||
return True
|
||||
@@ -134,10 +132,8 @@ async def refresh_oauth_token(
|
||||
)
|
||||
|
||||
# Update the OAuth account
|
||||
await user_manager.user_db.update_oauth_account( # ty: ignore[invalid-argument-type]
|
||||
user, # ty: ignore[invalid-argument-type]
|
||||
cast(Any, oauth_account),
|
||||
updated_data,
|
||||
await user_manager.user_db.update_oauth_account(
|
||||
user, cast(Any, oauth_account), updated_data
|
||||
)
|
||||
|
||||
logger.info(f"Successfully refreshed OAuth token for {user.email}")
|
||||
|
||||
@@ -191,7 +191,7 @@ class OAuthTokenManager:
|
||||
@staticmethod
|
||||
def _unwrap_sensitive_str(value: SensitiveValue[str] | str) -> str:
|
||||
if isinstance(value, SensitiveValue):
|
||||
return value.get_value(apply_mask=False) # ty: ignore[invalid-return-type]
|
||||
return value.get_value(apply_mask=False)
|
||||
return value
|
||||
|
||||
@staticmethod
|
||||
@@ -199,7 +199,5 @@ class OAuthTokenManager:
|
||||
token_data: SensitiveValue[dict[str, Any]] | dict[str, Any],
|
||||
) -> dict[str, Any]:
|
||||
if isinstance(token_data, SensitiveValue):
|
||||
return token_data.get_value( # ty: ignore[invalid-return-type]
|
||||
apply_mask=False
|
||||
)
|
||||
return token_data.get_value(apply_mask=False)
|
||||
return token_data
|
||||
|
||||
@@ -121,7 +121,5 @@ def require_permission(
|
||||
|
||||
return user
|
||||
|
||||
dependency._is_require_permission = ( # ty: ignore[unresolved-attribute]
|
||||
True # sentinel for auth_check detection
|
||||
)
|
||||
dependency._is_require_permission = True # type: ignore[attr-defined] # sentinel for auth_check detection
|
||||
return dependency
|
||||
|
||||
@@ -45,9 +45,7 @@ from fastapi_users import UUIDIDMixin
|
||||
from fastapi_users.authentication import AuthenticationBackend
|
||||
from fastapi_users.authentication import CookieTransport
|
||||
from fastapi_users.authentication import JWTStrategy
|
||||
from fastapi_users.authentication import (
|
||||
RedisStrategy, # ty: ignore[possibly-missing-import]
|
||||
)
|
||||
from fastapi_users.authentication import RedisStrategy
|
||||
from fastapi_users.authentication import Strategy
|
||||
from fastapi_users.authentication.strategy.db import AccessTokenDatabase
|
||||
from fastapi_users.authentication.strategy.db import DatabaseStrategy
|
||||
@@ -464,16 +462,14 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
self.user_db = tenant_user_db
|
||||
|
||||
if hasattr(user_create, "role"):
|
||||
user_create.role = UserRole.BASIC # ty: ignore[invalid-assignment]
|
||||
user_create.role = UserRole.BASIC
|
||||
|
||||
user_count = await get_user_count()
|
||||
if (
|
||||
user_count == 0
|
||||
or user_create.email in get_default_admin_user_emails()
|
||||
):
|
||||
user_create.role = ( # ty: ignore[invalid-assignment]
|
||||
UserRole.ADMIN
|
||||
)
|
||||
user_create.role = UserRole.ADMIN
|
||||
|
||||
# Check seat availability for new users (single-tenant only)
|
||||
with get_session_with_current_tenant() as sync_db:
|
||||
@@ -520,9 +516,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
# Expire so the async session re-fetches the row updated by
|
||||
# the sync session above.
|
||||
self.user_db.session.expire(user)
|
||||
user = await self.user_db.get( # ty: ignore[invalid-assignment]
|
||||
user_id
|
||||
)
|
||||
user = await self.user_db.get(user_id) # type: ignore[assignment]
|
||||
except exceptions.UserAlreadyExists:
|
||||
user = await self.get_by_email(user_create.email)
|
||||
|
||||
@@ -550,9 +544,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
# Expire so the async session re-fetches the row updated by
|
||||
# the sync session above.
|
||||
self.user_db.session.expire(user)
|
||||
user = await self.user_db.get( # ty: ignore[invalid-assignment]
|
||||
user_id
|
||||
)
|
||||
user = await self.user_db.get(user_id) # type: ignore[assignment]
|
||||
if user_created:
|
||||
await self._assign_default_pinned_assistants(user, db_session)
|
||||
remove_user_from_invited_users(user_create.email)
|
||||
@@ -600,11 +592,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
update nor the group assignment is visible without the other.
|
||||
"""
|
||||
with get_session_with_current_tenant() as sync_db:
|
||||
sync_user = (
|
||||
sync_db.query(User)
|
||||
.filter(User.id == user_id) # ty: ignore[invalid-argument-type]
|
||||
.first()
|
||||
)
|
||||
sync_user = sync_db.query(User).filter(User.id == user_id).first() # type: ignore[arg-type]
|
||||
if sync_user:
|
||||
sync_user.hashed_password = self.password_helper.hash(
|
||||
user_create.password
|
||||
@@ -625,9 +613,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
user_id,
|
||||
)
|
||||
|
||||
async def validate_password( # ty: ignore[invalid-method-override]
|
||||
self, password: str, _: schemas.UC | models.UP
|
||||
) -> None:
|
||||
async def validate_password(self, password: str, _: schemas.UC | models.UP) -> None:
|
||||
# Validate password according to configurable security policy (defined via environment variables)
|
||||
if len(password) < PASSWORD_MIN_LENGTH:
|
||||
raise exceptions.InvalidPasswordException(
|
||||
@@ -658,7 +644,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
return
|
||||
|
||||
@log_function_time(print_only=True)
|
||||
async def oauth_callback( # ty: ignore[invalid-method-override]
|
||||
async def oauth_callback(
|
||||
self,
|
||||
oauth_name: str,
|
||||
access_token: str,
|
||||
@@ -768,7 +754,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
user,
|
||||
# NOTE: OAuthAccount DOES implement the OAuthAccountProtocol
|
||||
# but the type checker doesn't know that :(
|
||||
existing_oauth_account, # ty: ignore[invalid-argument-type]
|
||||
existing_oauth_account, # type: ignore
|
||||
oauth_account_dict,
|
||||
)
|
||||
|
||||
@@ -802,11 +788,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
# transaction so neither change is visible without the other.
|
||||
was_inactive = not user.is_active
|
||||
with get_session_with_current_tenant() as sync_db:
|
||||
sync_user = (
|
||||
sync_db.query(User)
|
||||
.filter(User.id == user.id) # ty: ignore[invalid-argument-type]
|
||||
.first()
|
||||
)
|
||||
sync_user = sync_db.query(User).filter(User.id == user.id).first() # type: ignore[arg-type]
|
||||
if sync_user:
|
||||
sync_user.is_verified = is_verified_by_default
|
||||
sync_user.role = UserRole.BASIC
|
||||
@@ -826,7 +808,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
# otherwise, the oidc expiry will always be old, and the user will never be able to login
|
||||
if user.oidc_expiry is not None and not TRACK_EXTERNAL_IDP_EXPIRY:
|
||||
await self.user_db.update(user, {"oidc_expiry": None})
|
||||
user.oidc_expiry = None # ty: ignore[invalid-assignment]
|
||||
user.oidc_expiry = None # type: ignore
|
||||
remove_user_from_invited_users(user.email)
|
||||
if token:
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.reset(token)
|
||||
@@ -943,11 +925,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
and (marketing_cookie_value := request.cookies.get(marketing_cookie_name))
|
||||
and (parsed_cookie := parse_posthog_cookie(marketing_cookie_value))
|
||||
):
|
||||
marketing_anonymous_id = (
|
||||
parsed_cookie[ # ty: ignore[possibly-unresolved-reference]
|
||||
"distinct_id"
|
||||
]
|
||||
)
|
||||
marketing_anonymous_id = parsed_cookie["distinct_id"]
|
||||
|
||||
# Technically, USER_SIGNED_UP is only fired from the cloud site when
|
||||
# it is the first user in a tenant. However, it is semantically correct
|
||||
@@ -964,10 +942,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
|
||||
}
|
||||
|
||||
# Add all other values from the marketing cookie (featureFlags, etc.)
|
||||
for (
|
||||
key,
|
||||
value,
|
||||
) in parsed_cookie.items(): # ty: ignore[possibly-unresolved-reference]
|
||||
for key, value in parsed_cookie.items():
|
||||
if key != "distinct_id":
|
||||
properties.setdefault(key, value)
|
||||
|
||||
@@ -1529,7 +1504,7 @@ async def _sync_jwt_oidc_expiry(
|
||||
|
||||
if user.oidc_expiry is not None:
|
||||
await user_manager.user_db.update(user, {"oidc_expiry": None})
|
||||
user.oidc_expiry = None # ty: ignore[invalid-assignment]
|
||||
user.oidc_expiry = None # type: ignore
|
||||
|
||||
|
||||
async def _get_or_create_user_from_jwt(
|
||||
@@ -2257,7 +2232,7 @@ def get_oauth_router(
|
||||
|
||||
# Proceed to authenticate or create the user
|
||||
try:
|
||||
user = await user_manager.oauth_callback( # ty: ignore[invalid-argument-type]
|
||||
user = await user_manager.oauth_callback(
|
||||
oauth_client.name,
|
||||
token["access_token"],
|
||||
account_id,
|
||||
|
||||
@@ -6,16 +6,16 @@ from typing import Any
|
||||
from typing import cast
|
||||
|
||||
import sentry_sdk
|
||||
from celery import bootsteps # ty: ignore[unresolved-import]
|
||||
from celery import bootsteps # type: ignore
|
||||
from celery import Task
|
||||
from celery.app import trace # ty: ignore[unresolved-import]
|
||||
from celery.app import trace
|
||||
from celery.exceptions import WorkerShutdown
|
||||
from celery.signals import before_task_publish
|
||||
from celery.signals import task_postrun
|
||||
from celery.signals import task_prerun
|
||||
from celery.states import READY_STATES
|
||||
from celery.utils.log import get_task_logger
|
||||
from celery.worker import strategy # ty: ignore[unresolved-import]
|
||||
from celery.worker import strategy # type: ignore
|
||||
from redis.lock import Lock as RedisLock
|
||||
from sentry_sdk.integrations.celery import CeleryIntegration
|
||||
from sqlalchemy import text
|
||||
@@ -30,7 +30,6 @@ from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_PREFI
|
||||
from onyx.background.celery.tasks.vespa.document_sync import DOCUMENT_SYNC_TASKSET_KEY
|
||||
from onyx.configs.app_configs import DISABLE_VECTOR_DB
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
|
||||
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
|
||||
from onyx.configs.constants import OnyxRedisLocks
|
||||
from onyx.db.engine.sql_engine import get_sqlalchemy_engine
|
||||
@@ -64,14 +63,11 @@ logger = setup_logger()
|
||||
task_logger = get_task_logger(__name__)
|
||||
|
||||
if SENTRY_DSN:
|
||||
from onyx.configs.sentry import _add_instance_tags
|
||||
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
integrations=[CeleryIntegration()],
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
logger.info("Sentry initialized")
|
||||
else:
|
||||
@@ -532,26 +528,23 @@ def reset_tenant_id(
|
||||
CURRENT_TENANT_ID_CONTEXTVAR.set(POSTGRES_DEFAULT_SCHEMA)
|
||||
|
||||
|
||||
def wait_for_document_index_or_shutdown() -> None:
|
||||
"""
|
||||
Waits for all configured document indices to become ready subject to a
|
||||
timeout.
|
||||
def wait_for_vespa_or_shutdown(
|
||||
sender: Any, # noqa: ARG001
|
||||
**kwargs: Any, # noqa: ARG001
|
||||
) -> None: # noqa: ARG001
|
||||
"""Waits for Vespa to become ready subject to a timeout.
|
||||
Raises WorkerShutdown if the timeout is reached."""
|
||||
|
||||
Raises WorkerShutdown if the timeout is reached.
|
||||
"""
|
||||
if DISABLE_VECTOR_DB:
|
||||
logger.info(
|
||||
"DISABLE_VECTOR_DB is set — skipping Vespa/OpenSearch readiness check."
|
||||
)
|
||||
return
|
||||
|
||||
if not ONYX_DISABLE_VESPA:
|
||||
if not wait_for_vespa_with_timeout():
|
||||
msg = (
|
||||
"[Vespa] Readiness probe did not succeed within the timeout. Exiting..."
|
||||
)
|
||||
logger.error(msg)
|
||||
raise WorkerShutdown(msg)
|
||||
if not wait_for_vespa_with_timeout():
|
||||
msg = "[Vespa] Readiness probe did not succeed within the timeout. Exiting..."
|
||||
logger.error(msg)
|
||||
raise WorkerShutdown(msg)
|
||||
|
||||
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
if not wait_for_opensearch_with_timeout():
|
||||
|
||||
@@ -3,7 +3,7 @@ from typing import Any
|
||||
|
||||
from celery import Celery
|
||||
from celery import signals
|
||||
from celery.beat import PersistentScheduler # ty: ignore[unresolved-import]
|
||||
from celery.beat import PersistentScheduler # type: ignore
|
||||
from celery.signals import beat_init
|
||||
from celery.utils.log import get_task_logger
|
||||
|
||||
|
||||
@@ -4,4 +4,4 @@ import onyx.background.celery.apps.app_base as app_base
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.client")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
@@ -29,7 +29,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.docfetching")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
@@ -100,12 +100,12 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
logger.info("worker_init signal received.")
|
||||
|
||||
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_DOCFETCHING_APP_NAME)
|
||||
pool_size = cast(int, sender.concurrency) # ty: ignore[unresolved-attribute]
|
||||
pool_size = cast(int, sender.concurrency) # type: ignore
|
||||
SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
app_base.wait_for_document_index_or_shutdown()
|
||||
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
|
||||
|
||||
# Less startup checks in multi-tenant case
|
||||
if MULTI_TENANT:
|
||||
|
||||
@@ -30,7 +30,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.docprocessing")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
@@ -106,12 +106,12 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
# "SSL connection has been closed unexpectedly"
|
||||
# actually setting the spawn method in the cloud fixes 95% of these.
|
||||
# setting pre ping might help even more, but not worrying about that yet
|
||||
pool_size = cast(int, sender.concurrency) # ty: ignore[unresolved-attribute]
|
||||
pool_size = cast(int, sender.concurrency) # type: ignore
|
||||
SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
app_base.wait_for_document_index_or_shutdown()
|
||||
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
|
||||
|
||||
# Less startup checks in multi-tenant case
|
||||
if MULTI_TENANT:
|
||||
|
||||
@@ -27,7 +27,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.heavy")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
@@ -92,12 +92,12 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
logger.info("worker_init signal received.")
|
||||
|
||||
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_HEAVY_APP_NAME)
|
||||
pool_size = cast(int, sender.concurrency) # ty: ignore[unresolved-attribute]
|
||||
pool_size = cast(int, sender.concurrency) # type: ignore
|
||||
SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
app_base.wait_for_document_index_or_shutdown()
|
||||
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
|
||||
|
||||
# Less startup checks in multi-tenant case
|
||||
if MULTI_TENANT:
|
||||
|
||||
@@ -29,7 +29,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.light")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
@@ -95,30 +95,23 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
|
||||
logger.info("worker_init signal received.")
|
||||
|
||||
logger.info(
|
||||
f"Concurrency: {sender.concurrency}" # ty: ignore[unresolved-attribute]
|
||||
)
|
||||
logger.info(f"Concurrency: {sender.concurrency}") # type: ignore
|
||||
|
||||
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_LIGHT_APP_NAME)
|
||||
SqlEngine.init_engine(
|
||||
pool_size=sender.concurrency, # ty: ignore[unresolved-attribute]
|
||||
max_overflow=EXTRA_CONCURRENCY,
|
||||
)
|
||||
SqlEngine.init_engine(pool_size=sender.concurrency, max_overflow=EXTRA_CONCURRENCY) # type: ignore
|
||||
|
||||
if MANAGED_VESPA:
|
||||
httpx_init_vespa_pool(
|
||||
sender.concurrency + EXTRA_CONCURRENCY, # ty: ignore[unresolved-attribute]
|
||||
sender.concurrency + EXTRA_CONCURRENCY, # type: ignore
|
||||
ssl_cert=VESPA_CLOUD_CERT_PATH,
|
||||
ssl_key=VESPA_CLOUD_KEY_PATH,
|
||||
)
|
||||
else:
|
||||
httpx_init_vespa_pool(
|
||||
sender.concurrency + EXTRA_CONCURRENCY # ty: ignore[unresolved-attribute]
|
||||
)
|
||||
httpx_init_vespa_pool(sender.concurrency + EXTRA_CONCURRENCY) # type: ignore
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
app_base.wait_for_document_index_or_shutdown()
|
||||
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
|
||||
|
||||
# Less startup checks in multi-tenant case
|
||||
if MULTI_TENANT:
|
||||
|
||||
@@ -20,7 +20,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.monitoring")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
|
||||
@@ -3,7 +3,7 @@ import os
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
|
||||
from celery import bootsteps # ty: ignore[unresolved-import]
|
||||
from celery import bootsteps # type: ignore
|
||||
from celery import Celery
|
||||
from celery import signals
|
||||
from celery import Task
|
||||
@@ -38,12 +38,6 @@ from onyx.redis.redis_connector_stop import RedisConnectorStop
|
||||
from onyx.redis.redis_document_set import RedisDocumentSet
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_usergroup import RedisUserGroup
|
||||
from onyx.server.metrics.celery_task_metrics import on_celery_task_postrun
|
||||
from onyx.server.metrics.celery_task_metrics import on_celery_task_prerun
|
||||
from onyx.server.metrics.celery_task_metrics import on_celery_task_rejected
|
||||
from onyx.server.metrics.celery_task_metrics import on_celery_task_retry
|
||||
from onyx.server.metrics.celery_task_metrics import on_celery_task_revoked
|
||||
from onyx.server.metrics.metrics_server import start_metrics_server
|
||||
from onyx.utils.logger import setup_logger
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
from shared_configs.configs import POSTGRES_DEFAULT_SCHEMA
|
||||
@@ -52,7 +46,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.primary")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
@@ -65,7 +59,6 @@ def on_task_prerun(
|
||||
**kwds: Any,
|
||||
) -> None:
|
||||
app_base.on_task_prerun(sender, task_id, task, args, kwargs, **kwds)
|
||||
on_celery_task_prerun(task_id, task)
|
||||
|
||||
|
||||
@signals.task_postrun.connect
|
||||
@@ -80,31 +73,6 @@ def on_task_postrun(
|
||||
**kwds: Any,
|
||||
) -> None:
|
||||
app_base.on_task_postrun(sender, task_id, task, args, kwargs, retval, state, **kwds)
|
||||
on_celery_task_postrun(task_id, task, state)
|
||||
|
||||
|
||||
@signals.task_retry.connect
|
||||
def on_task_retry(sender: Any | None = None, **kwargs: Any) -> None: # noqa: ARG001
|
||||
task_id = getattr(getattr(sender, "request", None), "id", None)
|
||||
on_celery_task_retry(task_id, sender)
|
||||
|
||||
|
||||
@signals.task_revoked.connect
|
||||
def on_task_revoked(sender: Any | None = None, **kwargs: Any) -> None:
|
||||
task_name = getattr(sender, "name", None) or str(sender)
|
||||
on_celery_task_revoked(kwargs.get("task_id"), task_name)
|
||||
|
||||
|
||||
@signals.task_rejected.connect
|
||||
def on_task_rejected(sender: Any | None = None, **kwargs: Any) -> None: # noqa: ARG001
|
||||
message = kwargs.get("message")
|
||||
task_name: str | None = None
|
||||
if message is not None:
|
||||
headers = getattr(message, "headers", None) or {}
|
||||
task_name = headers.get("task")
|
||||
if task_name is None:
|
||||
task_name = "unknown"
|
||||
on_celery_task_rejected(None, task_name)
|
||||
|
||||
|
||||
@celeryd_init.connect
|
||||
@@ -117,14 +85,14 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
logger.info("worker_init signal received.")
|
||||
|
||||
SqlEngine.set_app_name(POSTGRES_CELERY_WORKER_PRIMARY_APP_NAME)
|
||||
pool_size = cast(int, sender.concurrency) # ty: ignore[unresolved-attribute]
|
||||
pool_size = cast(int, sender.concurrency) # type: ignore
|
||||
SqlEngine.init_engine(
|
||||
pool_size=pool_size, max_overflow=CELERY_WORKER_PRIMARY_POOL_OVERFLOW
|
||||
)
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
app_base.wait_for_document_index_or_shutdown()
|
||||
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
|
||||
|
||||
logger.info(f"Running as the primary celery worker: pid={os.getpid()}")
|
||||
|
||||
@@ -177,7 +145,7 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
raise WorkerShutdown("Primary worker lock could not be acquired!")
|
||||
|
||||
# tacking on our own user data to the sender
|
||||
sender.primary_worker_lock = lock # ty: ignore[unresolved-attribute]
|
||||
sender.primary_worker_lock = lock # type: ignore
|
||||
|
||||
# As currently designed, when this worker starts as "primary", we reinitialize redis
|
||||
# to a clean state (for our purposes, anyway)
|
||||
@@ -244,7 +212,6 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
|
||||
@worker_ready.connect
|
||||
def on_worker_ready(sender: Any, **kwargs: Any) -> None:
|
||||
start_metrics_server("primary")
|
||||
app_base.on_worker_ready(sender, **kwargs)
|
||||
|
||||
|
||||
@@ -355,6 +322,7 @@ celery_app.autodiscover_tasks(
|
||||
"onyx.background.celery.tasks.vespa",
|
||||
"onyx.background.celery.tasks.llm_model_update",
|
||||
"onyx.background.celery.tasks.user_file_processing",
|
||||
"onyx.server.features.proposal_review.engine",
|
||||
]
|
||||
)
|
||||
)
|
||||
|
||||
@@ -22,7 +22,7 @@ logger = setup_logger()
|
||||
|
||||
celery_app = Celery(__name__)
|
||||
celery_app.config_from_object("onyx.background.celery.configs.user_file_processing")
|
||||
celery_app.Task = app_base.TenantAwareTask # ty: ignore[invalid-assignment]
|
||||
celery_app.Task = app_base.TenantAwareTask # type: ignore [misc]
|
||||
|
||||
|
||||
@signals.task_prerun.connect
|
||||
@@ -66,12 +66,12 @@ def on_worker_init(sender: Worker, **kwargs: Any) -> None:
|
||||
# "SSL connection has been closed unexpectedly"
|
||||
# actually setting the spawn method in the cloud fixes 95% of these.
|
||||
# setting pre ping might help even more, but not worrying about that yet
|
||||
pool_size = cast(int, sender.concurrency) # ty: ignore[unresolved-attribute]
|
||||
pool_size = cast(int, sender.concurrency) # type: ignore
|
||||
SqlEngine.init_engine(pool_size=pool_size, max_overflow=8)
|
||||
|
||||
app_base.wait_for_redis(sender, **kwargs)
|
||||
app_base.wait_for_db(sender, **kwargs)
|
||||
app_base.wait_for_document_index_or_shutdown()
|
||||
app_base.wait_for_vespa_or_shutdown(sender, **kwargs)
|
||||
|
||||
# Less startup checks in multi-tenant case
|
||||
if MULTI_TENANT:
|
||||
|
||||
@@ -179,7 +179,7 @@ def celery_inspect_get_workers(name_filter: str | None, app: Celery) -> list[str
|
||||
|
||||
# filter for and create an indexing specific inspect object
|
||||
inspect = app.control.inspect()
|
||||
workers: dict[str, Any] = inspect.ping() # ty: ignore[invalid-assignment]
|
||||
workers: dict[str, Any] = inspect.ping() # type: ignore
|
||||
if workers:
|
||||
for worker_name in list(workers.keys()):
|
||||
# if the name filter not set, return all worker names
|
||||
@@ -208,9 +208,7 @@ def celery_inspect_get_reserved(worker_names: list[str], app: Celery) -> set[str
|
||||
inspect = app.control.inspect(destination=worker_names)
|
||||
|
||||
# get the list of reserved tasks
|
||||
reserved_tasks: dict[str, list] | None = ( # ty: ignore[invalid-assignment]
|
||||
inspect.reserved()
|
||||
)
|
||||
reserved_tasks: dict[str, list] | None = inspect.reserved() # type: ignore
|
||||
if reserved_tasks:
|
||||
for _, task_list in reserved_tasks.items():
|
||||
for task in task_list:
|
||||
@@ -231,9 +229,7 @@ def celery_inspect_get_active(worker_names: list[str], app: Celery) -> set[str]:
|
||||
inspect = app.control.inspect(destination=worker_names)
|
||||
|
||||
# get the list of reserved tasks
|
||||
active_tasks: dict[str, list] | None = ( # ty: ignore[invalid-assignment]
|
||||
inspect.active()
|
||||
)
|
||||
active_tasks: dict[str, list] | None = inspect.active() # type: ignore
|
||||
if active_tasks:
|
||||
for _, task_list in active_tasks.items():
|
||||
for task in task_list:
|
||||
|
||||
@@ -6,11 +6,9 @@ from celery.schedules import crontab
|
||||
|
||||
from onyx.configs.app_configs import AUTO_LLM_CONFIG_URL
|
||||
from onyx.configs.app_configs import AUTO_LLM_UPDATE_INTERVAL_SECONDS
|
||||
from onyx.configs.app_configs import DISABLE_OPENSEARCH_MIGRATION_TASK
|
||||
from onyx.configs.app_configs import DISABLE_VECTOR_DB
|
||||
from onyx.configs.app_configs import ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
from onyx.configs.app_configs import ENTERPRISE_EDITION_ENABLED
|
||||
from onyx.configs.app_configs import ONYX_DISABLE_VESPA
|
||||
from onyx.configs.app_configs import SCHEDULED_EVAL_DATASET_NAMES
|
||||
from onyx.configs.constants import ONYX_CLOUD_CELERY_TASK_PREFIX
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
@@ -68,7 +66,6 @@ beat_task_templates: list[dict] = [
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -82,6 +79,15 @@ beat_task_templates: list[dict] = [
|
||||
"skip_gated": False,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "check-for-dangling-import-jobs",
|
||||
"task": OnyxCeleryTask.CHECK_FOR_DANGLING_IMPORT_JOBS,
|
||||
"schedule": timedelta(minutes=10),
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.LOW,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
},
|
||||
},
|
||||
{
|
||||
"name": "check-for-index-attempt-cleanup",
|
||||
"task": OnyxCeleryTask.CHECK_FOR_INDEX_ATTEMPT_CLEANUP,
|
||||
@@ -102,7 +108,6 @@ beat_task_templates: list[dict] = [
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
# Gated tenants may still have connectors awaiting deletion.
|
||||
"skip_gated": False,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -112,7 +117,6 @@ beat_task_templates: list[dict] = [
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -122,7 +126,6 @@ beat_task_templates: list[dict] = [
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -160,7 +163,6 @@ beat_task_templates: list[dict] = [
|
||||
"priority": OnyxCeleryPriority.LOW,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"queue": OnyxCeleryQueues.SANDBOX,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -185,7 +187,6 @@ if ENTERPRISE_EDITION_ENABLED:
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
{
|
||||
@@ -195,7 +196,6 @@ if ENTERPRISE_EDITION_ENABLED:
|
||||
"options": {
|
||||
"priority": OnyxCeleryPriority.MEDIUM,
|
||||
"expires": BEAT_EXPIRES_DEFAULT,
|
||||
"work_gated": True,
|
||||
},
|
||||
},
|
||||
]
|
||||
@@ -235,11 +235,7 @@ if SCHEDULED_EVAL_DATASET_NAMES:
|
||||
)
|
||||
|
||||
# Add OpenSearch migration task if enabled.
|
||||
if (
|
||||
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
and not DISABLE_OPENSEARCH_MIGRATION_TASK
|
||||
and not ONYX_DISABLE_VESPA
|
||||
):
|
||||
if ENABLE_OPENSEARCH_INDEXING_FOR_ONYX:
|
||||
beat_task_templates.append(
|
||||
{
|
||||
"name": "migrate-chunks-from-vespa-to-opensearch",
|
||||
@@ -292,7 +288,7 @@ def make_cloud_generator_task(task: dict[str, Any]) -> dict[str, Any]:
|
||||
cloud_task["kwargs"] = {}
|
||||
cloud_task["kwargs"]["task_name"] = task["task"]
|
||||
|
||||
optional_fields = ["queue", "priority", "expires", "skip_gated", "work_gated"]
|
||||
optional_fields = ["queue", "priority", "expires", "skip_gated"]
|
||||
for field in optional_fields:
|
||||
if field in task["options"]:
|
||||
cloud_task["kwargs"][field] = task["options"][field]
|
||||
@@ -385,14 +381,12 @@ if not MULTI_TENANT:
|
||||
]
|
||||
)
|
||||
|
||||
# `skip_gated` and `work_gated` are cloud-only hints consumed by
|
||||
# `cloud_beat_task_generator`. Strip them before extending the self-hosted
|
||||
# schedule so they don't leak into apply_async as unrecognised options on
|
||||
# every fired task message.
|
||||
# `skip_gated` is a cloud-only hint consumed by `cloud_beat_task_generator`. Strip
|
||||
# it before extending the self-hosted schedule so it doesn't leak into apply_async
|
||||
# as an unrecognised option on every fired task message.
|
||||
for _template in beat_task_templates:
|
||||
_self_hosted_template = copy.deepcopy(_template)
|
||||
_self_hosted_template["options"].pop("skip_gated", None)
|
||||
_self_hosted_template["options"].pop("work_gated", None)
|
||||
tasks_to_schedule.append(_self_hosted_template)
|
||||
|
||||
|
||||
|
||||
@@ -59,7 +59,6 @@ from onyx.redis.redis_connector_delete import RedisConnectorDelete
|
||||
from onyx.redis.redis_connector_delete import RedisConnectorDeletePayload
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import get_redis_replica_client
|
||||
from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
|
||||
from onyx.server.metrics.deletion_metrics import inc_deletion_blocked
|
||||
from onyx.server.metrics.deletion_metrics import inc_deletion_completed
|
||||
from onyx.server.metrics.deletion_metrics import inc_deletion_fence_reset
|
||||
@@ -166,22 +165,12 @@ def check_for_connector_deletion_task(self: Task, *, tenant_id: str) -> bool | N
|
||||
|
||||
r.set(OnyxRedisSignals.BLOCK_VALIDATE_CONNECTOR_DELETION_FENCES, 1, ex=300)
|
||||
|
||||
# collect cc_pair_ids and note whether any are in DELETING status
|
||||
# collect cc_pair_ids
|
||||
cc_pair_ids: list[int] = []
|
||||
has_deleting_cc_pair = False
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
cc_pairs = get_connector_credential_pairs(db_session)
|
||||
for cc_pair in cc_pairs:
|
||||
cc_pair_ids.append(cc_pair.id)
|
||||
if cc_pair.status == ConnectorCredentialPairStatus.DELETING:
|
||||
has_deleting_cc_pair = True
|
||||
|
||||
# Tenant-work-gating hook: mark only when at least one cc_pair is in
|
||||
# DELETING status. Marking on bare cc_pair existence would keep
|
||||
# nearly every tenant in the active set since most have cc_pairs
|
||||
# but almost none are actively being deleted on any given cycle.
|
||||
if has_deleting_cc_pair:
|
||||
maybe_mark_tenant_active(tenant_id)
|
||||
|
||||
# try running cleanup on the cc_pair_ids
|
||||
for cc_pair_id in cc_pair_ids:
|
||||
|
||||
@@ -34,7 +34,6 @@ from onyx.db.index_attempt import mark_attempt_canceled
|
||||
from onyx.db.index_attempt import mark_attempt_failed
|
||||
from onyx.db.indexing_coordination import IndexingCoordination
|
||||
from onyx.redis.redis_connector import RedisConnector
|
||||
from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.variable_functionality import global_version
|
||||
from shared_configs.configs import SENTRY_DSN
|
||||
@@ -136,13 +135,10 @@ def _docfetching_task(
|
||||
# Since connector_indexing_proxy_task spawns a new process using this function as
|
||||
# the entrypoint, we init Sentry here.
|
||||
if SENTRY_DSN:
|
||||
from onyx.configs.sentry import _add_instance_tags
|
||||
|
||||
sentry_sdk.init(
|
||||
dsn=SENTRY_DSN,
|
||||
traces_sample_rate=0.1,
|
||||
release=__version__,
|
||||
before_send=_add_instance_tags,
|
||||
)
|
||||
logger.info("Sentry initialized")
|
||||
else:
|
||||
@@ -471,15 +467,6 @@ def docfetching_proxy_task(
|
||||
index_attempt.connector_credential_pair.connector.source.value
|
||||
)
|
||||
|
||||
cc_pair = index_attempt.connector_credential_pair
|
||||
on_index_attempt_status_change(
|
||||
tenant_id=tenant_id,
|
||||
source=result.connector_source,
|
||||
cc_pair_id=cc_pair_id,
|
||||
connector_name=cc_pair.connector.name or f"cc_pair_{cc_pair_id}",
|
||||
status="in_progress",
|
||||
)
|
||||
|
||||
while True:
|
||||
sleep(5)
|
||||
|
||||
|
||||
@@ -3,7 +3,6 @@ import os
|
||||
import time
|
||||
import traceback
|
||||
from collections import defaultdict
|
||||
from dataclasses import dataclass
|
||||
from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
@@ -51,7 +50,6 @@ from onyx.configs.constants import AuthType
|
||||
from onyx.configs.constants import CELERY_GENERIC_BEAT_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import CELERY_INDEXING_LOCK_TIMEOUT
|
||||
from onyx.configs.constants import MilestoneRecordType
|
||||
from onyx.configs.constants import NotificationType
|
||||
from onyx.configs.constants import OnyxCeleryPriority
|
||||
from onyx.configs.constants import OnyxCeleryQueues
|
||||
from onyx.configs.constants import OnyxCeleryTask
|
||||
@@ -87,8 +85,6 @@ from onyx.db.indexing_coordination import INDEXING_PROGRESS_TIMEOUT_HOURS
|
||||
from onyx.db.indexing_coordination import IndexingCoordination
|
||||
from onyx.db.models import IndexAttempt
|
||||
from onyx.db.models import SearchSettings
|
||||
from onyx.db.notification import create_notification
|
||||
from onyx.db.notification import get_notifications
|
||||
from onyx.db.search_settings import get_current_search_settings
|
||||
from onyx.db.search_settings import get_secondary_search_settings
|
||||
from onyx.db.swap_index import check_and_perform_index_swap
|
||||
@@ -108,11 +104,7 @@ from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import get_redis_replica_client
|
||||
from onyx.redis.redis_pool import redis_lock_dump
|
||||
from onyx.redis.redis_pool import SCAN_ITER_COUNT_DEFAULT
|
||||
from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
|
||||
from onyx.redis.redis_utils import is_fence
|
||||
from onyx.server.metrics.connector_health_metrics import on_connector_error_state_change
|
||||
from onyx.server.metrics.connector_health_metrics import on_connector_indexing_success
|
||||
from onyx.server.metrics.connector_health_metrics import on_index_attempt_status_change
|
||||
from onyx.server.runtime.onyx_runtime import OnyxRuntime
|
||||
from onyx.utils.logger import setup_logger
|
||||
from onyx.utils.middleware import make_randomized_onyx_request_id
|
||||
@@ -408,6 +400,7 @@ def check_indexing_completion(
|
||||
tenant_id: str,
|
||||
task: Task,
|
||||
) -> None:
|
||||
|
||||
logger.info(
|
||||
f"Checking for indexing completion: attempt={index_attempt_id} tenant={tenant_id}"
|
||||
)
|
||||
@@ -528,25 +521,13 @@ def check_indexing_completion(
|
||||
|
||||
# Update CC pair status if successful
|
||||
cc_pair = get_connector_credential_pair_from_id(
|
||||
db_session,
|
||||
attempt.connector_credential_pair_id,
|
||||
eager_load_connector=True,
|
||||
db_session, attempt.connector_credential_pair_id
|
||||
)
|
||||
if cc_pair is None:
|
||||
raise RuntimeError(
|
||||
f"CC pair {attempt.connector_credential_pair_id} not found in database"
|
||||
)
|
||||
|
||||
source = cc_pair.connector.source.value
|
||||
connector_name = cc_pair.connector.name or f"cc_pair_{cc_pair.id}"
|
||||
on_index_attempt_status_change(
|
||||
tenant_id=tenant_id,
|
||||
source=source,
|
||||
cc_pair_id=cc_pair.id,
|
||||
connector_name=connector_name,
|
||||
status=attempt.status.value,
|
||||
)
|
||||
|
||||
if attempt.status.is_successful():
|
||||
# NOTE: we define the last successful index time as the time the last successful
|
||||
# attempt finished. This is distinct from the poll_range_end of the last successful
|
||||
@@ -567,41 +548,10 @@ def check_indexing_completion(
|
||||
event=MilestoneRecordType.CONNECTOR_SUCCEEDED,
|
||||
)
|
||||
|
||||
on_connector_indexing_success(
|
||||
tenant_id=tenant_id,
|
||||
source=source,
|
||||
cc_pair_id=cc_pair.id,
|
||||
connector_name=connector_name,
|
||||
docs_indexed=attempt.new_docs_indexed or 0,
|
||||
success_timestamp=attempt.time_updated.timestamp(),
|
||||
)
|
||||
|
||||
# Clear repeated error state on success
|
||||
if cc_pair.in_repeated_error_state:
|
||||
cc_pair.in_repeated_error_state = False
|
||||
|
||||
# Delete any existing error notification for this CC pair so a
|
||||
# fresh one is created if the connector fails again later.
|
||||
for notif in get_notifications(
|
||||
user=None,
|
||||
db_session=db_session,
|
||||
notif_type=NotificationType.CONNECTOR_REPEATED_ERRORS,
|
||||
include_dismissed=True,
|
||||
):
|
||||
if (
|
||||
notif.additional_data
|
||||
and notif.additional_data.get("cc_pair_id") == cc_pair.id
|
||||
):
|
||||
db_session.delete(notif)
|
||||
|
||||
db_session.commit()
|
||||
on_connector_error_state_change(
|
||||
tenant_id=tenant_id,
|
||||
source=source,
|
||||
cc_pair_id=cc_pair.id,
|
||||
connector_name=connector_name,
|
||||
in_error=False,
|
||||
)
|
||||
|
||||
if attempt.status == IndexingStatus.SUCCESS:
|
||||
logger.info(
|
||||
@@ -658,27 +608,6 @@ def active_indexing_attempt(
|
||||
return bool(active_indexing_attempt)
|
||||
|
||||
|
||||
@dataclass
|
||||
class _KickoffResult:
|
||||
"""Tracks diagnostic counts from a _kickoff_indexing_tasks run."""
|
||||
|
||||
created: int = 0
|
||||
skipped_active: int = 0
|
||||
skipped_not_found: int = 0
|
||||
skipped_not_indexable: int = 0
|
||||
failed_to_create: int = 0
|
||||
|
||||
@property
|
||||
def evaluated(self) -> int:
|
||||
return (
|
||||
self.created
|
||||
+ self.skipped_active
|
||||
+ self.skipped_not_found
|
||||
+ self.skipped_not_indexable
|
||||
+ self.failed_to_create
|
||||
)
|
||||
|
||||
|
||||
def _kickoff_indexing_tasks(
|
||||
celery_app: Celery,
|
||||
db_session: Session,
|
||||
@@ -688,12 +617,12 @@ def _kickoff_indexing_tasks(
|
||||
redis_client: Redis,
|
||||
lock_beat: RedisLock,
|
||||
tenant_id: str,
|
||||
) -> _KickoffResult:
|
||||
) -> int:
|
||||
"""Kick off indexing tasks for the given cc_pair_ids and search_settings.
|
||||
|
||||
Returns a _KickoffResult with diagnostic counts.
|
||||
Returns the number of tasks successfully created.
|
||||
"""
|
||||
result = _KickoffResult()
|
||||
tasks_created = 0
|
||||
|
||||
for cc_pair_id in cc_pair_ids:
|
||||
lock_beat.reacquire()
|
||||
@@ -704,7 +633,6 @@ def _kickoff_indexing_tasks(
|
||||
search_settings_id=search_settings.id,
|
||||
db_session=db_session,
|
||||
):
|
||||
result.skipped_active += 1
|
||||
continue
|
||||
|
||||
cc_pair = get_connector_credential_pair_from_id(
|
||||
@@ -715,7 +643,6 @@ def _kickoff_indexing_tasks(
|
||||
task_logger.warning(
|
||||
f"_kickoff_indexing_tasks - CC pair not found: cc_pair={cc_pair_id}"
|
||||
)
|
||||
result.skipped_not_found += 1
|
||||
continue
|
||||
|
||||
# Heavyweight check after fetching cc pair
|
||||
@@ -730,7 +657,6 @@ def _kickoff_indexing_tasks(
|
||||
f"search_settings={search_settings.id}, "
|
||||
f"secondary_index_building={secondary_index_building}"
|
||||
)
|
||||
result.skipped_not_indexable += 1
|
||||
continue
|
||||
|
||||
task_logger.debug(
|
||||
@@ -770,14 +696,13 @@ def _kickoff_indexing_tasks(
|
||||
task_logger.info(
|
||||
f"Connector indexing queued: index_attempt={attempt_id} cc_pair={cc_pair.id} search_settings={search_settings.id}"
|
||||
)
|
||||
result.created += 1
|
||||
tasks_created += 1
|
||||
else:
|
||||
task_logger.error(
|
||||
f"Failed to create indexing task: cc_pair={cc_pair.id} search_settings={search_settings.id}"
|
||||
)
|
||||
result.failed_to_create += 1
|
||||
|
||||
return result
|
||||
return tasks_created
|
||||
|
||||
|
||||
@shared_task(
|
||||
@@ -803,15 +728,13 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
task_logger.warning("check_for_indexing - Starting")
|
||||
|
||||
tasks_created = 0
|
||||
primary_result = _KickoffResult()
|
||||
secondary_result: _KickoffResult | None = None
|
||||
locked = False
|
||||
redis_client = get_redis_client()
|
||||
redis_client_replica = get_redis_replica_client()
|
||||
|
||||
# we need to use celery's redis client to access its redis data
|
||||
# (which lives on a different db number)
|
||||
# redis_client_celery: Redis = self.app.broker_connection().channel().client
|
||||
# redis_client_celery: Redis = self.app.broker_connection().channel().client # type: ignore
|
||||
|
||||
lock_beat: RedisLock = redis_client.lock(
|
||||
OnyxRedisLocks.CHECK_INDEXING_BEAT_LOCK,
|
||||
@@ -925,43 +848,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
cc_pair_id=cc_pair_id,
|
||||
in_repeated_error_state=True,
|
||||
)
|
||||
error_connector_name = (
|
||||
cc_pair.connector.name or f"cc_pair_{cc_pair.id}"
|
||||
)
|
||||
on_connector_error_state_change(
|
||||
tenant_id=tenant_id,
|
||||
source=cc_pair.connector.source.value,
|
||||
cc_pair_id=cc_pair_id,
|
||||
connector_name=error_connector_name,
|
||||
in_error=True,
|
||||
)
|
||||
|
||||
connector_name = (
|
||||
cc_pair.name
|
||||
or cc_pair.connector.name
|
||||
or f"CC pair {cc_pair.id}"
|
||||
)
|
||||
source = cc_pair.connector.source.value
|
||||
connector_url = f"/admin/connector/{cc_pair.id}"
|
||||
create_notification(
|
||||
user_id=None,
|
||||
notif_type=NotificationType.CONNECTOR_REPEATED_ERRORS,
|
||||
db_session=db_session,
|
||||
title=f"Connector '{connector_name}' has entered repeated error state",
|
||||
description=(
|
||||
f"The {source} connector has failed repeatedly and "
|
||||
f"has been flagged. View indexing history in the "
|
||||
f"Advanced section: {connector_url}"
|
||||
),
|
||||
additional_data={"cc_pair_id": cc_pair.id},
|
||||
)
|
||||
|
||||
task_logger.error(
|
||||
f"Connector entered repeated error state: "
|
||||
f"cc_pair={cc_pair.id} "
|
||||
f"connector={cc_pair.connector.name} "
|
||||
f"source={source}"
|
||||
)
|
||||
# When entering repeated error state, also pause the connector
|
||||
# to prevent continued indexing retry attempts burning through embedding credits.
|
||||
# NOTE: only for Cloud, since most self-hosted users use self-hosted embedding
|
||||
@@ -977,7 +863,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
# Heavy check, should_index(), is called in _kickoff_indexing_tasks
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
# Primary first
|
||||
primary_result = _kickoff_indexing_tasks(
|
||||
tasks_created += _kickoff_indexing_tasks(
|
||||
celery_app=self.app,
|
||||
db_session=db_session,
|
||||
search_settings=current_search_settings,
|
||||
@@ -987,7 +873,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
lock_beat=lock_beat,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
tasks_created += primary_result.created
|
||||
|
||||
# Secondary indexing (only if secondary search settings exist and switchover_type is not INSTANT)
|
||||
if (
|
||||
@@ -995,7 +880,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
and secondary_search_settings.switchover_type != SwitchoverType.INSTANT
|
||||
and secondary_cc_pair_ids
|
||||
):
|
||||
secondary_result = _kickoff_indexing_tasks(
|
||||
tasks_created += _kickoff_indexing_tasks(
|
||||
celery_app=self.app,
|
||||
db_session=db_session,
|
||||
search_settings=secondary_search_settings,
|
||||
@@ -1005,7 +890,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
lock_beat=lock_beat,
|
||||
tenant_id=tenant_id,
|
||||
)
|
||||
tasks_created += secondary_result.created
|
||||
elif (
|
||||
secondary_search_settings
|
||||
and secondary_search_settings.switchover_type == SwitchoverType.INSTANT
|
||||
@@ -1014,14 +898,6 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
f"Skipping secondary indexing: switchover_type=INSTANT for search_settings={secondary_search_settings.id}"
|
||||
)
|
||||
|
||||
# Tenant-work-gating hook: refresh membership only when indexing
|
||||
# actually dispatched at least one docfetching task. `_kickoff_indexing_tasks`
|
||||
# internally calls `should_index()` to decide per-cc_pair; using
|
||||
# `tasks_created > 0` here gives us a "real work was done" signal
|
||||
# rather than just "tenant has a cc_pair somewhere."
|
||||
if tasks_created > 0:
|
||||
maybe_mark_tenant_active(tenant_id)
|
||||
|
||||
# 2/3: VALIDATE
|
||||
# Check for inconsistent index attempts - active attempts without task IDs
|
||||
# This can happen if attempt creation fails partway through
|
||||
@@ -1126,26 +1002,7 @@ def check_for_indexing(self: Task, *, tenant_id: str) -> int | None:
|
||||
redis_lock_dump(lock_beat, redis_client)
|
||||
|
||||
time_elapsed = time.monotonic() - time_start
|
||||
task_logger.info(
|
||||
f"check_for_indexing finished: "
|
||||
f"elapsed={time_elapsed:.2f}s "
|
||||
f"primary=[evaluated={primary_result.evaluated} "
|
||||
f"created={primary_result.created} "
|
||||
f"skipped_active={primary_result.skipped_active} "
|
||||
f"skipped_not_found={primary_result.skipped_not_found} "
|
||||
f"skipped_not_indexable={primary_result.skipped_not_indexable} "
|
||||
f"failed={primary_result.failed_to_create}]"
|
||||
+ (
|
||||
f" secondary=[evaluated={secondary_result.evaluated} "
|
||||
f"created={secondary_result.created} "
|
||||
f"skipped_active={secondary_result.skipped_active} "
|
||||
f"skipped_not_found={secondary_result.skipped_not_found} "
|
||||
f"skipped_not_indexable={secondary_result.skipped_not_indexable} "
|
||||
f"failed={secondary_result.failed_to_create}]"
|
||||
if secondary_result
|
||||
else ""
|
||||
)
|
||||
)
|
||||
task_logger.info(f"check_for_indexing finished: elapsed={time_elapsed:.2f}")
|
||||
return tasks_created
|
||||
|
||||
|
||||
|
||||
@@ -51,6 +51,7 @@ from onyx.db.enums import ConnectorCredentialPairStatus
|
||||
from onyx.db.enums import SyncStatus
|
||||
from onyx.db.enums import SyncType
|
||||
from onyx.db.hierarchy import delete_orphaned_hierarchy_nodes
|
||||
from onyx.db.hierarchy import link_hierarchy_nodes_to_documents
|
||||
from onyx.db.hierarchy import remove_stale_hierarchy_node_cc_pair_entries
|
||||
from onyx.db.hierarchy import reparent_orphaned_hierarchy_nodes
|
||||
from onyx.db.hierarchy import update_document_parent_hierarchy_nodes
|
||||
@@ -72,7 +73,6 @@ from onyx.redis.redis_hierarchy import get_source_node_id_from_cache
|
||||
from onyx.redis.redis_hierarchy import HierarchyNodeCacheEntry
|
||||
from onyx.redis.redis_pool import get_redis_client
|
||||
from onyx.redis.redis_pool import get_redis_replica_client
|
||||
from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
|
||||
from onyx.server.metrics.pruning_metrics import observe_pruning_diff_duration
|
||||
from onyx.server.runtime.onyx_runtime import OnyxRuntime
|
||||
from onyx.server.utils import make_short_id
|
||||
@@ -229,7 +229,6 @@ def check_for_pruning(self: Task, *, tenant_id: str) -> bool | None:
|
||||
for cc_pair_entry in cc_pairs:
|
||||
cc_pair_ids.append(cc_pair_entry.id)
|
||||
|
||||
prune_dispatched = False
|
||||
for cc_pair_id in cc_pair_ids:
|
||||
lock_beat.reacquire()
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
@@ -252,18 +251,9 @@ def check_for_pruning(self: Task, *, tenant_id: str) -> bool | None:
|
||||
logger.info(f"Pruning not created: {cc_pair_id}")
|
||||
continue
|
||||
|
||||
prune_dispatched = True
|
||||
task_logger.info(
|
||||
f"Pruning queued: cc_pair={cc_pair.id} id={payload_id}"
|
||||
)
|
||||
|
||||
# Tenant-work-gating hook: mark only when at least one cc_pair
|
||||
# was actually due for pruning AND a prune task was dispatched.
|
||||
# Marking on bare cc_pair existence over-counts the population
|
||||
# since most tenants have cc_pairs but almost none are due on
|
||||
# any given cycle.
|
||||
if prune_dispatched:
|
||||
maybe_mark_tenant_active(tenant_id)
|
||||
r.set(OnyxRedisSignals.BLOCK_PRUNING, 1, ex=_get_pruning_block_expiration())
|
||||
|
||||
# we want to run this less frequently than the overall task
|
||||
@@ -653,6 +643,16 @@ def connector_pruning_generator_task(
|
||||
raw_id_to_parent=all_connector_doc_ids,
|
||||
)
|
||||
|
||||
# Link hierarchy nodes to documents for sources where pages can be
|
||||
# both hierarchy nodes AND documents (e.g. Notion, Confluence)
|
||||
all_doc_id_list = list(all_connector_doc_ids.keys())
|
||||
link_hierarchy_nodes_to_documents(
|
||||
db_session=db_session,
|
||||
document_ids=all_doc_id_list,
|
||||
source=source,
|
||||
commit=True,
|
||||
)
|
||||
|
||||
diff_start = time.monotonic()
|
||||
try:
|
||||
# a list of docs in our local index
|
||||
|
||||
@@ -248,7 +248,6 @@ def document_by_cc_pair_cleanup_task(
|
||||
),
|
||||
)
|
||||
mark_document_as_modified(document_id, db_session)
|
||||
db_session.commit()
|
||||
completion_status = (
|
||||
OnyxCeleryTaskCompletionStatus.NON_RETRYABLE_EXCEPTION
|
||||
)
|
||||
|
||||
@@ -15,7 +15,6 @@ from onyx.configs.constants import OnyxCeleryTask
|
||||
from onyx.configs.constants import OnyxRedisConstants
|
||||
from onyx.db.document import construct_document_id_select_by_needs_sync
|
||||
from onyx.db.document import count_documents_by_needs_sync
|
||||
from onyx.redis.redis_tenant_work_gating import maybe_mark_tenant_active
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
# Redis keys for document sync tracking
|
||||
@@ -151,10 +150,6 @@ def try_generate_stale_document_sync_tasks(
|
||||
logger.info("No stale documents found. Skipping sync tasks generation.")
|
||||
return None
|
||||
|
||||
# Tenant-work-gating hook: refresh this tenant's active-set membership
|
||||
# whenever vespa sync actually has stale docs to dispatch.
|
||||
maybe_mark_tenant_active(tenant_id)
|
||||
|
||||
logger.info(
|
||||
f"Stale documents found (at least {stale_doc_count}). Generating sync tasks in one batch."
|
||||
)
|
||||
|
||||
@@ -61,9 +61,7 @@ def load_checkpoint(
|
||||
checkpoint_io = file_store.read_file(checkpoint_pointer, mode="rb")
|
||||
checkpoint_data = checkpoint_io.read().decode("utf-8")
|
||||
if isinstance(connector, CheckpointedConnector):
|
||||
return connector.validate_checkpoint_json( # ty: ignore[invalid-return-type]
|
||||
checkpoint_data
|
||||
)
|
||||
return connector.validate_checkpoint_json(checkpoint_data)
|
||||
return ConnectorCheckpoint.model_validate_json(checkpoint_data)
|
||||
|
||||
|
||||
|
||||
@@ -5,7 +5,6 @@ from datetime import datetime
|
||||
from datetime import timedelta
|
||||
from datetime import timezone
|
||||
|
||||
import sentry_sdk
|
||||
from celery import Celery
|
||||
from sqlalchemy.orm import Session
|
||||
|
||||
@@ -557,27 +556,6 @@ def connector_document_extraction(
|
||||
|
||||
# save record of any failures at the connector level
|
||||
if failure is not None:
|
||||
if failure.exception is not None:
|
||||
with sentry_sdk.new_scope() as scope:
|
||||
scope.set_tag("stage", "connector_fetch")
|
||||
scope.set_tag("connector_source", db_connector.source.value)
|
||||
scope.set_tag("cc_pair_id", str(cc_pair_id))
|
||||
scope.set_tag("index_attempt_id", str(index_attempt_id))
|
||||
scope.set_tag("tenant_id", tenant_id)
|
||||
if failure.failed_document:
|
||||
scope.set_tag(
|
||||
"doc_id", failure.failed_document.document_id
|
||||
)
|
||||
if failure.failed_entity:
|
||||
scope.set_tag(
|
||||
"entity_id", failure.failed_entity.entity_id
|
||||
)
|
||||
scope.fingerprint = [
|
||||
"connector-fetch-failure",
|
||||
db_connector.source.value,
|
||||
type(failure.exception).__name__,
|
||||
]
|
||||
sentry_sdk.capture_exception(failure.exception)
|
||||
total_failures += 1
|
||||
with get_session_with_current_tenant() as db_session:
|
||||
create_index_attempt_error(
|
||||
|
||||
@@ -1164,10 +1164,7 @@ def run_llm_loop(
|
||||
|
||||
emitter.emit(
|
||||
Packet(
|
||||
placement=Placement(
|
||||
turn_index=llm_cycle_count # ty: ignore[possibly-unresolved-reference]
|
||||
+ reasoning_cycles
|
||||
),
|
||||
placement=Placement(turn_index=llm_cycle_count + reasoning_cycles),
|
||||
obj=OverallStop(type="stop"),
|
||||
)
|
||||
)
|
||||
|
||||
@@ -826,12 +826,6 @@ def translate_history_to_llm_format(
|
||||
base64_data = img_file.to_base64()
|
||||
image_url = f"data:{image_type};base64,{base64_data}"
|
||||
|
||||
content_parts.append(
|
||||
TextContentPart(
|
||||
type="text",
|
||||
text=f"[attached image — file_id: {img_file.file_id}]",
|
||||
)
|
||||
)
|
||||
image_part = ImageContentPart(
|
||||
type="image_url",
|
||||
image_url=ImageUrlDetail(
|
||||
|
||||
@@ -282,7 +282,6 @@ OPENSEARCH_ADMIN_USERNAME = os.environ.get("OPENSEARCH_ADMIN_USERNAME", "admin")
|
||||
OPENSEARCH_ADMIN_PASSWORD = os.environ.get(
|
||||
"OPENSEARCH_ADMIN_PASSWORD", "StrongPassword123!"
|
||||
)
|
||||
OPENSEARCH_USE_SSL = os.environ.get("OPENSEARCH_USE_SSL", "true").lower() == "true"
|
||||
USING_AWS_MANAGED_OPENSEARCH = (
|
||||
os.environ.get("USING_AWS_MANAGED_OPENSEARCH", "").lower() == "true"
|
||||
)
|
||||
@@ -325,10 +324,6 @@ ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX = (
|
||||
ENABLE_OPENSEARCH_INDEXING_FOR_ONYX
|
||||
and os.environ.get("ENABLE_OPENSEARCH_RETRIEVAL_FOR_ONYX", "").lower() == "true"
|
||||
)
|
||||
DISABLE_OPENSEARCH_MIGRATION_TASK = (
|
||||
os.environ.get("DISABLE_OPENSEARCH_MIGRATION_TASK", "").lower() == "true"
|
||||
)
|
||||
ONYX_DISABLE_VESPA = os.environ.get("ONYX_DISABLE_VESPA", "").lower() == "true"
|
||||
# Whether we should check for and create an index if necessary every time we
|
||||
# instantiate an OpenSearchDocumentIndex on multitenant cloud. Defaults to True.
|
||||
VERIFY_CREATE_OPENSEARCH_INDEX_ON_INIT_MT = (
|
||||
@@ -845,29 +840,6 @@ MAX_FILE_SIZE_BYTES = int(
|
||||
os.environ.get("MAX_FILE_SIZE_BYTES") or 2 * 1024 * 1024 * 1024
|
||||
) # 2GB in bytes
|
||||
|
||||
# Maximum embedded images allowed in a single file. PDFs (and other formats)
|
||||
# with thousands of embedded images can OOM the user-file-processing worker
|
||||
# because every image is decoded with PIL and then sent to the vision LLM.
|
||||
# Enforced both at upload time (rejects the file) and during extraction
|
||||
# (defense-in-depth: caps the number of images materialized).
|
||||
#
|
||||
# Clamped to >= 0; a negative env value would turn upload validation into
|
||||
# always-fail and extraction into always-stop, which is never desired. 0
|
||||
# disables image extraction entirely, which is a valid (if aggressive) setting.
|
||||
MAX_EMBEDDED_IMAGES_PER_FILE = max(
|
||||
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_FILE") or 500)
|
||||
)
|
||||
|
||||
# Maximum embedded images allowed across all files in a single upload batch.
|
||||
# Protects against the scenario where a user uploads many files that each
|
||||
# fall under MAX_EMBEDDED_IMAGES_PER_FILE but aggregate to enough work
|
||||
# (serial-ish celery fan-out plus per-image vision-LLM calls) to OOM the
|
||||
# worker under concurrency or run up surprise latency/cost. Also clamped
|
||||
# to >= 0.
|
||||
MAX_EMBEDDED_IMAGES_PER_UPLOAD = max(
|
||||
0, int(os.environ.get("MAX_EMBEDDED_IMAGES_PER_UPLOAD") or 1000)
|
||||
)
|
||||
|
||||
# Use document summary for contextual rag
|
||||
USE_DOCUMENT_SUMMARY = os.environ.get("USE_DOCUMENT_SUMMARY", "true").lower() == "true"
|
||||
# Use chunk summary for contextual rag
|
||||
@@ -1153,32 +1125,6 @@ DEFAULT_IMAGE_ANALYSIS_MAX_SIZE_MB = 20
|
||||
# Number of pre-provisioned tenants to maintain
|
||||
TARGET_AVAILABLE_TENANTS = int(os.environ.get("TARGET_AVAILABLE_TENANTS", "5"))
|
||||
|
||||
# Master switch for the tenant work-gating feature. Controls the `enabled`
|
||||
# axis only — flipping this True puts the feature in shadow mode (compute
|
||||
# the gate, log skip counts, but do not actually skip). The `enforce` axis
|
||||
# is Redis-only with a hard-coded default of False, so this env flag alone
|
||||
# cannot cause real tenants to be skipped. Default off.
|
||||
ENABLE_TENANT_WORK_GATING = (
|
||||
os.environ.get("ENABLE_TENANT_WORK_GATING", "").lower() == "true"
|
||||
)
|
||||
|
||||
# Membership TTL for the `active_tenants` sorted set. Members older than this
|
||||
# are treated as inactive by the gate read path. Must be > the full-fanout
|
||||
# interval so self-healing re-adds a genuinely-working tenant before their
|
||||
# membership expires. Default 30 min.
|
||||
TENANT_WORK_GATING_TTL_SECONDS = int(
|
||||
os.environ.get("TENANT_WORK_GATING_TTL_SECONDS", 30 * 60)
|
||||
)
|
||||
|
||||
# Minimum wall-clock interval between full-fanout cycles. When this many
|
||||
# seconds have elapsed since the last bypass, the generator ignores the gate
|
||||
# on the next invocation and dispatches to every non-gated tenant, letting
|
||||
# consumers re-populate the active set. Schedule-independent so beat drift
|
||||
# or backlog can't make the self-heal bursty or sparse. Default 20 min.
|
||||
TENANT_WORK_GATING_FULL_FANOUT_INTERVAL_SECONDS = int(
|
||||
os.environ.get("TENANT_WORK_GATING_FULL_FANOUT_INTERVAL_SECONDS", 20 * 60)
|
||||
)
|
||||
|
||||
|
||||
# Image summarization configuration
|
||||
IMAGE_SUMMARIZATION_SYSTEM_PROMPT = os.environ.get(
|
||||
|
||||
@@ -283,7 +283,6 @@ class NotificationType(str, Enum):
|
||||
RELEASE_NOTES = "release_notes"
|
||||
ASSISTANT_FILES_READY = "assistant_files_ready"
|
||||
FEATURE_ANNOUNCEMENT = "feature_announcement"
|
||||
CONNECTOR_REPEATED_ERRORS = "connector_repeated_errors"
|
||||
|
||||
|
||||
class BlobType(str, Enum):
|
||||
@@ -450,6 +449,7 @@ class OnyxRedisLocks:
|
||||
"da_lock:check_connector_external_group_sync_beat"
|
||||
)
|
||||
OPENSEARCH_MIGRATION_BEAT_LOCK = "da_lock:opensearch_migration_beat"
|
||||
CHECK_DANGLING_IMPORT_JOBS_BEAT_LOCK = "da_lock:check_dangling_import_jobs_beat"
|
||||
|
||||
MONITOR_BACKGROUND_PROCESSES_LOCK = "da_lock:monitor_background_processes"
|
||||
CHECK_AVAILABLE_TENANTS_LOCK = "da_lock:check_available_tenants"
|
||||
@@ -613,6 +613,9 @@ class OnyxCeleryTask:
|
||||
# Hook execution log retention
|
||||
HOOK_EXECUTION_LOG_CLEANUP_TASK = "hook_execution_log_cleanup_task"
|
||||
|
||||
# Proposal review import cleanup
|
||||
CHECK_FOR_DANGLING_IMPORT_JOBS = "check_for_dangling_import_jobs"
|
||||
|
||||
# Sandbox cleanup
|
||||
CLEANUP_IDLE_SANDBOXES = "cleanup_idle_sandboxes"
|
||||
CLEANUP_OLD_SNAPSHOTS = "cleanup_old_snapshots"
|
||||
@@ -639,11 +642,9 @@ REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPINTVL] = 15
|
||||
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPCNT] = 3
|
||||
|
||||
if platform.system() == "Darwin":
|
||||
REDIS_SOCKET_KEEPALIVE_OPTIONS[
|
||||
socket.TCP_KEEPALIVE # ty: ignore[unresolved-attribute]
|
||||
] = 60
|
||||
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPALIVE] = 60 # type: ignore[attr-defined,unused-ignore]
|
||||
else:
|
||||
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPIDLE] = 60
|
||||
REDIS_SOCKET_KEEPALIVE_OPTIONS[socket.TCP_KEEPIDLE] = 60 # type: ignore[attr-defined,unused-ignore]
|
||||
|
||||
|
||||
class OnyxCallTypes(str, Enum):
|
||||
|
||||
@@ -1,48 +0,0 @@
|
||||
from typing import Any
|
||||
|
||||
from sentry_sdk.types import Event
|
||||
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
_instance_id_resolved = False
|
||||
|
||||
|
||||
def _add_instance_tags(
|
||||
event: Event,
|
||||
hint: dict[str, Any], # noqa: ARG001
|
||||
) -> Event | None:
|
||||
"""Sentry before_send hook that lazily attaches instance identification tags.
|
||||
|
||||
On the first event, resolves the instance UUID from the KV store (requires DB)
|
||||
and sets it as a global Sentry tag. Subsequent events pick it up automatically.
|
||||
"""
|
||||
global _instance_id_resolved
|
||||
|
||||
if _instance_id_resolved:
|
||||
return event
|
||||
|
||||
try:
|
||||
import sentry_sdk
|
||||
|
||||
from shared_configs.configs import MULTI_TENANT
|
||||
|
||||
if MULTI_TENANT:
|
||||
instance_id = "multi-tenant-cloud"
|
||||
else:
|
||||
from onyx.utils.telemetry import get_or_generate_uuid
|
||||
|
||||
instance_id = get_or_generate_uuid()
|
||||
|
||||
sentry_sdk.set_tag("instance_id", instance_id)
|
||||
|
||||
# Also set on this event since set_tag won't retroactively apply
|
||||
event.setdefault("tags", {})["instance_id"] = instance_id
|
||||
|
||||
# Only mark resolved after success — if DB wasn't ready, retry next event
|
||||
_instance_id_resolved = True
|
||||
except Exception:
|
||||
logger.debug("Failed to resolve instance_id for Sentry tagging")
|
||||
|
||||
return event
|
||||
@@ -547,7 +547,7 @@ class AirtableConnector(LoadConnector):
|
||||
for record in batch_records:
|
||||
# Capture the current context so that the thread gets the current tenant ID
|
||||
current_context = contextvars.copy_context()
|
||||
future_to_record[ # ty: ignore[invalid-assignment]
|
||||
future_to_record[
|
||||
executor.submit(
|
||||
current_context.run,
|
||||
self._process_record,
|
||||
|
||||
@@ -3,7 +3,7 @@ from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from typing import Dict
|
||||
|
||||
import asana
|
||||
import asana # type: ignore
|
||||
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
|
||||
@@ -290,8 +290,8 @@ class AxeroConnector(PollConnector):
|
||||
if not self.axero_key or not self.base_url:
|
||||
raise ConnectorMissingCredentialError("Axero")
|
||||
|
||||
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
|
||||
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
|
||||
start_datetime = datetime.utcfromtimestamp(start).replace(tzinfo=timezone.utc)
|
||||
end_datetime = datetime.utcfromtimestamp(end).replace(tzinfo=timezone.utc)
|
||||
|
||||
entity_types = []
|
||||
if self.include_article:
|
||||
@@ -327,7 +327,7 @@ class AxeroConnector(PollConnector):
|
||||
)
|
||||
|
||||
all_axero_forums = _map_post_to_parent(
|
||||
posts=forums_posts, # ty: ignore[invalid-argument-type]
|
||||
posts=forums_posts,
|
||||
api_key=self.axero_key,
|
||||
axero_base_url=self.base_url,
|
||||
)
|
||||
|
||||
@@ -26,10 +26,6 @@ from onyx.configs.constants import FileOrigin
|
||||
from onyx.connectors.cross_connector_utils.miscellaneous_utils import (
|
||||
process_onyx_metadata,
|
||||
)
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import is_tabular_file
|
||||
from onyx.connectors.cross_connector_utils.tabular_section_utils import (
|
||||
tabular_file_to_sections,
|
||||
)
|
||||
from onyx.connectors.exceptions import ConnectorValidationError
|
||||
from onyx.connectors.exceptions import CredentialExpiredError
|
||||
from onyx.connectors.exceptions import InsufficientPermissionsError
|
||||
@@ -42,7 +38,6 @@ from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import HierarchyNode
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.file_processing.extract_file_text import extract_text_and_images
|
||||
from onyx.file_processing.extract_file_text import get_file_ext
|
||||
@@ -76,9 +71,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
self.bucket_region: Optional[str] = None
|
||||
self.european_residency: bool = european_residency
|
||||
|
||||
def set_allow_images( # ty: ignore[invalid-method-override]
|
||||
self, allow_images: bool
|
||||
) -> None:
|
||||
def set_allow_images(self, allow_images: bool) -> None:
|
||||
"""Set whether to process images in this connector."""
|
||||
logger.info(f"Setting allow_images to {allow_images}.")
|
||||
self._allow_images = allow_images
|
||||
@@ -197,9 +190,7 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
method="sts-assume-role",
|
||||
)
|
||||
botocore_session = get_session()
|
||||
botocore_session._credentials = ( # ty: ignore[unresolved-attribute]
|
||||
refreshable
|
||||
)
|
||||
botocore_session._credentials = refreshable # type: ignore[attr-defined]
|
||||
session = boto3.Session(botocore_session=botocore_session)
|
||||
self.s3_client = session.client("s3")
|
||||
elif authentication_method == "assume_role":
|
||||
@@ -460,40 +451,6 @@ class BlobStorageConnector(LoadConnector, PollConnector):
|
||||
logger.exception(f"Error processing image {key}")
|
||||
continue
|
||||
|
||||
# Handle tabular files (xlsx, csv, tsv) — produce one
|
||||
# TabularSection per sheet (or per file for csv/tsv)
|
||||
# instead of a flat TextSection.
|
||||
if is_tabular_file(file_name):
|
||||
try:
|
||||
downloaded_file = self._download_object(key)
|
||||
if downloaded_file is None:
|
||||
continue
|
||||
tabular_sections = tabular_file_to_sections(
|
||||
BytesIO(downloaded_file),
|
||||
file_name=file_name,
|
||||
link=link,
|
||||
)
|
||||
batch.append(
|
||||
Document(
|
||||
id=f"{self.bucket_type}:{self.bucket_name}:{key}",
|
||||
sections=(
|
||||
tabular_sections
|
||||
if tabular_sections
|
||||
else [TabularSection(link=link, text="")]
|
||||
),
|
||||
source=DocumentSource(self.bucket_type.value),
|
||||
semantic_identifier=file_name,
|
||||
doc_updated_at=last_modified,
|
||||
metadata={},
|
||||
)
|
||||
)
|
||||
if len(batch) == self.batch_size:
|
||||
yield batch
|
||||
batch = []
|
||||
except Exception:
|
||||
logger.exception(f"Error processing tabular file {key}")
|
||||
continue
|
||||
|
||||
# Handle text and document files
|
||||
try:
|
||||
downloaded_file = self._download_object(key)
|
||||
|
||||
@@ -2,7 +2,6 @@ import html
|
||||
import time
|
||||
from collections.abc import Callable
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from typing import Any
|
||||
|
||||
from onyx.configs.app_configs import INDEX_BATCH_SIZE
|
||||
@@ -57,14 +56,14 @@ class BookstackConnector(LoadConnector, PollConnector):
|
||||
}
|
||||
|
||||
if start:
|
||||
params["filter[updated_at:gte]"] = datetime.fromtimestamp(
|
||||
start, tz=timezone.utc
|
||||
params["filter[updated_at:gte]"] = datetime.utcfromtimestamp(
|
||||
start
|
||||
).strftime("%Y-%m-%d")
|
||||
|
||||
if end:
|
||||
params["filter[updated_at:lte]"] = datetime.fromtimestamp(
|
||||
end, tz=timezone.utc
|
||||
).strftime("%Y-%m-%d")
|
||||
params["filter[updated_at:lte]"] = datetime.utcfromtimestamp(end).strftime(
|
||||
"%Y-%m-%d"
|
||||
)
|
||||
|
||||
batch = bookstack_client.get(endpoint, params=params).get("data", [])
|
||||
doc_batch: list[Document | HierarchyNode] = [
|
||||
|
||||
@@ -27,19 +27,16 @@ _STATUS_TO_ERROR_CODE: dict[int, OnyxErrorCode] = {
|
||||
401: OnyxErrorCode.CREDENTIAL_EXPIRED,
|
||||
403: OnyxErrorCode.INSUFFICIENT_PERMISSIONS,
|
||||
404: OnyxErrorCode.BAD_GATEWAY,
|
||||
429: OnyxErrorCode.RATE_LIMITED,
|
||||
}
|
||||
|
||||
|
||||
def _error_code_for_status(status_code: int) -> OnyxErrorCode:
|
||||
"""Map an HTTP status code to the appropriate OnyxErrorCode.
|
||||
|
||||
Expects a >= 400 status code. Known codes (401, 403, 404) are
|
||||
Expects a >= 400 status code. Known codes (401, 403, 404, 429) are
|
||||
mapped to specific error codes; all other codes (unrecognised 4xx
|
||||
and 5xx) map to BAD_GATEWAY as unexpected upstream errors.
|
||||
|
||||
Note: 429 is intentionally omitted — the rl_requests wrapper
|
||||
handles rate limits transparently at the HTTP layer, so 429
|
||||
responses never reach this function.
|
||||
"""
|
||||
if status_code in _STATUS_TO_ERROR_CODE:
|
||||
return _STATUS_TO_ERROR_CODE[status_code]
|
||||
|
||||
@@ -1,9 +1,10 @@
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from enum import StrEnum
|
||||
from typing import Any
|
||||
from typing import cast
|
||||
from typing import Literal
|
||||
from typing import NoReturn
|
||||
from typing import TypeAlias
|
||||
|
||||
from pydantic import BaseModel
|
||||
from retry import retry
|
||||
@@ -24,11 +25,8 @@ from onyx.connectors.interfaces import GenerateSlimDocumentOutput
|
||||
from onyx.connectors.interfaces import SecondsSinceUnixEpoch
|
||||
from onyx.connectors.interfaces import SlimConnectorWithPermSync
|
||||
from onyx.connectors.models import ConnectorCheckpoint
|
||||
from onyx.connectors.models import ConnectorFailure
|
||||
from onyx.connectors.models import ConnectorMissingCredentialError
|
||||
from onyx.connectors.models import Document
|
||||
from onyx.connectors.models import DocumentFailure
|
||||
from onyx.connectors.models import EntityFailure
|
||||
from onyx.connectors.models import ImageSection
|
||||
from onyx.connectors.models import TextSection
|
||||
from onyx.error_handling.exceptions import OnyxError
|
||||
@@ -49,6 +47,10 @@ def _handle_canvas_api_error(e: OnyxError) -> NoReturn:
|
||||
raise InsufficientPermissionsError(
|
||||
"Canvas API token does not have sufficient permissions (HTTP 403)."
|
||||
)
|
||||
elif e.status_code == 429:
|
||||
raise ConnectorValidationError(
|
||||
"Canvas rate-limit exceeded (HTTP 429). Please try again later."
|
||||
)
|
||||
elif e.status_code >= 500:
|
||||
raise UnexpectedValidationError(
|
||||
f"Unexpected Canvas HTTP error (status={e.status_code}): {e}"
|
||||
@@ -59,60 +61,6 @@ def _handle_canvas_api_error(e: OnyxError) -> NoReturn:
|
||||
)
|
||||
|
||||
|
||||
class CanvasStage(StrEnum):
|
||||
PAGES = "pages"
|
||||
ASSIGNMENTS = "assignments"
|
||||
ANNOUNCEMENTS = "announcements"
|
||||
|
||||
|
||||
_STAGE_CONFIG: dict[CanvasStage, dict[str, Any]] = {
|
||||
CanvasStage.PAGES: {
|
||||
"endpoint": "courses/{course_id}/pages",
|
||||
"params": {
|
||||
"per_page": "100",
|
||||
"include[]": "body",
|
||||
"published": "true",
|
||||
"sort": "updated_at",
|
||||
"order": "desc",
|
||||
},
|
||||
},
|
||||
CanvasStage.ASSIGNMENTS: {
|
||||
"endpoint": "courses/{course_id}/assignments",
|
||||
"params": {"per_page": "100", "published": "true"},
|
||||
},
|
||||
CanvasStage.ANNOUNCEMENTS: {
|
||||
"endpoint": "announcements",
|
||||
"params": {
|
||||
"per_page": "100",
|
||||
"context_codes[]": "course_{course_id}",
|
||||
"active_only": "true",
|
||||
},
|
||||
},
|
||||
}
|
||||
|
||||
|
||||
def _parse_canvas_dt(timestamp_str: str) -> datetime:
|
||||
"""Parse a Canvas ISO-8601 timestamp (e.g. '2025-06-15T12:00:00Z')
|
||||
into a timezone-aware UTC datetime.
|
||||
|
||||
Canvas returns timestamps with a trailing 'Z' instead of '+00:00',
|
||||
so we normalise before parsing.
|
||||
"""
|
||||
return datetime.fromisoformat(timestamp_str.replace("Z", "+00:00")).astimezone(
|
||||
timezone.utc
|
||||
)
|
||||
|
||||
|
||||
def _unix_to_canvas_time(epoch: float) -> str:
|
||||
"""Convert a Unix timestamp to Canvas ISO-8601 format (e.g. '2025-06-15T12:00:00Z')."""
|
||||
return datetime.fromtimestamp(epoch, tz=timezone.utc).strftime("%Y-%m-%dT%H:%M:%SZ")
|
||||
|
||||
|
||||
def _in_time_window(timestamp_str: str, start: float, end: float) -> bool:
|
||||
"""Check whether a Canvas ISO-8601 timestamp falls within (start, end]."""
|
||||
return start < _parse_canvas_dt(timestamp_str).timestamp() <= end
|
||||
|
||||
|
||||
class CanvasCourse(BaseModel):
|
||||
id: int
|
||||
name: str | None = None
|
||||
@@ -197,6 +145,9 @@ class CanvasAnnouncement(BaseModel):
|
||||
)
|
||||
|
||||
|
||||
CanvasStage: TypeAlias = Literal["pages", "assignments", "announcements"]
|
||||
|
||||
|
||||
class CanvasConnectorCheckpoint(ConnectorCheckpoint):
|
||||
"""Checkpoint state for resumable Canvas indexing.
|
||||
|
||||
@@ -214,30 +165,15 @@ class CanvasConnectorCheckpoint(ConnectorCheckpoint):
|
||||
|
||||
course_ids: list[int] = []
|
||||
current_course_index: int = 0
|
||||
stage: CanvasStage = CanvasStage.PAGES
|
||||
stage: CanvasStage = "pages"
|
||||
next_url: str | None = None
|
||||
|
||||
def advance_course(self) -> None:
|
||||
"""Move to the next course and reset within-course state."""
|
||||
self.current_course_index += 1
|
||||
self.stage = CanvasStage.PAGES
|
||||
self.stage = "pages"
|
||||
self.next_url = None
|
||||
|
||||
def advance_stage(self) -> None:
|
||||
"""Advance past the current stage.
|
||||
|
||||
Moves to the next stage within the same course, or to the next
|
||||
course if the current stage is the last one. Resets next_url so
|
||||
the next call starts fresh on the new stage.
|
||||
"""
|
||||
self.next_url = None
|
||||
stages: list[CanvasStage] = list(CanvasStage)
|
||||
next_idx = stages.index(self.stage) + 1
|
||||
if next_idx < len(stages):
|
||||
self.stage = stages[next_idx]
|
||||
else:
|
||||
self.advance_course()
|
||||
|
||||
|
||||
class CanvasConnector(
|
||||
CheckpointedConnectorWithPermSync[CanvasConnectorCheckpoint],
|
||||
@@ -359,7 +295,13 @@ class CanvasConnector(
|
||||
if body_text:
|
||||
text_parts.append(body_text)
|
||||
|
||||
doc_updated_at = _parse_canvas_dt(page.updated_at) if page.updated_at else None
|
||||
doc_updated_at = (
|
||||
datetime.fromisoformat(page.updated_at.replace("Z", "+00:00")).astimezone(
|
||||
timezone.utc
|
||||
)
|
||||
if page.updated_at
|
||||
else None
|
||||
)
|
||||
|
||||
document = self._build_document(
|
||||
doc_id=f"canvas-page-{page.course_id}-{page.page_id}",
|
||||
@@ -383,11 +325,17 @@ class CanvasConnector(
|
||||
if desc_text:
|
||||
text_parts.append(desc_text)
|
||||
if assignment.due_at:
|
||||
due_dt = _parse_canvas_dt(assignment.due_at)
|
||||
due_dt = datetime.fromisoformat(
|
||||
assignment.due_at.replace("Z", "+00:00")
|
||||
).astimezone(timezone.utc)
|
||||
text_parts.append(f"Due: {due_dt.strftime('%B %d, %Y %H:%M UTC')}")
|
||||
|
||||
doc_updated_at = (
|
||||
_parse_canvas_dt(assignment.updated_at) if assignment.updated_at else None
|
||||
datetime.fromisoformat(
|
||||
assignment.updated_at.replace("Z", "+00:00")
|
||||
).astimezone(timezone.utc)
|
||||
if assignment.updated_at
|
||||
else None
|
||||
)
|
||||
|
||||
document = self._build_document(
|
||||
@@ -413,7 +361,11 @@ class CanvasConnector(
|
||||
text_parts.append(msg_text)
|
||||
|
||||
doc_updated_at = (
|
||||
_parse_canvas_dt(announcement.posted_at) if announcement.posted_at else None
|
||||
datetime.fromisoformat(
|
||||
announcement.posted_at.replace("Z", "+00:00")
|
||||
).astimezone(timezone.utc)
|
||||
if announcement.posted_at
|
||||
else None
|
||||
)
|
||||
|
||||
document = self._build_document(
|
||||
@@ -448,314 +400,6 @@ class CanvasConnector(
|
||||
self._canvas_client = client
|
||||
return None
|
||||
|
||||
def _fetch_stage_page(
|
||||
self,
|
||||
next_url: str | None,
|
||||
endpoint: str,
|
||||
params: dict[str, Any],
|
||||
) -> tuple[list[Any], str | None]:
|
||||
"""Fetch one page of API results for the current stage.
|
||||
|
||||
Returns (items, next_url). All error handling is done by the
|
||||
caller (_load_from_checkpoint).
|
||||
"""
|
||||
if next_url:
|
||||
# Resuming mid-pagination: the next_url from Canvas's
|
||||
# Link header already contains endpoint + query params.
|
||||
response, result_next_url = self.canvas_client.get(full_url=next_url)
|
||||
else:
|
||||
# First request for this stage: build from endpoint + params.
|
||||
response, result_next_url = self.canvas_client.get(
|
||||
endpoint=endpoint, params=params
|
||||
)
|
||||
return response or [], result_next_url
|
||||
|
||||
def _process_items(
|
||||
self,
|
||||
response: list[Any],
|
||||
stage: CanvasStage,
|
||||
course_id: int,
|
||||
start: float,
|
||||
end: float,
|
||||
include_permissions: bool,
|
||||
) -> tuple[list[Document | ConnectorFailure], bool]:
|
||||
"""Process a page of API results into documents.
|
||||
|
||||
Returns (docs, early_exit). early_exit is True when pages
|
||||
(sorted desc by updated_at) hit an item older than start,
|
||||
signaling that pagination should stop.
|
||||
"""
|
||||
results: list[Document | ConnectorFailure] = []
|
||||
early_exit = False
|
||||
|
||||
for item in response:
|
||||
try:
|
||||
if stage == CanvasStage.PAGES:
|
||||
page = CanvasPage.from_api(item, course_id=course_id)
|
||||
if not page.updated_at:
|
||||
continue
|
||||
# Pages are sorted by updated_at desc — once we see
|
||||
# an item at or before `start`, all remaining items
|
||||
# on this and subsequent pages are older too.
|
||||
if not _in_time_window(page.updated_at, start, end):
|
||||
if _parse_canvas_dt(page.updated_at).timestamp() <= start:
|
||||
early_exit = True
|
||||
break
|
||||
# ts > end: page is newer than our window, skip it
|
||||
continue
|
||||
doc = self._convert_page_to_document(page)
|
||||
results.append(
|
||||
self._maybe_attach_permissions(
|
||||
doc, course_id, include_permissions
|
||||
)
|
||||
)
|
||||
|
||||
elif stage == CanvasStage.ASSIGNMENTS:
|
||||
assignment = CanvasAssignment.from_api(item, course_id=course_id)
|
||||
if not assignment.updated_at or not _in_time_window(
|
||||
assignment.updated_at, start, end
|
||||
):
|
||||
continue
|
||||
doc = self._convert_assignment_to_document(assignment)
|
||||
results.append(
|
||||
self._maybe_attach_permissions(
|
||||
doc, course_id, include_permissions
|
||||
)
|
||||
)
|
||||
|
||||
elif stage == CanvasStage.ANNOUNCEMENTS:
|
||||
announcement = CanvasAnnouncement.from_api(
|
||||
item, course_id=course_id
|
||||
)
|
||||
if not announcement.posted_at:
|
||||
logger.debug(
|
||||
f"Skipping announcement {announcement.id} in "
|
||||
f"course {course_id}: no posted_at"
|
||||
)
|
||||
continue
|
||||
if not _in_time_window(announcement.posted_at, start, end):
|
||||
continue
|
||||
doc = self._convert_announcement_to_document(announcement)
|
||||
results.append(
|
||||
self._maybe_attach_permissions(
|
||||
doc, course_id, include_permissions
|
||||
)
|
||||
)
|
||||
|
||||
except Exception as e:
|
||||
item_id = item.get("id") or item.get("page_id", "unknown")
|
||||
if stage == CanvasStage.PAGES:
|
||||
doc_link = (
|
||||
f"{self.canvas_base_url}/courses/{course_id}"
|
||||
f"/pages/{item.get('url', '')}"
|
||||
)
|
||||
else:
|
||||
doc_link = item.get("html_url", "")
|
||||
results.append(
|
||||
ConnectorFailure(
|
||||
failed_document=DocumentFailure(
|
||||
document_id=f"canvas-{stage.removesuffix('s')}-{course_id}-{item_id}",
|
||||
document_link=doc_link,
|
||||
),
|
||||
failure_message=f"Failed to process {stage.removesuffix('s')}: {e}",
|
||||
exception=e,
|
||||
)
|
||||
)
|
||||
|
||||
return results, early_exit
|
||||
|
||||
def _maybe_attach_permissions(
|
||||
self,
|
||||
document: Document,
|
||||
course_id: int,
|
||||
include_permissions: bool,
|
||||
) -> Document:
|
||||
if include_permissions:
|
||||
document.external_access = self._get_course_permissions(course_id)
|
||||
return document
|
||||
|
||||
def _load_from_checkpoint(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: CanvasConnectorCheckpoint,
|
||||
include_permissions: bool = False,
|
||||
) -> CheckpointOutput[CanvasConnectorCheckpoint]:
|
||||
"""Shared implementation for load_from_checkpoint and load_from_checkpoint_with_perm_sync."""
|
||||
new_checkpoint = checkpoint.model_copy(deep=True)
|
||||
|
||||
# First call: materialize the list of course IDs.
|
||||
# On failure, let the exception propagate so the framework fails the
|
||||
# attempt cleanly. Swallowing errors here would leave the checkpoint
|
||||
# state unchanged and cause an infinite retry loop.
|
||||
if not new_checkpoint.course_ids:
|
||||
try:
|
||||
courses = self._list_courses()
|
||||
except OnyxError as e:
|
||||
if e.status_code in (401, 403):
|
||||
_handle_canvas_api_error(e) # NoReturn — always raises
|
||||
raise
|
||||
new_checkpoint.course_ids = [c.id for c in courses]
|
||||
logger.info(f"Found {len(courses)} Canvas courses to process")
|
||||
new_checkpoint.has_more = len(new_checkpoint.course_ids) > 0
|
||||
return new_checkpoint
|
||||
|
||||
# All courses done.
|
||||
if new_checkpoint.current_course_index >= len(new_checkpoint.course_ids):
|
||||
new_checkpoint.has_more = False
|
||||
return new_checkpoint
|
||||
|
||||
course_id = new_checkpoint.course_ids[new_checkpoint.current_course_index]
|
||||
try:
|
||||
stage = CanvasStage(new_checkpoint.stage)
|
||||
except ValueError as e:
|
||||
raise ValueError(
|
||||
f"Invalid checkpoint stage: {new_checkpoint.stage!r}. "
|
||||
f"Valid stages: {[s.value for s in CanvasStage]}"
|
||||
) from e
|
||||
|
||||
# Build endpoint + params from the static template.
|
||||
config = _STAGE_CONFIG[stage]
|
||||
endpoint = config["endpoint"].format(course_id=course_id)
|
||||
params = {k: v.format(course_id=course_id) for k, v in config["params"].items()}
|
||||
# Only the announcements API supports server-side date filtering
|
||||
# (start_date/end_date). Pages support server-side sorting
|
||||
# (sort=updated_at desc) enabling early exit, but not date
|
||||
# filtering. Assignments support neither. Both are filtered
|
||||
# client-side via _in_time_window after fetching.
|
||||
if stage == CanvasStage.ANNOUNCEMENTS:
|
||||
params["start_date"] = _unix_to_canvas_time(start)
|
||||
params["end_date"] = _unix_to_canvas_time(end)
|
||||
|
||||
try:
|
||||
response, result_next_url = self._fetch_stage_page(
|
||||
next_url=new_checkpoint.next_url,
|
||||
endpoint=endpoint,
|
||||
params=params,
|
||||
)
|
||||
except OnyxError as oe:
|
||||
# Security errors from _parse_next_link (host/scheme
|
||||
# mismatch on pagination URLs) have no status code override
|
||||
# and must not be silenced.
|
||||
is_api_error = oe._status_code_override is not None
|
||||
if not is_api_error:
|
||||
raise
|
||||
if oe.status_code in (401, 403):
|
||||
_handle_canvas_api_error(oe) # NoReturn — always raises
|
||||
|
||||
# 404 means the course itself is gone or inaccessible. The
|
||||
# other stages on this course will hit the same 404, so skip
|
||||
# the whole course rather than burning API calls on each stage.
|
||||
if oe.status_code == 404:
|
||||
logger.warning(
|
||||
f"Canvas course {course_id} not found while fetching "
|
||||
f"{stage} (HTTP 404). Skipping course."
|
||||
)
|
||||
yield ConnectorFailure(
|
||||
failed_entity=EntityFailure(
|
||||
entity_id=f"canvas-course-{course_id}",
|
||||
),
|
||||
failure_message=(f"Canvas course {course_id} not found: {oe}"),
|
||||
exception=oe,
|
||||
)
|
||||
new_checkpoint.advance_course()
|
||||
else:
|
||||
logger.warning(
|
||||
f"Failed to fetch {stage} for course {course_id}: {oe}. "
|
||||
f"Skipping remainder of this stage."
|
||||
)
|
||||
yield ConnectorFailure(
|
||||
failed_entity=EntityFailure(
|
||||
entity_id=f"canvas-{stage}-{course_id}",
|
||||
),
|
||||
failure_message=(
|
||||
f"Failed to fetch {stage} for course {course_id}: {oe}"
|
||||
),
|
||||
exception=oe,
|
||||
)
|
||||
new_checkpoint.advance_stage()
|
||||
new_checkpoint.has_more = new_checkpoint.current_course_index < len(
|
||||
new_checkpoint.course_ids
|
||||
)
|
||||
return new_checkpoint
|
||||
except Exception as e:
|
||||
# Unknown error — skip the stage and try to continue.
|
||||
logger.warning(
|
||||
f"Failed to fetch {stage} for course {course_id}: {e}. "
|
||||
f"Skipping remainder of this stage."
|
||||
)
|
||||
yield ConnectorFailure(
|
||||
failed_entity=EntityFailure(
|
||||
entity_id=f"canvas-{stage}-{course_id}",
|
||||
),
|
||||
failure_message=(
|
||||
f"Failed to fetch {stage} for course {course_id}: {e}"
|
||||
),
|
||||
exception=e,
|
||||
)
|
||||
new_checkpoint.advance_stage()
|
||||
new_checkpoint.has_more = new_checkpoint.current_course_index < len(
|
||||
new_checkpoint.course_ids
|
||||
)
|
||||
return new_checkpoint
|
||||
|
||||
# Process fetched items
|
||||
results, early_exit = self._process_items(
|
||||
response, stage, course_id, start, end, include_permissions
|
||||
)
|
||||
for result in results:
|
||||
yield result
|
||||
|
||||
# If we hit an item older than our window (pages sorted desc),
|
||||
# skip remaining pagination and advance to the next stage.
|
||||
if early_exit:
|
||||
result_next_url = None
|
||||
|
||||
# If there are more pages, save the cursor and return
|
||||
if result_next_url:
|
||||
new_checkpoint.next_url = result_next_url
|
||||
else:
|
||||
# Stage complete — advance to next stage (or next course if last).
|
||||
new_checkpoint.advance_stage()
|
||||
|
||||
new_checkpoint.has_more = new_checkpoint.current_course_index < len(
|
||||
new_checkpoint.course_ids
|
||||
)
|
||||
return new_checkpoint
|
||||
|
||||
@override
|
||||
def load_from_checkpoint(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: CanvasConnectorCheckpoint,
|
||||
) -> CheckpointOutput[CanvasConnectorCheckpoint]:
|
||||
return self._load_from_checkpoint(
|
||||
start, end, checkpoint, include_permissions=False
|
||||
)
|
||||
|
||||
@override
|
||||
def load_from_checkpoint_with_perm_sync(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: CanvasConnectorCheckpoint,
|
||||
) -> CheckpointOutput[CanvasConnectorCheckpoint]:
|
||||
"""Load documents from checkpoint with permission information included."""
|
||||
return self._load_from_checkpoint(
|
||||
start, end, checkpoint, include_permissions=True
|
||||
)
|
||||
|
||||
@override
|
||||
def build_dummy_checkpoint(self) -> CanvasConnectorCheckpoint:
|
||||
return CanvasConnectorCheckpoint(has_more=True)
|
||||
|
||||
@override
|
||||
def validate_checkpoint_json(
|
||||
self, checkpoint_json: str
|
||||
) -> CanvasConnectorCheckpoint:
|
||||
return CanvasConnectorCheckpoint.model_validate_json(checkpoint_json)
|
||||
|
||||
@override
|
||||
def validate_connector_settings(self) -> None:
|
||||
"""Validate Canvas connector settings by testing API access."""
|
||||
@@ -771,6 +415,38 @@ class CanvasConnector(
|
||||
f"Unexpected error during Canvas settings validation: {exc}"
|
||||
)
|
||||
|
||||
@override
|
||||
def load_from_checkpoint(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: CanvasConnectorCheckpoint,
|
||||
) -> CheckpointOutput[CanvasConnectorCheckpoint]:
|
||||
# TODO(benwu408): implemented in PR3 (checkpoint)
|
||||
raise NotImplementedError
|
||||
|
||||
@override
|
||||
def load_from_checkpoint_with_perm_sync(
|
||||
self,
|
||||
start: SecondsSinceUnixEpoch,
|
||||
end: SecondsSinceUnixEpoch,
|
||||
checkpoint: CanvasConnectorCheckpoint,
|
||||
) -> CheckpointOutput[CanvasConnectorCheckpoint]:
|
||||
# TODO(benwu408): implemented in PR3 (checkpoint)
|
||||
raise NotImplementedError
|
||||
|
||||
@override
|
||||
def build_dummy_checkpoint(self) -> CanvasConnectorCheckpoint:
|
||||
# TODO(benwu408): implemented in PR3 (checkpoint)
|
||||
raise NotImplementedError
|
||||
|
||||
@override
|
||||
def validate_checkpoint_json(
|
||||
self, checkpoint_json: str
|
||||
) -> CanvasConnectorCheckpoint:
|
||||
# TODO(benwu408): implemented in PR3 (checkpoint)
|
||||
raise NotImplementedError
|
||||
|
||||
@override
|
||||
def retrieve_all_slim_docs_perm_sync(
|
||||
self,
|
||||
|
||||
@@ -95,13 +95,11 @@ class ClickupConnector(LoadConnector, PollConnector):
|
||||
params["date_updated_lt"] = end
|
||||
|
||||
if self.connector_type == "list":
|
||||
params["list_ids[]"] = self.connector_ids # ty: ignore[invalid-assignment]
|
||||
params["list_ids[]"] = self.connector_ids
|
||||
elif self.connector_type == "folder":
|
||||
params["project_ids[]"] = ( # ty: ignore[invalid-assignment]
|
||||
self.connector_ids
|
||||
)
|
||||
params["project_ids[]"] = self.connector_ids
|
||||
elif self.connector_type == "space":
|
||||
params["space_ids[]"] = self.connector_ids # ty: ignore[invalid-assignment]
|
||||
params["space_ids[]"] = self.connector_ids
|
||||
|
||||
url_endpoint = f"/team/{self.team_id}/task"
|
||||
|
||||
@@ -173,10 +171,7 @@ class ClickupConnector(LoadConnector, PollConnector):
|
||||
document.metadata[extra_field] = task[extra_field]
|
||||
|
||||
if self.retrieve_task_comments:
|
||||
document.sections = [
|
||||
*document.sections,
|
||||
*self._get_task_comments(task["id"]),
|
||||
]
|
||||
document.sections.extend(self._get_task_comments(task["id"]))
|
||||
|
||||
doc_batch.append(document)
|
||||
|
||||
|
||||
@@ -6,7 +6,7 @@ from datetime import timezone
|
||||
from typing import Any
|
||||
from urllib.parse import quote
|
||||
|
||||
from atlassian.errors import ApiError
|
||||
from atlassian.errors import ApiError # type: ignore
|
||||
from requests.exceptions import HTTPError
|
||||
from typing_extensions import override
|
||||
|
||||
|
||||
@@ -26,7 +26,7 @@ from typing import TypeVar
|
||||
from urllib.parse import quote
|
||||
|
||||
import bs4
|
||||
from atlassian import Confluence
|
||||
from atlassian import Confluence # type:ignore
|
||||
from redis import Redis
|
||||
from requests import HTTPError
|
||||
|
||||
@@ -971,7 +971,7 @@ class OnyxConfluence:
|
||||
:return: Returns the user details
|
||||
"""
|
||||
|
||||
from atlassian.errors import ApiPermissionError
|
||||
from atlassian.errors import ApiPermissionError # type:ignore
|
||||
|
||||
url = "rest/api/user/current"
|
||||
params = {}
|
||||
|
||||
@@ -165,7 +165,7 @@ class ConnectorRunner(Generic[CT]):
|
||||
checkpoint_connector_generator = load_from_checkpoint(
|
||||
start=self.time_range[0].timestamp(),
|
||||
end=self.time_range[1].timestamp(),
|
||||
checkpoint=checkpoint, # ty: ignore[invalid-argument-type]
|
||||
checkpoint=checkpoint,
|
||||
)
|
||||
next_checkpoint: CT | None = None
|
||||
# this is guaranteed to always run at least once with next_checkpoint being non-None
|
||||
@@ -174,9 +174,7 @@ class ConnectorRunner(Generic[CT]):
|
||||
hierarchy_node,
|
||||
failure,
|
||||
next_checkpoint,
|
||||
) in CheckpointOutputWrapper[CT]()(
|
||||
checkpoint_connector_generator # ty: ignore[invalid-argument-type]
|
||||
):
|
||||
) in CheckpointOutputWrapper[CT]()(checkpoint_connector_generator):
|
||||
if document is not None:
|
||||
self.doc_batch.append(document)
|
||||
|
||||
|
||||
@@ -83,9 +83,7 @@ class OnyxDBCredentialsProvider(
|
||||
f"No credential found: credential={self._credential_id}"
|
||||
)
|
||||
|
||||
credential.credential_json = ( # ty: ignore[invalid-assignment]
|
||||
credential_json
|
||||
)
|
||||
credential.credential_json = credential_json # type: ignore[assignment]
|
||||
db_session.commit()
|
||||
except Exception:
|
||||
db_session.rollback()
|
||||
|
||||
@@ -3,7 +3,6 @@ from collections.abc import Callable
|
||||
from collections.abc import Iterator
|
||||
from datetime import datetime
|
||||
from datetime import timezone
|
||||
from email.utils import parsedate_to_datetime
|
||||
from typing import Any
|
||||
from typing import TypeVar
|
||||
from urllib.parse import urljoin
|
||||
@@ -11,6 +10,7 @@ from urllib.parse import urlparse
|
||||
|
||||
import requests
|
||||
from dateutil.parser import parse
|
||||
from dateutil.parser import ParserError
|
||||
|
||||
from onyx.configs.app_configs import CONNECTOR_LOCALHOST_OVERRIDE
|
||||
from onyx.configs.constants import DocumentSource
|
||||
@@ -56,16 +56,18 @@ def time_str_to_utc(datetime_str: str) -> datetime:
|
||||
if fixed not in candidates:
|
||||
candidates.append(fixed)
|
||||
|
||||
# dateutil is the primary; the stdlib RFC 2822 parser is a fallback for
|
||||
# inputs dateutil rejects (e.g. headers concatenated without a CRLF —
|
||||
# TZ may be dropped, datetime_to_utc then assumes UTC).
|
||||
for parser in (parse, parsedate_to_datetime):
|
||||
for candidate in candidates:
|
||||
try:
|
||||
return datetime_to_utc(parser(candidate))
|
||||
except (TypeError, ValueError, OverflowError):
|
||||
continue
|
||||
last_exception: Exception | None = None
|
||||
for candidate in candidates:
|
||||
try:
|
||||
dt = parse(candidate)
|
||||
return datetime_to_utc(dt)
|
||||
except (ValueError, ParserError) as exc:
|
||||
last_exception = exc
|
||||
|
||||
if last_exception is not None:
|
||||
raise last_exception
|
||||
|
||||
# Fallback in case parsing failed without raising (should not happen)
|
||||
raise ValueError(f"Unable to parse datetime string: {datetime_str}")
|
||||
|
||||
|
||||
|
||||
@@ -1,69 +0,0 @@
|
||||
import csv
|
||||
import io
|
||||
from typing import IO
|
||||
|
||||
from onyx.connectors.models import TabularSection
|
||||
from onyx.file_processing.extract_file_text import file_io_to_text
|
||||
from onyx.file_processing.extract_file_text import xlsx_sheet_extraction
|
||||
from onyx.file_processing.file_types import OnyxFileExtensions
|
||||
from onyx.utils.logger import setup_logger
|
||||
|
||||
logger = setup_logger()
|
||||
|
||||
|
||||
def is_tabular_file(file_name: str) -> bool:
|
||||
lowered = file_name.lower()
|
||||
return any(lowered.endswith(ext) for ext in OnyxFileExtensions.TABULAR_EXTENSIONS)
|
||||
|
||||
|
||||
def _tsv_to_csv(tsv_text: str) -> str:
|
||||
"""Re-serialize tab-separated text as CSV so downstream parsers that
|
||||
assume the default Excel dialect read the columns correctly."""
|
||||
out = io.StringIO()
|
||||
csv.writer(out, lineterminator="\n").writerows(
|
||||
csv.reader(io.StringIO(tsv_text), dialect="excel-tab")
|
||||
)
|
||||
return out.getvalue().rstrip("\n")
|
||||
|
||||
|
||||
def tabular_file_to_sections(
|
||||
file: IO[bytes],
|
||||
file_name: str,
|
||||
link: str = "",
|
||||
) -> list[TabularSection]:
|
||||
"""Convert a tabular file into one or more TabularSections.
|
||||
|
||||
- .xlsx → one TabularSection per non-empty sheet.
|
||||
- .csv / .tsv → a single TabularSection containing the full decoded
|
||||
file.
|
||||
|
||||
Returns an empty list when the file yields no extractable content.
|
||||
"""
|
||||
lowered = file_name.lower()
|
||||
|
||||
if lowered.endswith(tuple(OnyxFileExtensions.SPREADSHEET_EXTENSIONS)):
|
||||
return [
|
||||
TabularSection(
|
||||
link=link or file_name,
|
||||
text=csv_text,
|
||||
heading=f"{file_name} :: {sheet_title}",
|
||||
)
|
||||
for csv_text, sheet_title in xlsx_sheet_extraction(
|
||||
file, file_name=file_name
|
||||
)
|
||||
]
|
||||
|
||||
if not lowered.endswith((".csv", ".tsv")):
|
||||
raise ValueError(f"{file_name!r} is not a tabular file")
|
||||
|
||||
try:
|
||||
text = file_io_to_text(file).strip()
|
||||
except Exception:
|
||||
logger.exception(f"Failure decoding {file_name}")
|
||||
raise
|
||||
|
||||
if not text:
|
||||
return []
|
||||
if lowered.endswith(".tsv"):
|
||||
text = _tsv_to_csv(text)
|
||||
return [TabularSection(link=link or file_name, text=text)]
|
||||
@@ -53,10 +53,8 @@ def _convert_message_to_document(
|
||||
if isinstance(message.channel, TextChannel) and (
|
||||
channel_name := message.channel.name
|
||||
):
|
||||
metadata["Channel"] = channel_name # ty: ignore[possibly-unresolved-reference]
|
||||
semantic_substring += (
|
||||
f" in Channel: #{channel_name}" # ty: ignore[possibly-unresolved-reference]
|
||||
)
|
||||
metadata["Channel"] = channel_name
|
||||
semantic_substring += f" in Channel: #{channel_name}"
|
||||
|
||||
# Single messages dont have a title
|
||||
title = ""
|
||||
|
||||
@@ -221,8 +221,8 @@ class DiscourseConnector(PollConnector):
|
||||
if self.permissions is None:
|
||||
raise ConnectorMissingCredentialError("Discourse")
|
||||
|
||||
start_datetime = datetime.fromtimestamp(start, tz=timezone.utc)
|
||||
end_datetime = datetime.fromtimestamp(end, tz=timezone.utc)
|
||||
start_datetime = datetime.utcfromtimestamp(start).replace(tzinfo=timezone.utc)
|
||||
end_datetime = datetime.utcfromtimestamp(end).replace(tzinfo=timezone.utc)
|
||||
|
||||
self._get_categories_map()
|
||||
|
||||
|
||||
Some files were not shown because too many files have changed in this diff Show More
Reference in New Issue
Block a user