Compare commits

...

2 Commits

Author SHA1 Message Date
Justin Tahara
c39c97d92a fix(db): add missing FK indexes for hot celery worker tables (#10363) 2026-04-21 02:32:38 +00:00
Nikolas Garza
8865d5b676 refactor(auth): run disposable-email check before captcha on signup (#10404) 2026-04-21 02:06:00 +00:00
3 changed files with 98 additions and 22 deletions

View File

@@ -0,0 +1,75 @@
"""add missing FK indexes for hot tables
Adds indexes on foreign key columns that are frequently queried by celery
worker tasks but were missing indexes, causing full sequential scans.
Postgres does NOT automatically create indexes on foreign key columns
(unlike MySQL). These four columns were identified via Aurora Performance
Insights as the top contributors to primary-worker AAS after the
ix_document_needs_sync partial index was deployed.
- index_attempt_errors.index_attempt_id: #1 query in pg_stat_statements
(56ms mean on a 532K-row / 997MB table). Queried every check_for_indexing
cycle to load errors per attempt.
- index_attempt_errors.connector_credential_pair_id: same table, queried
when filtering errors by cc_pair.
- index_attempt.connector_credential_pair_id: queried by check_for_indexing
to find attempts for each cc_pair. 73ms mean on tenant_dev.
- hierarchy_node.document_id: 18MB+ on large tenants, queried during
document deletion and hierarchy rebuilds.
Note: Index names follow SQLAlchemy's ix_<table>_<column> convention to match
the `index=True` declarations in models.py. This prevents autogenerate from
detecting a mismatch and creating duplicate indexes.
Revision ID: 856bcbe14d79
Revises: a6fcd3d631f9
Create Date: 2026-04-19 18:00:00.000000
"""
from alembic import op
# revision identifiers, used by Alembic.
revision = "856bcbe14d79"
down_revision = "91d150c361f6"
branch_labels = None
depends_on = None
def upgrade() -> None:
op.create_index(
"ix_index_attempt_errors_index_attempt_id",
"index_attempt_errors",
["index_attempt_id"],
)
op.create_index(
"ix_index_attempt_errors_connector_credential_pair_id",
"index_attempt_errors",
["connector_credential_pair_id"],
)
op.create_index(
"ix_index_attempt_connector_credential_pair_id",
"index_attempt",
["connector_credential_pair_id"],
)
op.create_index(
"ix_hierarchy_node_document_id",
"hierarchy_node",
["document_id"],
)
def downgrade() -> None:
op.drop_index("ix_hierarchy_node_document_id", table_name="hierarchy_node")
op.drop_index(
"ix_index_attempt_connector_credential_pair_id", table_name="index_attempt"
)
op.drop_index(
"ix_index_attempt_errors_connector_credential_pair_id",
table_name="index_attempt_errors",
)
op.drop_index(
"ix_index_attempt_errors_index_attempt_id", table_name="index_attempt_errors"
)

View File

@@ -380,6 +380,24 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
safe: bool = False,
request: Optional[Request] = None,
) -> User:
# Check for disposable emails FIRST so obvious throwaway domains are
# rejected before hitting Google's siteverify API. Cheap local check.
try:
verify_email_domain(user_create.email, is_registration=True)
except OnyxError as e:
# Log blocked disposable email attempts
if "Disposable email" in e.detail:
domain = (
user_create.email.split("@")[-1]
if "@" in user_create.email
else "unknown"
)
logger.warning(
f"Blocked disposable email registration attempt: {domain}",
extra={"email_domain": domain},
)
raise
# Verify captcha if enabled (for cloud signup protection)
from onyx.auth.captcha import CaptchaVerificationError
from onyx.auth.captcha import is_captcha_enabled
@@ -407,24 +425,6 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
user_create.password, cast(schemas.UC, user_create)
)
# Check for disposable emails BEFORE provisioning tenant
# This prevents creating tenants for throwaway email addresses
try:
verify_email_domain(user_create.email, is_registration=True)
except OnyxError as e:
# Log blocked disposable email attempts
if "Disposable email" in e.detail:
domain = (
user_create.email.split("@")[-1]
if "@" in user_create.email
else "unknown"
)
logger.warning(
f"Blocked disposable email registration attempt: {domain}",
extra={"email_domain": domain},
)
raise
user_count: int | None = None
referral_source = (
request.cookies.get("referral_source", None)
@@ -620,8 +620,7 @@ class UserManager(UUIDIDMixin, BaseUserManager[User, uuid.UUID]):
sync_db.commit()
else:
logger.warning(
"User %s not found in sync session during upgrade to standard; "
"skipping upgrade",
"User %s not found in sync session during upgrade to standard; skipping upgrade",
user_id,
)
@@ -1614,7 +1613,6 @@ async def optional_user(
async_db_session: AsyncSession = Depends(get_async_session),
user: User | None = Depends(optional_fastapi_current_user),
) -> User | None:
if user := await _check_for_saml_and_jwt(request, user, async_db_session):
# If user is already set, _check_for_saml_and_jwt returns the same user object
return user

View File

@@ -894,7 +894,7 @@ class HierarchyNode(Base):
# For hierarchy nodes that are also documents (e.g., Confluence pages)
# SET NULL when document is deleted - node can exist without its document
document_id: Mapped[str | None] = mapped_column(
ForeignKey("document.id", ondelete="SET NULL"), nullable=True
ForeignKey("document.id", ondelete="SET NULL"), nullable=True, index=True
)
# Self-referential FK for tree structure
@@ -2204,6 +2204,7 @@ class IndexAttempt(Base):
connector_credential_pair_id: Mapped[int] = mapped_column(
ForeignKey("connector_credential_pair.id"),
nullable=False,
index=True,
)
# Some index attempts that run from beginning will still have this as False
@@ -2407,10 +2408,12 @@ class IndexAttemptError(Base):
index_attempt_id: Mapped[int] = mapped_column(
ForeignKey("index_attempt.id"),
nullable=False,
index=True,
)
connector_credential_pair_id: Mapped[int] = mapped_column(
ForeignKey("connector_credential_pair.id"),
nullable=False,
index=True,
)
document_id: Mapped[str | None] = mapped_column(String, nullable=True)