chore(devtools): ods screenshot-diff for visual regression testing (#8386)

Co-authored-by: cubic-dev-ai[bot] <191113872+cubic-dev-ai[bot]@users.noreply.github.com>
2026-02-16 23:35:46 +00:00 · 2026-02-12 16:04:22 -08:00
parent 6749f63f09
commit 27e676c48f
17 changed files with 2253 additions and 21 deletions
--- a/.github/workflows/pr-playwright-tests.yml
+++ b/.github/workflows/pr-playwright-tests.yml
@@ -52,6 +52,9 @@ env:
  MCP_SERVER_PUBLIC_HOST: host.docker.internal
  MCP_SERVER_PUBLIC_URL: http://host.docker.internal:8004/mcp

+  # Visual regression S3 bucket (shared across all jobs)
+  PLAYWRIGHT_S3_BUCKET: onyx-playwright-artifacts
+
 jobs:
  build-web-image:
    runs-on:
@@ -239,6 +242,9 @@ jobs:
  playwright-tests:
    needs: [build-web-image, build-backend-image, build-model-server-image]
    name: Playwright Tests (${{ matrix.project }})
+    permissions:
+      id-token: write # Required for OIDC-based AWS credential exchange (S3 access)
+      contents: read
    runs-on:
      - runs-on
      - runner=8cpu-linux-arm64
@@ -428,8 +434,6 @@ jobs:
        env:
          PROJECT: ${{ matrix.project }}
        run: |
-          # Create test-results directory to ensure it exists for artifact upload
-          mkdir -p test-results
          npx playwright test --project ${PROJECT}

      - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
@@ -437,9 +441,124 @@ jobs:
        with:
          # Includes test results and trace.zip files
          name: playwright-test-results-${{ matrix.project }}-${{ github.run_id }}
-          path: ./web/test-results/
+          path: ./web/output/playwright/
          retention-days: 30

+      - name: Upload screenshots
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        if: always()
+        with:
+          name: playwright-screenshots-${{ matrix.project }}-${{ github.run_id }}
+          path: ./web/output/screenshots/
+          retention-days: 30
+
+      # --- Visual Regression Diff ---
+      - name: Configure AWS credentials
+        if: always()
+        uses: aws-actions/configure-aws-credentials@61815dcd50bd041e203e49132bacad1fd04d2708
+        with:
+          role-to-assume: ${{ secrets.AWS_OIDC_ROLE_ARN }}
+          aws-region: us-east-2
+
+      - name: Install the latest version of uv
+        if: always()
+        uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
+        with:
+          enable-cache: false
+          version: "0.9.9"
+
+      - name: Determine baseline revision
+        if: always()
+        id: baseline-rev
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          BASE_REF: ${{ github.event.pull_request.base.ref }}
+          GH_REF: ${{ github.ref }}
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          if [ "${EVENT_NAME}" = "pull_request" ]; then
+            # PRs compare against the base branch (e.g. main, release/2.5)
+            echo "rev=${BASE_REF}" >> "$GITHUB_OUTPUT"
+          elif [[ "${GH_REF}" == refs/tags/* ]]; then
+            # Tag builds compare against the tag name
+            echo "rev=${REF_NAME}" >> "$GITHUB_OUTPUT"
+          else
+            # Push builds (main, release/*) compare against the branch name
+            echo "rev=${REF_NAME}" >> "$GITHUB_OUTPUT"
+          fi
+
+      - name: Generate screenshot diff report
+        if: always()
+        env:
+          PROJECT: ${{ matrix.project }}
+          PLAYWRIGHT_S3_BUCKET: ${{ env.PLAYWRIGHT_S3_BUCKET }}
+          BASELINE_REV: ${{ steps.baseline-rev.outputs.rev }}
+        run: |
+          uv run --no-sync --with onyx-devtools ods screenshot-diff compare \
+            --project "${PROJECT}" \
+            --rev "${BASELINE_REV}"
+
+      - name: Upload visual diff report to S3
+        if: always()
+        env:
+          PROJECT: ${{ matrix.project }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          RUN_ID: ${{ github.run_id }}
+        run: |
+          SUMMARY_FILE="web/output/screenshot-diff/${PROJECT}/summary.json"
+          if [ ! -f "${SUMMARY_FILE}" ]; then
+            echo "No summary file found — skipping S3 upload."
+            exit 0
+          fi
+
+          HAS_DIFF=$(jq -r '.has_differences' "${SUMMARY_FILE}")
+          if [ "${HAS_DIFF}" != "true" ]; then
+            echo "No visual differences for ${PROJECT} — skipping S3 upload."
+            exit 0
+          fi
+
+          aws s3 sync "web/output/screenshot-diff/${PROJECT}/" \
+            "s3://${PLAYWRIGHT_S3_BUCKET}/reports/pr-${PR_NUMBER}/${RUN_ID}/${PROJECT}/"
+
+      - name: Upload visual diff summary
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        if: always()
+        with:
+          name: screenshot-diff-summary-${{ matrix.project }}
+          path: ./web/output/screenshot-diff/${{ matrix.project }}/summary.json
+          if-no-files-found: ignore
+          retention-days: 5
+
+      - name: Upload visual diff report artifact
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        if: always()
+        with:
+          name: screenshot-diff-report-${{ matrix.project }}-${{ github.run_id }}
+          path: ./web/output/screenshot-diff/${{ matrix.project }}/
+          if-no-files-found: ignore
+          retention-days: 30
+
+      - name: Update S3 baselines
+        if: >-
+          success() && (
+            github.ref == 'refs/heads/main' ||
+            startsWith(github.ref, 'refs/heads/release/') ||
+            startsWith(github.ref, 'refs/tags/v')
+          )
+        env:
+          PROJECT: ${{ matrix.project }}
+          PLAYWRIGHT_S3_BUCKET: ${{ env.PLAYWRIGHT_S3_BUCKET }}
+          BASELINE_REV: ${{ steps.baseline-rev.outputs.rev }}
+        run: |
+          if [ -d "web/output/screenshots/" ] && [ "$(ls -A web/output/screenshots/)" ]; then
+            uv run --no-sync --with onyx-devtools ods screenshot-diff upload-baselines \
+              --project "${PROJECT}" \
+              --rev "${BASELINE_REV}" \
+              --delete
+          else
+            echo "No screenshots to upload for ${PROJECT} — skipping baseline update."
+          fi
+
      # save before stopping the containers so the logs can be captured
      - name: Save Docker logs
        if: success() || failure()
@@ -457,6 +576,95 @@ jobs:
          name: docker-logs-${{ matrix.project }}-${{ github.run_id }}
          path: ${{ github.workspace }}/docker-compose.log

+  # Post a single combined visual regression comment after all matrix jobs finish
+  visual-regression-comment:
+    needs: [playwright-tests]
+    if: always() && github.event_name == 'pull_request'
+    runs-on: ubuntu-slim
+    timeout-minutes: 5
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Download visual diff summaries
+        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # ratchet:actions/download-artifact@v4
+        with:
+          pattern: screenshot-diff-summary-*
+          path: summaries/
+
+      - name: Post combined PR comment
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          RUN_ID: ${{ github.run_id }}
+          REPO: ${{ github.repository }}
+          S3_BUCKET: ${{ env.PLAYWRIGHT_S3_BUCKET }}
+        run: |
+          MARKER="<!-- visual-regression-report -->"
+
+          # Build the markdown table from all summary files
+          TABLE_HEADER="| Project | Changed | Added | Removed | Unchanged | Report |"
+          TABLE_DIVIDER="|---------|---------|-------|---------|-----------|--------|"
+          TABLE_ROWS=""
+          HAS_ANY_SUMMARY=false
+
+          for SUMMARY_DIR in summaries/screenshot-diff-summary-*/; do
+            SUMMARY_FILE="${SUMMARY_DIR}summary.json"
+            if [ ! -f "${SUMMARY_FILE}" ]; then
+              continue
+            fi
+
+            HAS_ANY_SUMMARY=true
+            PROJECT=$(jq -r '.project' "${SUMMARY_FILE}")
+            CHANGED=$(jq -r '.changed' "${SUMMARY_FILE}")
+            ADDED=$(jq -r '.added' "${SUMMARY_FILE}")
+            REMOVED=$(jq -r '.removed' "${SUMMARY_FILE}")
+            UNCHANGED=$(jq -r '.unchanged' "${SUMMARY_FILE}")
+            TOTAL=$(jq -r '.total' "${SUMMARY_FILE}")
+            HAS_DIFF=$(jq -r '.has_differences' "${SUMMARY_FILE}")
+
+            if [ "${TOTAL}" = "0" ]; then
+              REPORT_LINK="_No screenshots_"
+            elif [ "${HAS_DIFF}" = "true" ]; then
+              REPORT_URL="https://${S3_BUCKET}.s3.us-east-2.amazonaws.com/reports/pr-${PR_NUMBER}/${RUN_ID}/${PROJECT}/index.html"
+              REPORT_LINK="[View Report](${REPORT_URL})"
+            else
+              REPORT_LINK="✅ No changes"
+            fi
+
+            TABLE_ROWS="${TABLE_ROWS}| \`${PROJECT}\` | ${CHANGED} | ${ADDED} | ${REMOVED} | ${UNCHANGED} | ${REPORT_LINK} |\n"
+          done
+
+          if [ "${HAS_ANY_SUMMARY}" = "false" ]; then
+            echo "No visual diff summaries found — skipping PR comment."
+            exit 0
+          fi
+
+          BODY=$(printf '%s\n' \
+            "${MARKER}" \
+            "### 🖼️ Visual Regression Report" \
+            "" \
+            "${TABLE_HEADER}" \
+            "${TABLE_DIVIDER}" \
+            "$(printf '%b' "${TABLE_ROWS}")")
+
+          # Upsert: find existing comment with the marker, or create a new one
+          EXISTING_COMMENT_ID=$(gh api \
+            "repos/${REPO}/issues/${PR_NUMBER}/comments" \
+            --jq ".[] | select(.body | startswith(\"${MARKER}\")) | .id" \
+            2>/dev/null | head -1)
+
+          if [ -n "${EXISTING_COMMENT_ID}" ]; then
+            gh api \
+              --method PATCH \
+              "repos/${REPO}/issues/comments/${EXISTING_COMMENT_ID}" \
+              -f body="${BODY}"
+          else
+            gh api \
+              --method POST \
+              "repos/${REPO}/issues/${PR_NUMBER}/comments" \
+              -f body="${BODY}"
+          fi
+
  playwright-required:
    # NOTE: Github-hosted runners have about 20s faster queue times and are preferred here.
    runs-on: ubuntu-slim
--- a/backend/requirements/dev.txt
+++ b/backend/requirements/dev.txt
@@ -317,7 +317,7 @@ oauthlib==3.2.2
    # via
    #   kubernetes
    #   requests-oauthlib
-onyx-devtools==0.5.3
+onyx-devtools==0.5.7
    # via onyx
 openai==2.14.0
    # via
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -144,7 +144,7 @@ dev = [
    "matplotlib==3.10.8",
    "mypy-extensions==1.0.0",
    "mypy==1.13.0",
-    "onyx-devtools==0.5.3",
+    "onyx-devtools==0.5.7",
    "openapi-generator-cli==7.17.0",
    "pandas-stubs~=2.3.3",
    "pre-commit==3.2.2",
--- a/tools/ods/README.md
+++ b/tools/ods/README.md
@@ -29,6 +29,10 @@ Some commands require external tools to be installed and configured:
  - Install from [cli.github.com](https://cli.github.com/)
  - Authenticate with `gh auth login`

+- **AWS CLI** - Required for `screenshot-diff` commands (S3 baseline sync)
+  - Install from [aws.amazon.com/cli](https://aws.amazon.com/cli/)
+  - Authenticate with `aws sso login` or `aws configure`
+
 ### Autocomplete

 `ods` provides autocomplete for `bash`, `fish`, `powershell` and `zsh` shells.
@@ -239,6 +243,100 @@ ods cherry-pick abc123 --release 2.5 --release 2.6
 ods cherry-pick abc123 def456 ghi789 --release 2.5
 ```

+### `screenshot-diff` - Visual Regression Testing
+
+Compare Playwright screenshots against baselines and generate visual diff reports.
+Baselines are stored per-project and per-revision in S3:
+
+```
+s3://<bucket>/baselines/<project>/<rev>/
+```
+
+This allows storing baselines for `main`, release branches (`release/2.5`), and
+version tags (`v2.0.0`) side-by-side. Any `/` in a revision is replaced with
+`-` in the S3 path (e.g. `release/2.5` → `release-2.5`).
+
+```shell
+ods screenshot-diff <subcommand>
+```
+
+**Subcommands:**
+
+- `compare` - Compare screenshots against baselines and generate a diff report
+- `upload-baselines` - Upload screenshots to S3 as new baselines
+
+The `--project` flag provides sensible defaults so you don't need to specify every path.
+When set, the following defaults are applied:
+
+| Flag | Default |
+|------|---------|
+| `--baseline` | `s3://onyx-playwright-artifacts/baselines/<project>/<rev>/` |
+| `--current` | `web/output/screenshots/` |
+| `--output` | `web/output/screenshot-diff/<project>/index.html` |
+| `--rev` | `main` |
+
+The S3 bucket defaults to `onyx-playwright-artifacts` and can be overridden with the
+`PLAYWRIGHT_S3_BUCKET` environment variable.
+
+**`compare` Flags:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--project` | | Project name (e.g. `admin`); sets sensible defaults |
+| `--rev` | `main` | Revision baseline to compare against |
+| `--from-rev` | | Source (older) revision for cross-revision comparison |
+| `--to-rev` | | Target (newer) revision for cross-revision comparison |
+| `--baseline` | | Baseline directory or S3 URL (`s3://...`) |
+| `--current` | | Current screenshots directory or S3 URL (`s3://...`) |
+| `--output` | `screenshot-diff/index.html` | Output path for the HTML report |
+| `--threshold` | `0.2` | Per-channel pixel difference threshold (0.0–1.0) |
+| `--max-diff-ratio` | `0.01` | Max diff pixel ratio before marking as changed |
+
+**`upload-baselines` Flags:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--project` | | Project name (e.g. `admin`); sets sensible defaults |
+| `--rev` | `main` | Revision to store the baseline under |
+| `--dir` | | Local directory containing screenshots to upload |
+| `--dest` | | S3 destination URL (`s3://...`) |
+| `--delete` | `false` | Delete S3 files not present locally |
+
+**Examples:**
+
+```shell
+# Compare local screenshots against the main baseline (default)
+ods screenshot-diff compare --project admin
+
+# Compare against a release branch baseline
+ods screenshot-diff compare --project admin --rev release/2.5
+
+# Compare two revisions directly (both sides fetched from S3)
+ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+# Compare with explicit paths
+ods screenshot-diff compare \
+  --baseline ./baselines \
+  --current ./web/output/screenshots/ \
+  --output ./report/index.html
+
+# Upload baselines for main (default)
+ods screenshot-diff upload-baselines --project admin
+
+# Upload baselines for a release branch
+ods screenshot-diff upload-baselines --project admin --rev release/2.5
+
+# Upload baselines for a version tag
+ods screenshot-diff upload-baselines --project admin --rev v2.0.0
+
+# Upload with delete (remove old baselines not in current set)
+ods screenshot-diff upload-baselines --project admin --delete
+```
+
+The `compare` subcommand writes a `summary.json` alongside the report with aggregate
+counts (changed, added, removed, unchanged). The HTML report is only generated when
+visual differences are detected.
+
 ### Testing Changes Locally (Dry Run)

 Both `run-ci` and `cherry-pick` support `--dry-run` to test without making remote changes:
--- a/tools/ods/cmd/root.go
+++ b/tools/ods/cmd/root.go
@@ -49,6 +49,7 @@ func NewRootCommand() *cobra.Command {
 	cmd.AddCommand(NewLogsCommand())
 	cmd.AddCommand(NewPullCommand())
 	cmd.AddCommand(NewRunCICommand())
+	cmd.AddCommand(NewScreenshotDiffCommand())

 	return cmd
 }
--- a/tools/ods/cmd/screenshot_diff.go
+++ b/tools/ods/cmd/screenshot_diff.go
@@ -0,0 +1,500 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	log "github.com/sirupsen/logrus"
+	"github.com/spf13/cobra"
+
+	"github.com/onyx-dot-app/onyx/tools/ods/internal/imgdiff"
+	"github.com/onyx-dot-app/onyx/tools/ods/internal/s3"
+)
+
+const (
+	// DefaultS3Bucket is the default S3 bucket for Playwright visual regression artifacts.
+	DefaultS3Bucket = "onyx-playwright-artifacts"
+
+	// DefaultScreenshotDir is the default local directory for captured screenshots,
+	// relative to the repository root.
+	DefaultScreenshotDir = "web/output/screenshots"
+
+	// DefaultOutputDir is the default base directory for screenshot diff output,
+	// relative to the repository root.
+	DefaultOutputDir = "web/output/screenshot-diff"
+
+	// DefaultRev is the default revision used when --rev is not specified.
+	DefaultRev = "main"
+)
+
+// getS3Bucket returns the S3 bucket name, preferring the PLAYWRIGHT_S3_BUCKET
+// environment variable over the compiled-in default.
+func getS3Bucket() string {
+	if bucket := os.Getenv("PLAYWRIGHT_S3_BUCKET"); bucket != "" {
+		return bucket
+	}
+	return DefaultS3Bucket
+}
+
+// sanitizeRev normalises a git ref for use as an S3 path segment.
+// Slashes are replaced with dashes (e.g. "release/2.5" → "release-2.5").
+func sanitizeRev(rev string) string {
+	return strings.ReplaceAll(rev, "/", "-")
+}
+
+// ScreenshotDiffCompareOptions holds options for the compare subcommand.
+type ScreenshotDiffCompareOptions struct {
+	Project      string
+	Rev          string // revision whose baseline to compare against (default: "main")
+	FromRev      string // cross-revision mode: source (older) revision
+	ToRev        string // cross-revision mode: target (newer) revision
+	Baseline     string
+	Current      string
+	Output       string
+	Threshold    float64
+	MaxDiffRatio float64
+}
+
+// ScreenshotDiffUploadOptions holds options for the upload-baselines subcommand.
+type ScreenshotDiffUploadOptions struct {
+	Project string
+	Rev     string // revision to store the baseline under (default: "main")
+	Dir     string
+	Dest    string
+	Delete  bool
+}
+
+// NewScreenshotDiffCommand creates the screenshot-diff command with subcommands.
+func NewScreenshotDiffCommand() *cobra.Command {
+	cmd := &cobra.Command{
+		Use:   "screenshot-diff",
+		Short: "Visual regression testing for Playwright screenshots",
+		Long: `Compare Playwright screenshots against baselines and generate visual diff reports.
+
+Supports comparing local directories and downloading baselines from S3.
+The generated HTML report is self-contained (images base64-inlined) and can
+be opened locally or hosted on S3.
+
+Baselines are stored per-project and per-revision in S3:
+
+  s3://<bucket>/baselines/<project>/<rev>/
+
+The --project flag provides sensible defaults so you don't need to specify
+every path. For example:
+
+  # Compare local screenshots against the "main" baseline (default)
+  ods screenshot-diff compare --project admin
+
+  # Compare against a release branch baseline
+  ods screenshot-diff compare --project admin --rev release/2.5
+
+  # Compare two revisions directly (no local screenshots needed)
+  ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+  # Upload new baselines for the "admin" project on main
+  ods screenshot-diff upload-baselines --project admin
+
+  # Upload baselines for a release branch
+  ods screenshot-diff upload-baselines --project admin --rev release/2.5
+
+You can override any default with explicit flags:
+
+  ods screenshot-diff compare --baseline ./my-baselines --current ./my-screenshots`,
+		Run: func(cmd *cobra.Command, args []string) {
+			_ = cmd.Help()
+		},
+	}
+
+	cmd.AddCommand(newCompareCommand())
+	cmd.AddCommand(newUploadBaselinesCommand())
+
+	return cmd
+}
+
+func newCompareCommand() *cobra.Command {
+	opts := &ScreenshotDiffCompareOptions{}
+
+	cmd := &cobra.Command{
+		Use:   "compare",
+		Short: "Compare screenshots against baselines and generate a diff report",
+		Long: `Compare current screenshots against baseline screenshots and produce
+a self-contained HTML visual diff report with a JSON summary.
+
+Baselines are stored per-revision in S3:
+
+  s3://<bucket>/baselines/<project>/<rev>/
+
+When --project is specified, the following defaults are applied:
+  --baseline  → s3://<bucket>/baselines/<project>/<rev>/
+  --current   → web/output/screenshots/
+  --output    → web/output/screenshot-diff/<project>/index.html
+  --rev       → main
+
+The bucket defaults to "onyx-playwright-artifacts" and can be overridden
+with the PLAYWRIGHT_S3_BUCKET environment variable.
+
+A summary.json file is always written next to the HTML report. If there
+are no visual differences, the HTML report is skipped.
+
+CROSS-REVISION MODE:
+
+Use --from-rev and --to-rev to compare two stored revisions directly.
+Both sides are downloaded from S3 — no local screenshots are needed.
+
+  ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+Examples:
+
+  # Compare local screenshots against main (default)
+  ods screenshot-diff compare --project admin
+
+  # Compare against a specific revision
+  ods screenshot-diff compare --project admin --rev release/2.5
+
+  # Compare two revisions
+  ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+  # Override specific flags
+  ods screenshot-diff compare --project admin --current ./custom-dir/
+
+  # Fully manual (no project flag)
+  ods screenshot-diff compare \
+    --baseline s3://my-bucket/baselines/admin/main/ \
+    --current ./web/output/screenshots/ \
+    --output ./web/output/screenshot-diff/admin/index.html`,
+		Run: func(cmd *cobra.Command, args []string) {
+			runCompare(opts)
+		},
+	}
+
+	cmd.Flags().StringVar(&opts.Project, "project", "", "Project name (e.g. admin); sets sensible defaults for baseline, current, and output")
+	cmd.Flags().StringVar(&opts.Rev, "rev", "", "Revision to compare against (default: main). Ignored when --from-rev/--to-rev are set")
+	cmd.Flags().StringVar(&opts.FromRev, "from-rev", "", "Source (older) revision for cross-revision comparison")
+	cmd.Flags().StringVar(&opts.ToRev, "to-rev", "", "Target (newer) revision for cross-revision comparison")
+	cmd.Flags().StringVar(&opts.Baseline, "baseline", "", "Baseline directory or S3 URL (s3://...)")
+	cmd.Flags().StringVar(&opts.Current, "current", "", "Current screenshots directory or S3 URL (s3://...)")
+	cmd.Flags().StringVar(&opts.Output, "output", "", "Output path for the HTML report")
+	cmd.Flags().Float64Var(&opts.Threshold, "threshold", 0.2, "Per-channel pixel difference threshold (0.0-1.0)")
+	cmd.Flags().Float64Var(&opts.MaxDiffRatio, "max-diff-ratio", 0.01, "Max diff pixel ratio before marking as changed (informational)")
+
+	return cmd
+}
+
+func newUploadBaselinesCommand() *cobra.Command {
+	opts := &ScreenshotDiffUploadOptions{}
+
+	cmd := &cobra.Command{
+		Use:   "upload-baselines",
+		Short: "Upload screenshots to S3 as new baselines",
+		Long: `Upload a local directory of screenshots to S3 to serve as the new
+baseline for future comparisons. Typically run after tests pass on the
+main branch or a release branch.
+
+Baselines are stored per-revision in S3:
+
+  s3://<bucket>/baselines/<project>/<rev>/
+
+When --project is specified, the following defaults are applied:
+  --dir   → web/output/screenshots/
+  --dest  → s3://<bucket>/baselines/<project>/<rev>/
+  --rev   → main
+
+Examples:
+
+  # Upload baselines for main (default)
+  ods screenshot-diff upload-baselines --project admin
+
+  # Upload baselines for a release branch
+  ods screenshot-diff upload-baselines --project admin --rev release/2.5
+
+  # Upload baselines for a version tag
+  ods screenshot-diff upload-baselines --project admin --rev v2.0.0
+
+  # With delete (remove old baselines not in current set)
+  ods screenshot-diff upload-baselines --project admin --delete
+
+  # Fully manual
+  ods screenshot-diff upload-baselines \
+    --dir ./web/output/screenshots/ \
+    --dest s3://onyx-playwright-artifacts/baselines/admin/main/`,
+		Run: func(cmd *cobra.Command, args []string) {
+			runUploadBaselines(opts)
+		},
+	}
+
+	cmd.Flags().StringVar(&opts.Project, "project", "", "Project name (e.g. admin); sets sensible defaults for dir and dest")
+	cmd.Flags().StringVar(&opts.Rev, "rev", "", "Revision to store the baseline under (default: main)")
+	cmd.Flags().StringVar(&opts.Dir, "dir", "", "Local directory containing screenshots to upload")
+	cmd.Flags().StringVar(&opts.Dest, "dest", "", "S3 destination URL (s3://...)")
+	cmd.Flags().BoolVar(&opts.Delete, "delete", false, "Delete S3 files not present locally")
+
+	return cmd
+}
+
+// resolveCompareDefaults fills in any unset flags with the defaults derived from --project, when it is set.
+func resolveCompareDefaults(opts *ScreenshotDiffCompareOptions) {
+	bucket := getS3Bucket()
+
+	if opts.Project != "" {
+		// Cross-revision mode: both sides come from S3
+		if opts.FromRev != "" && opts.ToRev != "" {
+			if opts.Baseline == "" {
+				opts.Baseline = fmt.Sprintf("s3://%s/baselines/%s/%s/",
+					bucket, opts.Project, sanitizeRev(opts.FromRev))
+			}
+			if opts.Current == "" {
+				opts.Current = fmt.Sprintf("s3://%s/baselines/%s/%s/",
+					bucket, opts.Project, sanitizeRev(opts.ToRev))
+			}
+		} else {
+			// Standard mode: compare local screenshots against a revision
+			rev := opts.Rev
+			if rev == "" {
+				rev = DefaultRev
+			}
+			if opts.Baseline == "" {
+				opts.Baseline = fmt.Sprintf("s3://%s/baselines/%s/%s/",
+					bucket, opts.Project, sanitizeRev(rev))
+			}
+			if opts.Current == "" {
+				opts.Current = DefaultScreenshotDir
+			}
+		}
+
+		if opts.Output == "" {
+			opts.Output = filepath.Join(DefaultOutputDir, opts.Project, "index.html")
+		}
+	}
+
+	// Fall back for output even without --project
+	if opts.Output == "" {
+		opts.Output = "screenshot-diff/index.html"
+	}
+}
+
+// resolveUploadDefaults fills in any unset flags with the defaults derived from --project, when it is set.
+func resolveUploadDefaults(opts *ScreenshotDiffUploadOptions) {
+	bucket := getS3Bucket()
+
+	if opts.Project != "" {
+		rev := opts.Rev
+		if rev == "" {
+			rev = DefaultRev
+		}
+		if opts.Dir == "" {
+			opts.Dir = DefaultScreenshotDir
+		}
+		if opts.Dest == "" {
+			opts.Dest = fmt.Sprintf("s3://%s/baselines/%s/%s/",
+				bucket, opts.Project, sanitizeRev(rev))
+		}
+	}
+}
+
+// downloadS3Dir downloads an S3 URL into a local temporary directory and
+// returns the path. The caller is responsible for cleaning up the directory.
+func downloadS3Dir(s3URL string, prefix string) (string, error) {
+	tmpDir, err := os.MkdirTemp("", prefix)
+	if err != nil {
+		return "", fmt.Errorf("failed to create temp directory: %w", err)
+	}
+
+	if err := s3.SyncDown(s3URL, tmpDir); err != nil {
+		_ = os.RemoveAll(tmpDir)
+		return "", fmt.Errorf("failed to download from S3 (%s): %w", s3URL, err)
+	}
+
+	return tmpDir, nil
+}
+
+func runCompare(opts *ScreenshotDiffCompareOptions) {
+	// Validate cross-revision flags are used together
+	if (opts.FromRev != "") != (opts.ToRev != "") {
+		log.Fatal("--from-rev and --to-rev must be used together")
+	}
+
+	resolveCompareDefaults(opts)
+
+	// Validate required fields
+	if opts.Baseline == "" {
+		log.Fatal("--baseline is required (or use --project to set defaults)")
+	}
+	if opts.Current == "" {
+		log.Fatal("--current is required (or use --project to set defaults)")
+	}
+
+	// Determine the project name for the summary (use the flag, or fall back to "default")
+	project := opts.Project
+	if project == "" {
+		project = "default"
+	}
+
+	// Track temp dirs for cleanup. NOTE(review): log.Fatal* exits the process, so this defer is skipped on fatal paths.
+	var tempDirs []string
+	defer func() {
+		for _, d := range tempDirs {
+			_ = os.RemoveAll(d)
+		}
+	}()
+
+	// Resolve baseline directory
+	baselineDir := opts.Baseline
+	if strings.HasPrefix(opts.Baseline, "s3://") {
+		dir, err := downloadS3Dir(opts.Baseline, "screenshot-baseline-*")
+		if err != nil {
+			log.Fatalf("Failed to download baselines: %v", err)
+		}
+		tempDirs = append(tempDirs, dir)
+		baselineDir = dir
+	}
+
+	// Resolve current directory (may also be S3 in cross-revision mode)
+	currentDir := opts.Current
+	if strings.HasPrefix(opts.Current, "s3://") {
+		dir, err := downloadS3Dir(opts.Current, "screenshot-current-*")
+		if err != nil {
+			log.Fatalf("Failed to download current screenshots: %v", err)
+		}
+		tempDirs = append(tempDirs, dir)
+		currentDir = dir
+	}
+
+	// Verify baseline directory exists
+	if _, err := os.Stat(baselineDir); os.IsNotExist(err) {
+		log.Warnf("Baseline directory does not exist: %s", baselineDir)
+		log.Warn("This may be the first run -- no baselines to compare against.")
+		// Create an empty dir so CompareDirectories works (all files will be "added")
+		if err := os.MkdirAll(baselineDir, 0755); err != nil {
+			log.Fatalf("Failed to create baseline directory: %v", err)
+		}
+	}
+
+	// Resolve the output path
+	outputPath := opts.Output
+	if !filepath.IsAbs(outputPath) {
+		cwd, err := os.Getwd()
+		if err != nil {
+			log.Fatalf("Failed to get working directory: %v", err)
+		}
+		outputPath = filepath.Join(cwd, outputPath)
+	}
+	summaryPath := filepath.Join(filepath.Dir(outputPath), "summary.json")
+
+	// If the current screenshots directory doesn't exist, write an empty summary and exit
+	if _, err := os.Stat(currentDir); os.IsNotExist(err) {
+		log.Warnf("Current screenshots directory does not exist: %s", currentDir)
+		log.Warn("No screenshots captured for this project — writing empty summary.")
+
+		summary := imgdiff.Summary{Project: project}
+		if err := imgdiff.WriteSummary(summary, summaryPath); err != nil {
+			log.Fatalf("Failed to write summary: %v", err)
+		}
+		log.Infof("Summary written to: %s", summaryPath)
+		return
+	}
+
+	log.Infof("Comparing screenshots...")
+	log.Infof("  Baseline: %s", opts.Baseline)
+	log.Infof("  Current:  %s", opts.Current)
+	log.Infof("  Threshold: %.2f", opts.Threshold)
+
+	results, err := imgdiff.CompareDirectories(baselineDir, currentDir, opts.Threshold)
+	if err != nil {
+		log.Fatalf("Comparison failed: %v", err)
+	}
+
+	// Print terminal summary
+	printSummary(results)
+
+	// Build and write JSON summary (always)
+	summary := imgdiff.BuildSummary(project, results)
+	if err := imgdiff.WriteSummary(summary, summaryPath); err != nil {
+		log.Fatalf("Failed to write summary: %v", err)
+	}
+	log.Infof("Summary written to: %s", summaryPath)
+
+	// Generate HTML report only if there are differences
+	if summary.HasDifferences {
+		log.Infof("Generating report: %s", outputPath)
+		if err := imgdiff.GenerateReport(results, outputPath); err != nil {
+			log.Fatalf("Failed to generate report: %v", err)
+		}
+		log.Infof("Report generated successfully: %s", outputPath)
+	} else {
+		log.Infof("No visual differences detected — skipping report generation.")
+	}
+}
+
+func runUploadBaselines(opts *ScreenshotDiffUploadOptions) {
+	resolveUploadDefaults(opts)
+
+	// Validate required fields
+	if opts.Dir == "" {
+		log.Fatal("--dir is required (or use --project to set defaults)")
+	}
+	if opts.Dest == "" {
+		log.Fatal("--dest is required (or use --project to set defaults)")
+	}
+
+	if _, err := os.Stat(opts.Dir); os.IsNotExist(err) {
+		log.Fatalf("Screenshots directory does not exist: %s", opts.Dir)
+	}
+
+	if !strings.HasPrefix(opts.Dest, "s3://") {
+		log.Fatalf("Destination must be an S3 URL (s3://...): %s", opts.Dest)
+	}
+
+	log.Infof("Uploading baselines...")
+	log.Infof("  Source: %s", opts.Dir)
+	log.Infof("  Dest:   %s", opts.Dest)
+
+	if err := s3.SyncUp(opts.Dir, opts.Dest, opts.Delete); err != nil {
+		log.Fatalf("Failed to upload baselines: %v", err)
+	}
+
+	log.Info("Baselines uploaded successfully.")
+}
+
+func printSummary(results []imgdiff.Result) {
+	changed, added, removed, unchanged := 0, 0, 0, 0
+	for _, r := range results {
+		switch r.Status {
+		case imgdiff.StatusChanged:
+			changed++
+		case imgdiff.StatusAdded:
+			added++
+		case imgdiff.StatusRemoved:
+			removed++
+		case imgdiff.StatusUnchanged:
+			unchanged++
+		}
+	}
+
+	fmt.Println()
+	fmt.Println("╔══════════════════════════════════════════════╗")
+	fmt.Println("║          Visual Regression Summary           ║")
+	fmt.Println("╠══════════════════════════════════════════════╣")
+	fmt.Printf("║  Changed:   %-32d ║\n", changed)
+	fmt.Printf("║  Added:     %-32d ║\n", added)
+	fmt.Printf("║  Removed:   %-32d ║\n", removed)
+	fmt.Printf("║  Unchanged: %-32d ║\n", unchanged)
+	fmt.Printf("║  Total:     %-32d ║\n", len(results))
+	fmt.Println("╚══════════════════════════════════════════════╝")
+	fmt.Println()
+
+	if changed > 0 || added > 0 || removed > 0 {
+		for _, r := range results {
+			switch r.Status {
+			case imgdiff.StatusChanged:
+				fmt.Printf("  ⚠ CHANGED  %s (%.2f%% diff)\n", r.Name, r.DiffPercent)
+			case imgdiff.StatusAdded:
+				fmt.Printf("  ✚ ADDED    %s\n", r.Name)
+			case imgdiff.StatusRemoved:
+				fmt.Printf("  ✖ REMOVED  %s\n", r.Name)
+			}
+		}
+		fmt.Println()
+	}
+}
--- a/tools/ods/internal/imgdiff/compare.go
+++ b/tools/ods/internal/imgdiff/compare.go
@@ -0,0 +1,321 @@
+package imgdiff
+
+import (
+	"fmt"
+	"image"
+	"image/color"
+	"image/png"
+	"math"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// Status classifies the outcome of comparing one screenshot against its baseline.
+type Status int
+
+const (
+	// StatusUnchanged: baseline and current images match (within threshold).
+	StatusUnchanged Status = iota
+	// StatusChanged: the images differ beyond the threshold.
+	StatusChanged
+	// StatusAdded: the image exists only in the current directory (no baseline).
+	StatusAdded
+	// StatusRemoved: the image exists only in the baseline directory (no current).
+	StatusRemoved
+)
+
+// String renders the status as a lowercase word, or "unknown" for any value
+// that is not one of the declared constants.
+func (s Status) String() string {
+	labels := [...]string{
+		StatusUnchanged: "unchanged",
+		StatusChanged:   "changed",
+		StatusAdded:     "added",
+		StatusRemoved:   "removed",
+	}
+	if s < 0 || int(s) >= len(labels) {
+		return "unknown"
+	}
+	return labels[s]
+}
+
+// Result holds the comparison result for a single screenshot. Values are
+// produced by Compare (for files present on both sides) and by
+// CompareDirectories (for added and removed files, which are never
+// pixel-compared and therefore carry no pixel statistics).
+type Result struct {
+	// Name is the base filename of the screenshot (e.g. "admin-documents-explorer.png").
+	Name string
+
+	// Status is the comparison status.
+	Status Status
+
+	// DiffPercent is DiffPixels as a percentage of TotalPixels (0.0 to 100.0).
+	DiffPercent float64
+
+	// DiffPixels is the number of pixels that differ beyond the threshold.
+	DiffPixels int
+
+	// TotalPixels is the number of pixels compared: the larger width times the
+	// larger height of the two images.
+	TotalPixels int
+
+	// BaselinePath is the path to the baseline image (empty if added).
+	BaselinePath string
+
+	// CurrentPath is the path to the current image (empty if removed).
+	CurrentPath string
+
+	// DiffImage is the generated diff overlay image. It is nil for added and
+	// removed entries; consumers should treat it as optional and nil-check it.
+	DiffImage image.Image
+}
+
+// Compare compares two PNG images pixel-by-pixel and returns the result.
+//
+// The threshold parameter (0.0 to 1.0) controls per-channel sensitivity: a
+// pixel is considered different if any of its 8-bit R, G, B or A values differs
+// by more than threshold * 255. Channel values are read through
+// color.Color.RGBA and truncated from 16 to 8 bits.
+//
+// When the images have different dimensions the comparison covers the larger
+// width and the larger height; coordinates that fall outside one of the images
+// are read as all-zero (transparent) for that image.
+//
+// The returned Result carries a DiffImage only when its Status is
+// StatusChanged. Unchanged results leave it nil so that callers do not retain
+// or encode an overlay that contains no highlighted pixels.
+//
+// An error is returned when either file cannot be opened or decoded as PNG.
+func Compare(baselinePath, currentPath string, threshold float64) (*Result, error) {
+	baseline, err := decodePNG(baselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode baseline %s: %w", baselinePath, err)
+	}
+
+	current, err := decodePNG(currentPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode current %s: %w", currentPath, err)
+	}
+
+	baselineBounds := baseline.Bounds()
+	currentBounds := current.Bounds()
+
+	// Loop-invariant dimensions, hoisted out of the per-pixel loop below.
+	baselineW, baselineH := baselineBounds.Dx(), baselineBounds.Dy()
+	currentW, currentH := currentBounds.Dx(), currentBounds.Dy()
+
+	// Use the larger dimensions to ensure we compare the full area
+	width := max(baselineW, currentW)
+	height := max(baselineH, currentH)
+	totalPixels := width * height
+
+	result := &Result{
+		Name:         filepath.Base(currentPath),
+		Status:       StatusUnchanged,
+		BaselinePath: baselinePath,
+		CurrentPath:  currentPath,
+	}
+
+	// Two empty images: nothing to compare, and dividing by zero must be avoided.
+	if totalPixels == 0 {
+		return result, nil
+	}
+
+	diffImage := image.NewRGBA(image.Rect(0, 0, width, height))
+	diffPixels := 0
+	thresholdValue := threshold * 255.0
+
+	for y := 0; y < height; y++ {
+		for x := 0; x < width; x++ {
+			// Get pixel from each image (transparent if out of bounds)
+			var br, bg, bb, ba uint32
+			var cr, cg, cb, ca uint32
+
+			if x < baselineW && y < baselineH {
+				br, bg, bb, ba = baseline.At(baselineBounds.Min.X+x, baselineBounds.Min.Y+y).RGBA()
+			}
+			if x < currentW && y < currentH {
+				cr, cg, cb, ca = current.At(currentBounds.Min.X+x, currentBounds.Min.Y+y).RGBA()
+			}
+
+			// Convert from 16-bit to 8-bit
+			br8 := float64(br >> 8)
+			bg8 := float64(bg >> 8)
+			bb8 := float64(bb >> 8)
+			ba8 := float64(ba >> 8)
+			cr8 := float64(cr >> 8)
+			cg8 := float64(cg >> 8)
+			cb8 := float64(cb >> 8)
+			ca8 := float64(ca >> 8)
+
+			// Check if channels differ beyond threshold
+			isDiff := math.Abs(br8-cr8) > thresholdValue ||
+				math.Abs(bg8-cg8) > thresholdValue ||
+				math.Abs(bb8-cb8) > thresholdValue ||
+				math.Abs(ba8-ca8) > thresholdValue
+
+			// SetRGBA stores the same bytes as Set would for a color.RGBA value
+			// but skips the per-pixel color-model conversion.
+			if isDiff {
+				diffPixels++
+				// Highlight in magenta for diff overlay
+				diffImage.SetRGBA(x, y, color.RGBA{R: 255, G: 0, B: 255, A: 255})
+			} else {
+				// Dim the unchanged pixel (30% of the current image, alpha floored at 50)
+				diffImage.SetRGBA(x, y, color.RGBA{
+					R: uint8(cr8 * 0.3),
+					G: uint8(cg8 * 0.3),
+					B: uint8(cb8 * 0.3),
+					A: uint8(math.Max(ca8*0.3, 50)),
+				})
+			}
+		}
+	}
+
+	result.TotalPixels = totalPixels
+	result.DiffPixels = diffPixels
+	result.DiffPercent = float64(diffPixels) / float64(totalPixels) * 100.0
+
+	// Only hand out the overlay when it actually highlights something.
+	if diffPixels > 0 {
+		result.Status = StatusChanged
+		result.DiffImage = diffImage
+	}
+
+	return result, nil
+}
+
+// CompareDirectories compares all PNG files in two directories.
+// Files are matched by base name. Files only in baseline are "removed",
+// files only in current are "added", and matching files are compared with
+// Compare using the given threshold.
+//
+// The returned slice is ordered deterministically: changed entries first (by
+// diff percentage descending), then added, removed and unchanged; entries that
+// tie on those criteria are ordered by name. A directory that does not exist
+// is treated as empty. The first comparison error aborts the whole run.
+func CompareDirectories(baselineDir, currentDir string, threshold float64) ([]Result, error) {
+	baselineFiles, err := listPNGs(baselineDir)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list baseline directory: %w", err)
+	}
+
+	currentFiles, err := listPNGs(currentDir)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list current directory: %w", err)
+	}
+
+	// Index the baseline by file name so current files can be matched in O(1).
+	baselineMap := make(map[string]string, len(baselineFiles))
+	for _, f := range baselineFiles {
+		baselineMap[filepath.Base(f)] = f
+	}
+
+	var results []Result
+
+	// Walk the current files: each one is either compared or reported as added.
+	currentNames := make(map[string]struct{}, len(currentFiles))
+	for _, currentPath := range currentFiles {
+		name := filepath.Base(currentPath)
+		currentNames[name] = struct{}{}
+
+		baselinePath, inBaseline := baselineMap[name]
+		if !inBaseline {
+			results = append(results, Result{
+				Name:        name,
+				Status:      StatusAdded,
+				CurrentPath: currentPath,
+			})
+			continue
+		}
+
+		result, err := Compare(baselinePath, currentPath, threshold)
+		if err != nil {
+			return nil, fmt.Errorf("failed to compare %s: %w", name, err)
+		}
+		results = append(results, *result)
+	}
+
+	// Whatever is left in the baseline without a current counterpart was removed.
+	for _, baselinePath := range baselineFiles {
+		name := filepath.Base(baselinePath)
+		if _, inCurrent := currentNames[name]; inCurrent {
+			continue
+		}
+		results = append(results, Result{
+			Name:         name,
+			Status:       StatusRemoved,
+			BaselinePath: baselinePath,
+		})
+	}
+
+	// Sort: changed first (by diff % descending), then added, removed, unchanged.
+	// sort.Slice is not stable, so every comparison falls through to the name as
+	// a final tie-break; otherwise changed entries with equal percentages would
+	// come out in an arbitrary order from run to run.
+	sort.Slice(results, func(i, j int) bool {
+		a, b := &results[i], &results[j]
+		if a.Status != b.Status {
+			return statusOrder(a.Status) < statusOrder(b.Status)
+		}
+		if a.Status == StatusChanged && a.DiffPercent != b.DiffPercent {
+			return a.DiffPercent > b.DiffPercent
+		}
+		return a.Name < b.Name
+	})
+
+	return results, nil
+}
+
+// SaveDiffImage writes a diff overlay image to the specified path as PNG,
+// creating parent directories as needed.
+//
+// It returns an error if img is nil, if the directory or file cannot be
+// created, if PNG encoding fails, or if closing the file fails (a failed close
+// can mean the data never reached disk, so it is not ignored).
+func SaveDiffImage(img image.Image, path string) (err error) {
+	// Reject a nil image up front: png.Encode would panic on it, and only after
+	// an empty file had already been created.
+	if img == nil {
+		return fmt.Errorf("failed to encode PNG: nil image for %s", path)
+	}
+
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return fmt.Errorf("failed to create directory: %w", err)
+	}
+
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("failed to create file: %w", err)
+	}
+	defer func() {
+		// Surface the close error unless an earlier error is already being returned.
+		if closeErr := f.Close(); closeErr != nil && err == nil {
+			err = fmt.Errorf("failed to close file: %w", closeErr)
+		}
+	}()
+
+	if err := png.Encode(f, img); err != nil {
+		return fmt.Errorf("failed to encode PNG: %w", err)
+	}
+
+	return nil
+}
+
+// decodePNG opens the file at path and decodes its contents as a PNG image.
+// Open and decode errors are returned as-is; the file is always closed.
+func decodePNG(path string) (image.Image, error) {
+	src, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, openErr
+	}
+	defer func() { _ = src.Close() }()
+
+	decoded, decodeErr := png.Decode(src)
+	if decodeErr == nil {
+		return decoded, nil
+	}
+	return nil, decodeErr
+}
+
+// listPNGs returns the paths of the regular entries directly inside dir whose
+// names end in ".png" (case-insensitive). Subdirectories are not descended
+// into, and a directory that does not exist yields (nil, nil).
+func listPNGs(dir string) ([]string, error) {
+	entries, err := os.ReadDir(dir)
+	switch {
+	case err == nil:
+		// fall through to the listing below
+	case os.IsNotExist(err):
+		return nil, nil
+	default:
+		return nil, err
+	}
+
+	var matches []string
+	for _, e := range entries {
+		name := e.Name()
+		hasPNGSuffix := strings.HasSuffix(strings.ToLower(name), ".png")
+		if hasPNGSuffix && !e.IsDir() {
+			matches = append(matches, filepath.Join(dir, name))
+		}
+	}
+
+	return matches, nil
+}
+
+// statusOrder maps a status to its sort rank: changed (0), added (1),
+// removed (2), unchanged (3). Any other value ranks last (4).
+func statusOrder(s Status) int {
+	rank := 4
+	switch s {
+	case StatusChanged:
+		rank = 0
+	case StatusAdded:
+		rank = 1
+	case StatusRemoved:
+		rank = 2
+	case StatusUnchanged:
+		rank = 3
+	}
+	return rank
+}
--- a/tools/ods/internal/imgdiff/compare_test.go
+++ b/tools/ods/internal/imgdiff/compare_test.go
@@ -0,0 +1,309 @@
+package imgdiff
+
+import (
+	"image"
+	"image/color"
+	"image/png"
+	"os"
+	"path/filepath"
+	"strings"
+	"testing"
+)
+
+// createTestPNG writes a width x height PNG filled with the single color c to
+// path, creating parent directories as needed. Any failure aborts the test.
+func createTestPNG(t *testing.T, path string, width, height int, c color.Color) {
+	t.Helper()
+
+	parent := filepath.Dir(path)
+	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
+		t.Fatalf("failed to create dir: %v", mkErr)
+	}
+
+	// Paint every pixel with the requested color.
+	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
+	for col := 0; col < width; col++ {
+		for row := 0; row < height; row++ {
+			canvas.Set(col, row, c)
+		}
+	}
+
+	out, createErr := os.Create(path)
+	if createErr != nil {
+		t.Fatalf("failed to create file: %v", createErr)
+	}
+	defer func() { _ = out.Close() }()
+
+	if encodeErr := png.Encode(out, canvas); encodeErr != nil {
+		t.Fatalf("failed to encode PNG: %v", encodeErr)
+	}
+}
+
+// createTestPNGWithBlock writes a width x height PNG to path whose background
+// is bg, except for the bw x bh rectangle with top-left corner (bx, by), which
+// is painted with block. Any failure aborts the test.
+func createTestPNGWithBlock(t *testing.T, path string, width, height int, bg, block color.Color, bx, by, bw, bh int) {
+	t.Helper()
+
+	parent := filepath.Dir(path)
+	if mkErr := os.MkdirAll(parent, 0755); mkErr != nil {
+		t.Fatalf("failed to create dir: %v", mkErr)
+	}
+
+	// Built as a literal (not image.Rect) so the corners are never swapped.
+	blockArea := image.Rectangle{Min: image.Pt(bx, by), Max: image.Pt(bx+bw, by+bh)}
+
+	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
+	for col := 0; col < width; col++ {
+		for row := 0; row < height; row++ {
+			fill := bg
+			if image.Pt(col, row).In(blockArea) {
+				fill = block
+			}
+			canvas.Set(col, row, fill)
+		}
+	}
+
+	out, createErr := os.Create(path)
+	if createErr != nil {
+		t.Fatalf("failed to create file: %v", createErr)
+	}
+	defer func() { _ = out.Close() }()
+
+	if encodeErr := png.Encode(out, canvas); encodeErr != nil {
+		t.Fatalf("failed to encode PNG: %v", encodeErr)
+	}
+}
+
+// TestCompare_IdenticalImages checks that two pixel-identical images are
+// reported as unchanged with no differing pixels.
+func TestCompare_IdenticalImages(t *testing.T) {
+	tmp := t.TempDir()
+	baselinePath := filepath.Join(tmp, "baseline.png")
+	currentPath := filepath.Join(tmp, "current.png")
+
+	// Both sides get the same solid white 100x100 image.
+	for _, p := range []string{baselinePath, currentPath} {
+		createTestPNG(t, p, 100, 100, color.RGBA{R: 255, G: 255, B: 255, A: 255})
+	}
+
+	got, err := Compare(baselinePath, currentPath, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	if status := got.Status; status != StatusUnchanged {
+		t.Errorf("expected StatusUnchanged, got %s", status)
+	}
+	if pct := got.DiffPercent; pct != 0.0 {
+		t.Errorf("expected 0%% diff, got %.2f%%", pct)
+	}
+	if n := got.DiffPixels; n != 0 {
+		t.Errorf("expected 0 diff pixels, got %d", n)
+	}
+	if n := got.TotalPixels; n != 10000 {
+		t.Errorf("expected 10000 total pixels, got %d", n)
+	}
+}
+
+func TestCompare_DifferentImages(t *testing.T) {
+	dir := t.TempDir()
+	baselinePath := filepath.Join(dir, "baseline.png")
+	currentPath := filepath.Join(dir, "current.png")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	red := color.RGBA{R: 255, G: 0, B: 0, A: 255}
+
+	// Baseline: all white
+	createTestPNG(t, baselinePath, 100, 100, white)
+	// Current: white with a 10x10 red block (100 pixels different)
+	createTestPNGWithBlock(t, currentPath, 100, 100, white, red, 0, 0, 10, 10)
+
+	result, err := Compare(baselinePath, currentPath, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	if result.Status != StatusChanged {
+		t.Errorf("expected StatusChanged, got %s", result.Status)
+	}
+	if result.DiffPixels != 100 {
+		t.Errorf("expected 100 diff pixels, got %d", result.DiffPixels)
+	}
+	if result.DiffPercent != 1.0 {
+		t.Errorf("expected 1.0%% diff, got %.2f%%", result.DiffPercent)
+	}
+	if result.DiffImage == nil {
+		t.Error("expected non-nil DiffImage")
+	}
+}
+
+// TestCompare_SubtleDifferenceBelowThreshold checks that a per-channel
+// difference smaller than threshold * 255 does not count as a change.
+func TestCompare_SubtleDifferenceBelowThreshold(t *testing.T) {
+	tmp := t.TempDir()
+	baselinePath := filepath.Join(tmp, "baseline.png")
+	currentPath := filepath.Join(tmp, "current.png")
+
+	// The red channel differs by 10; green, blue and alpha are identical.
+	createTestPNG(t, baselinePath, 10, 10, color.RGBA{R: 200, G: 200, B: 200, A: 255})
+	createTestPNG(t, currentPath, 10, 10, color.RGBA{R: 210, G: 200, B: 200, A: 255})
+
+	// A threshold of 0.2 tolerates a channel difference of up to 51, so 10 is ignored.
+	got, err := Compare(baselinePath, currentPath, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	if status := got.Status; status != StatusUnchanged {
+		t.Errorf("expected StatusUnchanged (diff below threshold), got %s", status)
+	}
+	if n := got.DiffPixels; n != 0 {
+		t.Errorf("expected 0 diff pixels (below threshold), got %d", n)
+	}
+}
+
+// TestCompare_DifferentSizes checks that images of different heights are
+// compared over the larger area and that the rows present in only one image
+// are counted as differing pixels.
+func TestCompare_DifferentSizes(t *testing.T) {
+	dir := t.TempDir()
+	baselinePath := filepath.Join(dir, "baseline.png")
+	currentPath := filepath.Join(dir, "current.png")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	createTestPNG(t, baselinePath, 100, 100, white)
+	createTestPNG(t, currentPath, 100, 120, white) // Taller
+
+	result, err := Compare(baselinePath, currentPath, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	// The extra 20 rows (2000 pixels) should be "different" (white vs transparent/zero)
+	if result.Status != StatusChanged {
+		t.Errorf("expected StatusChanged for different sizes, got %s", result.Status)
+	}
+	if result.TotalPixels != 12000 { // 100*120
+		t.Errorf("expected 12000 total pixels, got %d", result.TotalPixels)
+	}
+	// Pin the count itself: the shared 100x100 area is identical, so only the
+	// 100x20 strip that exists in the current image may be flagged.
+	if result.DiffPixels != 2000 { // 100*20
+		t.Errorf("expected 2000 diff pixels, got %d", result.DiffPixels)
+	}
+}
+
+// TestCompareDirectories builds a baseline and a current directory holding one
+// unchanged, one changed, one removed and one added screenshot, and checks
+// both the per-status counts and the complete ordering of the results.
+func TestCompareDirectories(t *testing.T) {
+	baselineDir := filepath.Join(t.TempDir(), "baseline")
+	currentDir := filepath.Join(t.TempDir(), "current")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	red := color.RGBA{R: 255, G: 0, B: 0, A: 255}
+	blue := color.RGBA{R: 0, G: 0, B: 255, A: 255}
+
+	// shared-unchanged.png: identical in both
+	createTestPNG(t, filepath.Join(baselineDir, "shared-unchanged.png"), 10, 10, white)
+	createTestPNG(t, filepath.Join(currentDir, "shared-unchanged.png"), 10, 10, white)
+
+	// shared-changed.png: different in both
+	createTestPNG(t, filepath.Join(baselineDir, "shared-changed.png"), 10, 10, white)
+	createTestPNG(t, filepath.Join(currentDir, "shared-changed.png"), 10, 10, red)
+
+	// removed.png: only in baseline
+	createTestPNG(t, filepath.Join(baselineDir, "removed.png"), 10, 10, white)
+
+	// added.png: only in current
+	createTestPNG(t, filepath.Join(currentDir, "added.png"), 10, 10, blue)
+
+	results, err := CompareDirectories(baselineDir, currentDir, 0.2)
+	if err != nil {
+		t.Fatalf("CompareDirectories failed: %v", err)
+	}
+
+	if len(results) != 4 {
+		t.Fatalf("expected 4 results, got %d", len(results))
+	}
+
+	statusCounts := map[Status]int{}
+	for _, r := range results {
+		statusCounts[r.Status]++
+	}
+
+	if statusCounts[StatusChanged] != 1 {
+		t.Errorf("expected 1 changed, got %d", statusCounts[StatusChanged])
+	}
+	if statusCounts[StatusAdded] != 1 {
+		t.Errorf("expected 1 added, got %d", statusCounts[StatusAdded])
+	}
+	if statusCounts[StatusRemoved] != 1 {
+		t.Errorf("expected 1 removed, got %d", statusCounts[StatusRemoved])
+	}
+	if statusCounts[StatusUnchanged] != 1 {
+		t.Errorf("expected 1 unchanged, got %d", statusCounts[StatusUnchanged])
+	}
+
+	// Results should be sorted: changed first, then added, removed, unchanged.
+	// Check every position, not just the first one.
+	wantOrder := []string{
+		"shared-changed.png",
+		"added.png",
+		"removed.png",
+		"shared-unchanged.png",
+	}
+	for i, want := range wantOrder {
+		if results[i].Name != want {
+			t.Errorf("expected results[%d] to be %s, got %s (%s)", i, want, results[i].Name, results[i].Status)
+		}
+	}
+}
+
+// TestCompareDirectories_EmptyBaseline checks that a screenshot with no
+// counterpart in an existing but empty baseline directory is reported as added.
+func TestCompareDirectories_EmptyBaseline(t *testing.T) {
+	emptyBaseline := filepath.Join(t.TempDir(), "baseline")
+	populatedCurrent := filepath.Join(t.TempDir(), "current")
+
+	if mkErr := os.MkdirAll(emptyBaseline, 0755); mkErr != nil {
+		t.Fatal(mkErr)
+	}
+
+	// Only the current side receives a screenshot.
+	newShot := filepath.Join(populatedCurrent, "new.png")
+	createTestPNG(t, newShot, 10, 10, color.RGBA{R: 255, G: 255, B: 255, A: 255})
+
+	got, err := CompareDirectories(emptyBaseline, populatedCurrent, 0.2)
+	if err != nil {
+		t.Fatalf("CompareDirectories failed: %v", err)
+	}
+
+	if n := len(got); n != 1 {
+		t.Fatalf("expected 1 result, got %d", n)
+	}
+	if status := got[0].Status; status != StatusAdded {
+		t.Errorf("expected StatusAdded, got %s", status)
+	}
+}
+
+// TestGenerateReport runs a one-screenshot comparison through GenerateReport
+// and checks that the resulting HTML file exists, is non-empty, and contains
+// the title, an inline PNG data URI, the screenshot name and its status.
+func TestGenerateReport(t *testing.T) {
+	root := t.TempDir()
+	baselineDir := filepath.Join(root, "baseline")
+	currentDir := filepath.Join(root, "current")
+
+	// Same file name on both sides, white versus red, so the entry is "changed".
+	createTestPNG(t, filepath.Join(baselineDir, "page.png"), 50, 50, color.RGBA{R: 255, G: 255, B: 255, A: 255})
+	createTestPNG(t, filepath.Join(currentDir, "page.png"), 50, 50, color.RGBA{R: 255, G: 0, B: 0, A: 255})
+
+	results, err := CompareDirectories(baselineDir, currentDir, 0.2)
+	if err != nil {
+		t.Fatalf("CompareDirectories failed: %v", err)
+	}
+
+	reportPath := filepath.Join(root, "report", "index.html")
+	if genErr := GenerateReport(results, reportPath); genErr != nil {
+		t.Fatalf("GenerateReport failed: %v", genErr)
+	}
+
+	// Verify the file was created and has content
+	stat, err := os.Stat(reportPath)
+	if err != nil {
+		t.Fatalf("report file not found: %v", err)
+	}
+	if stat.Size() == 0 {
+		t.Error("report file is empty")
+	}
+
+	// Verify it contains expected HTML elements
+	raw, err := os.ReadFile(reportPath)
+	if err != nil {
+		t.Fatalf("failed to read report: %v", err)
+	}
+	html := string(raw)
+
+	wanted := []string{
+		"Visual Regression Report",
+		"data:image/png;base64,",
+		"page.png",
+		"changed",
+	}
+	for _, fragment := range wanted {
+		if contains(html, fragment) {
+			continue
+		}
+		t.Errorf("report missing expected content: %q", fragment)
+	}
+}
+
+// contains reports whether substr occurs within s. It delegates to the
+// standard library instead of a hand-rolled byte scan.
+func contains(s, substr string) bool {
+	return strings.Contains(s, substr)
+}
--- a/tools/ods/internal/imgdiff/report.go
+++ b/tools/ods/internal/imgdiff/report.go
@@ -0,0 +1,345 @@
+package imgdiff
+
+import (
+	"bytes"
+	"encoding/base64"
+	"fmt"
+	"html/template"
+	"image"
+	"image/png"
+	"os"
+	"path/filepath"
+)
+
+// reportEntry holds data for a single screenshot in the HTML template.
+// GenerateReport builds one entry per Result.
+type reportEntry struct {
+	// Name is the screenshot file name, copied from Result.Name.
+	Name string
+	// Status is the textual form of Result.Status; the template branches on it.
+	Status string
+	// DiffPercent is the pre-formatted percentage shown in the card badge
+	// (e.g. "1.25%"); it is left empty for added and removed entries.
+	DiffPercent string
+	// The three data URIs carry the images inline as base64 PNG. They are typed
+	// template.URL so html/template emits them verbatim in src attributes.
+	BaselineDataURI template.URL
+	CurrentDataURI  template.URL
+	DiffDataURI     template.URL
+	// HasBaseline, HasCurrent and HasDiff report whether the corresponding data
+	// URI above was populated.
+	HasBaseline bool
+	HasCurrent  bool
+	HasDiff     bool
+}
+
+// reportData holds all data for the HTML template: the per-screenshot entries
+// plus the aggregate counts shown in the header and summary cards.
+type reportData struct {
+	// Entries lists every screenshot in the order the results were supplied.
+	Entries []reportEntry
+	// Per-status tallies over Entries.
+	ChangedCount   int
+	AddedCount     int
+	RemovedCount   int
+	UnchangedCount int
+	// TotalCount is the number of results passed to GenerateReport.
+	TotalCount int
+	// HasDifferences is true when any entry is changed, added, or removed; the
+	// template shows the "no visual changes" banner when it is false.
+	HasDifferences bool
+}
+
+// GenerateReport produces a self-contained HTML file from comparison results.
+// Images of changed, added and removed screenshots are base64-encoded inline
+// as data URIs. Unchanged screenshots are listed by name only, so their images
+// are neither read nor encoded.
+//
+// Parent directories of outputPath are created as needed. An error is returned
+// if an image cannot be read or encoded, if the template fails to parse or
+// execute, or if the output file cannot be created, written, or closed.
+func GenerateReport(results []Result, outputPath string) (err error) {
+	if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil {
+		return fmt.Errorf("failed to create output directory: %w", err)
+	}
+
+	data := reportData{TotalCount: len(results)}
+
+	for _, r := range results {
+		entry := reportEntry{
+			Name:   r.Name,
+			Status: r.Status.String(),
+		}
+
+		switch r.Status {
+		case StatusChanged:
+			data.ChangedCount++
+			entry.DiffPercent = fmt.Sprintf("%.2f%%", r.DiffPercent)
+		case StatusAdded:
+			data.AddedCount++
+		case StatusRemoved:
+			data.RemovedCount++
+		case StatusUnchanged:
+			data.UnchangedCount++
+			entry.DiffPercent = "0.00%"
+		}
+
+		// The template renders unchanged entries as a bare name, so reading and
+		// encoding their images would only cost I/O, CPU and memory.
+		if r.Status != StatusUnchanged {
+			if err := embedEntryImages(&entry, r); err != nil {
+				return err
+			}
+		}
+
+		data.Entries = append(data.Entries, entry)
+	}
+
+	data.HasDifferences = data.ChangedCount > 0 || data.AddedCount > 0 || data.RemovedCount > 0
+
+	tmpl, err := template.New("report").Parse(htmlTemplate)
+	if err != nil {
+		return fmt.Errorf("failed to parse template: %w", err)
+	}
+
+	f, err := os.Create(outputPath)
+	if err != nil {
+		return fmt.Errorf("failed to create output file: %w", err)
+	}
+	defer func() {
+		// A failed close can mean the report never fully reached disk; report it
+		// unless an earlier error is already being returned.
+		if closeErr := f.Close(); closeErr != nil && err == nil {
+			err = fmt.Errorf("failed to close output file: %w", closeErr)
+		}
+	}()
+
+	if err := tmpl.Execute(f, data); err != nil {
+		return fmt.Errorf("failed to execute template: %w", err)
+	}
+
+	return nil
+}
+
+// embedEntryImages fills in the baseline, current and diff data URIs of entry
+// from whichever images r provides, setting the matching Has* flags.
+func embedEntryImages(entry *reportEntry, r Result) error {
+	if r.BaselinePath != "" {
+		uri, err := pngFileToDataURI(r.BaselinePath)
+		if err != nil {
+			return fmt.Errorf("failed to encode baseline %s: %w", r.Name, err)
+		}
+		entry.BaselineDataURI = template.URL(uri)
+		entry.HasBaseline = true
+	}
+
+	if r.CurrentPath != "" {
+		uri, err := pngFileToDataURI(r.CurrentPath)
+		if err != nil {
+			return fmt.Errorf("failed to encode current %s: %w", r.Name, err)
+		}
+		entry.CurrentDataURI = template.URL(uri)
+		entry.HasCurrent = true
+	}
+
+	if r.DiffImage != nil {
+		uri, err := imageToDataURI(r.DiffImage)
+		if err != nil {
+			return fmt.Errorf("failed to encode diff %s: %w", r.Name, err)
+		}
+		entry.DiffDataURI = template.URL(uri)
+		entry.HasDiff = true
+	}
+
+	return nil
+}
+
+// pngFileToDataURI loads the file at path and returns its bytes as a
+// "data:image/png;base64," URI. The content is not validated as PNG; a read
+// error is returned unchanged together with an empty string.
+func pngFileToDataURI(path string) (string, error) {
+	raw, readErr := os.ReadFile(path)
+	if readErr != nil {
+		return "", readErr
+	}
+
+	// Encode into a pre-sized buffer, then prepend the scheme.
+	payload := make([]byte, base64.StdEncoding.EncodedLen(len(raw)))
+	base64.StdEncoding.Encode(payload, raw)
+	return "data:image/png;base64," + string(payload), nil
+}
+
+// imageToDataURI PNG-encodes img and returns it as a
+// "data:image/png;base64," URI. An encoding error yields an empty string.
+func imageToDataURI(img image.Image) (string, error) {
+	var uri bytes.Buffer
+	uri.WriteString("data:image/png;base64,")
+
+	// Stream the PNG bytes through the base64 encoder straight into the URI;
+	// Close flushes the final (padded) group.
+	enc := base64.NewEncoder(base64.StdEncoding, &uri)
+	if err := png.Encode(enc, img); err != nil {
+		return "", err
+	}
+	if err := enc.Close(); err != nil {
+		return "", err
+	}
+
+	return uri.String(), nil
+}
+
+const htmlTemplate = `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Visual Regression Report</title>
+<style>
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #f5f5f5; color: #333; }
+  .header { background: #1a1a2e; color: #fff; padding: 24px 32px; }
+  .header h1 { font-size: 24px; font-weight: 600; }
+  .header p { margin-top: 8px; opacity: 0.8; font-size: 14px; }
+  .summary { display: flex; gap: 16px; padding: 20px 32px; background: #fff; border-bottom: 1px solid #e0e0e0; flex-wrap: wrap; }
+  .summary-card { padding: 12px 20px; border-radius: 8px; font-size: 14px; font-weight: 500; }
+  .summary-changed { background: #fff3e0; color: #e65100; }
+  .summary-added { background: #e8f5e9; color: #2e7d32; }
+  .summary-removed { background: #fce4ec; color: #c62828; }
+  .summary-unchanged { background: #e3f2fd; color: #1565c0; }
+  .content { padding: 24px 32px; max-width: 1400px; margin: 0 auto; }
+  .section-title { font-size: 18px; font-weight: 600; margin: 24px 0 16px; padding-bottom: 8px; border-bottom: 2px solid #e0e0e0; }
+  .no-changes { text-align: center; padding: 60px 20px; color: #666; }
+  .no-changes h2 { font-size: 24px; margin-bottom: 8px; color: #2e7d32; }
+  .card { background: #fff; border-radius: 12px; margin-bottom: 24px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); overflow: hidden; }
+  .card-header { display: flex; justify-content: space-between; align-items: center; padding: 16px 20px; border-bottom: 1px solid #eee; }
+  .card-name { font-weight: 600; font-size: 15px; }
+  .card-badge { font-size: 12px; padding: 4px 10px; border-radius: 12px; font-weight: 500; }
+  .badge-changed { background: #fff3e0; color: #e65100; }
+  .badge-added { background: #e8f5e9; color: #2e7d32; }
+  .badge-removed { background: #fce4ec; color: #c62828; }
+  .tabs { display: flex; gap: 0; border-bottom: 1px solid #eee; }
+  .tab { padding: 10px 20px; cursor: pointer; font-size: 13px; font-weight: 500; color: #666; border-bottom: 2px solid transparent; transition: all 0.2s; }
+  .tab:hover { color: #333; background: #f9f9f9; }
+  .tab.active { color: #1a1a2e; border-bottom-color: #1a1a2e; }
+  .tab-content { display: none; padding: 20px; }
+  .tab-content.active { display: block; }
+  .slider-container { position: relative; overflow: hidden; cursor: ew-resize; user-select: none; border: 1px solid #eee; border-radius: 4px; }
+  .slider-container > img { display: block; width: 100%; height: auto; }
+  .slider-baseline { position: absolute; top: 0; left: 0; width: 100%; height: 100%; clip-path: inset(0 50% 0 0); }
+  .slider-baseline img { display: block; width: 100%; height: auto; }
+  .slider-divider { position: absolute; top: 0; width: 3px; height: 100%; background: #e65100; z-index: 10; cursor: ew-resize; }
+  .slider-divider::before { content: ""; position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); width: 32px; height: 32px; background: #e65100; border-radius: 50%; border: 2px solid #fff; box-shadow: 0 2px 8px rgba(0,0,0,0.3); }
+  .slider-divider::after { content: "\2194"; position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); color: #fff; font-size: 16px; z-index: 1; }
+  .slider-label { position: absolute; top: 10px; padding: 4px 10px; background: rgba(0,0,0,0.6); color: #fff; font-size: 11px; border-radius: 4px; z-index: 5; pointer-events: none; }
+  .slider-label-left { left: 10px; }
+  .slider-label-right { right: 10px; }
+  .side-by-side { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+  .side-by-side .img-container { border: 1px solid #eee; border-radius: 4px; overflow: hidden; }
+  .side-by-side .img-label { font-size: 12px; font-weight: 500; padding: 8px 12px; background: #f5f5f5; color: #666; }
+  .side-by-side img { display: block; width: 100%; height: auto; }
+  .diff-overlay img { display: block; max-width: 100%; height: auto; border: 1px solid #eee; border-radius: 4px; }
+  .single-image img { display: block; max-width: 100%; height: auto; border: 1px solid #eee; border-radius: 4px; }
+  .unchanged-section { margin-top: 32px; }
+  .unchanged-toggle { cursor: pointer; font-size: 14px; color: #666; padding: 12px 0; }
+  .unchanged-toggle:hover { color: #333; }
+  .unchanged-list { display: none; }
+  .unchanged-list.open { display: block; }
+  .unchanged-item { padding: 8px 0; font-size: 13px; color: #888; border-bottom: 1px solid #f0f0f0; }
+</style>
+</head>
+<body>
+
+<div class="header">
+  <h1>Visual Regression Report</h1>
+  <p>{{.TotalCount}} screenshot{{if ne .TotalCount 1}}s{{end}} compared</p>
+</div>
+
+<div class="summary">
+  {{if gt .ChangedCount 0}}<div class="summary-card summary-changed">{{.ChangedCount}} Changed</div>{{end}}
+  {{if gt .AddedCount 0}}<div class="summary-card summary-added">{{.AddedCount}} Added</div>{{end}}
+  {{if gt .RemovedCount 0}}<div class="summary-card summary-removed">{{.RemovedCount}} Removed</div>{{end}}
+  <div class="summary-card summary-unchanged">{{.UnchangedCount}} Unchanged</div>
+</div>
+
+<div class="content">
+{{if not .HasDifferences}}
+  <div class="no-changes">
+    <h2>No visual changes detected</h2>
+    <p>All {{.TotalCount}} screenshots match their baselines.</p>
+  </div>
+{{end}}
+
+{{range .Entries}}
+{{if eq .Status "changed"}}
+<div class="card">
+  <div class="card-header">
+    <span class="card-name">{{.Name}}</span>
+    <span class="card-badge badge-changed">{{.DiffPercent}} changed</span>
+  </div>
+  <div class="tabs">
+    <div class="tab active" onclick="switchTab(this, 'slider')">Slider</div>
+    <div class="tab" onclick="switchTab(this, 'sidebyside')">Side by Side</div>
+    <div class="tab" onclick="switchTab(this, 'diff')">Diff Overlay</div>
+  </div>
+  <div class="tab-content active" data-tab="slider">
+    <div class="slider-container" onmousedown="startSlider(event, this)" onmousemove="moveSlider(event, this)" ontouchstart="startSlider(event, this)" ontouchmove="moveSlider(event, this)">
+      <img src="{{.CurrentDataURI}}" alt="Current" draggable="false">
+      <div class="slider-baseline">
+        <img src="{{.BaselineDataURI}}" alt="Baseline" draggable="false">
+      </div>
+      <div class="slider-divider" style="left: calc(50% - 1.5px);"></div>
+      <span class="slider-label slider-label-left">Baseline</span>
+      <span class="slider-label slider-label-right">Current</span>
+    </div>
+  </div>
+  <div class="tab-content" data-tab="sidebyside">
+    <div class="side-by-side">
+      <div class="img-container">
+        <div class="img-label">Baseline</div>
+        <img src="{{.BaselineDataURI}}" alt="Baseline">
+      </div>
+      <div class="img-container">
+        <div class="img-label">Current</div>
+        <img src="{{.CurrentDataURI}}" alt="Current">
+      </div>
+    </div>
+  </div>
+  <div class="tab-content" data-tab="diff">
+    <div class="diff-overlay">
+      {{if .HasDiff}}<img src="{{.DiffDataURI}}" alt="Diff overlay">{{end}}
+    </div>
+  </div>
+</div>
+{{end}}
+
+{{if eq .Status "added"}}
+<div class="card">
+  <div class="card-header">
+    <span class="card-name">{{.Name}}</span>
+    <span class="card-badge badge-added">added</span>
+  </div>
+  <div class="tab-content active" data-tab="single">
+    <div class="single-image">
+      {{if .HasCurrent}}<img src="{{.CurrentDataURI}}" alt="New screenshot">{{end}}
+    </div>
+  </div>
+</div>
+{{end}}
+
+{{if eq .Status "removed"}}
+<div class="card">
+  <div class="card-header">
+    <span class="card-name">{{.Name}}</span>
+    <span class="card-badge badge-removed">removed</span>
+  </div>
+  <div class="tab-content active" data-tab="single">
+    <div class="single-image">
+      {{if .HasBaseline}}<img src="{{.BaselineDataURI}}" alt="Removed screenshot">{{end}}
+    </div>
+  </div>
+</div>
+{{end}}
+{{end}}
+
+{{if gt .UnchangedCount 0}}
+<div class="unchanged-section">
+  <div class="unchanged-toggle" onclick="toggleUnchanged(this)">
+    &#9654; {{.UnchangedCount}} unchanged screenshot{{if ne .UnchangedCount 1}}s{{end}} (click to expand)
+  </div>
+  <div class="unchanged-list">
+    {{range .Entries}}{{if eq .Status "unchanged"}}<div class="unchanged-item">{{.Name}}</div>{{end}}{{end}}
+  </div>
+</div>
+{{end}}
+
+</div>
+
+<script>
+// Tab switching
+function switchTab(tabEl, tabName) {
+  const card = tabEl.closest('.card');
+  card.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+  card.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+  tabEl.classList.add('active');
+  card.querySelector('[data-tab="' + tabName + '"]').classList.add('active');
+}
+
+// Slider interaction
+let sliderActive = false;
+
+function startSlider(e, container) {
+  sliderActive = true;
+  moveSlider(e, container);
+  const stopSlider = function() { sliderActive = false; };
+  document.addEventListener('mouseup', stopSlider, { once: true });
+  document.addEventListener('touchend', stopSlider, { once: true });
+}
+
+function moveSlider(e, container) {
+  if (!sliderActive) return;
+  e.preventDefault();
+  const rect = container.getBoundingClientRect();
+  const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+  let x = clientX - rect.left;
+  x = Math.max(0, Math.min(x, rect.width));
+  const percent = (x / rect.width) * 100;
+  const clipRight = 100 - percent;
+  container.querySelector('.slider-baseline').style.clipPath = 'inset(0 ' + clipRight + '% 0 0)';
+  container.querySelector('.slider-divider').style.left = 'calc(' + percent + '% - 1.5px)';
+}
+
+// Unchanged section toggle
+function toggleUnchanged(el) {
+  const list = el.nextElementSibling;
+  const isOpen = list.classList.toggle('open');
+  el.innerHTML = (isOpen ? '&#9660;' : '&#9654;') + ' {{.UnchangedCount}} unchanged screenshot{{if ne .UnchangedCount 1}}s{{end}} (click to ' + (isOpen ? 'collapse' : 'expand') + ')';
+}
+</script>
+</body>
+</html>`
--- a/tools/ods/internal/imgdiff/summary.go
+++ b/tools/ods/internal/imgdiff/summary.go
@@ -0,0 +1,60 @@
+package imgdiff
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// Summary holds aggregate comparison results in a JSON-friendly format.
+// It is written alongside the HTML report so that CI pipelines can read it
+// without parsing HTML.
+type Summary struct {
+	// Project is the project name passed to BuildSummary.
+	Project        string `json:"project"`
+	// Changed is the number of results with StatusChanged.
+	Changed        int    `json:"changed"`
+	// Added is the number of results with StatusAdded.
+	Added          int    `json:"added"`
+	// Removed is the number of results with StatusRemoved.
+	Removed        int    `json:"removed"`
+	// Unchanged is the number of results with StatusUnchanged.
+	Unchanged      int    `json:"unchanged"`
+	// Total is the number of results given to BuildSummary, whatever their status.
+	Total          int    `json:"total"`
+	// HasDifferences is true when Changed, Added or Removed is non-zero.
+	HasDifferences bool   `json:"has_differences"`
+}
+
+// BuildSummary tallies results by status into a Summary for the given project.
+// Total counts every result, including any whose status is not one of the
+// four tallied below. HasDifferences reports whether at least one result was
+// changed, added or removed.
+func BuildSummary(project string, results []Result) Summary {
+	summary := Summary{
+		Project: project,
+		Total:   len(results),
+	}
+
+	for i := range results {
+		switch status := results[i].Status; status {
+		case StatusUnchanged:
+			summary.Unchanged++
+		case StatusRemoved:
+			summary.Removed++
+		case StatusAdded:
+			summary.Added++
+		case StatusChanged:
+			summary.Changed++
+		}
+	}
+
+	// The counters are never negative, so a positive sum means at least one is non-zero.
+	summary.HasDifferences = summary.Changed+summary.Added+summary.Removed > 0
+	return summary
+}
+
+// WriteSummary serializes summary as two-space-indented JSON and stores it at
+// path with mode 0644. Missing parent directories are created first with mode
+// 0755. Any failure is returned wrapped with the step that failed.
+func WriteSummary(summary Summary, path string) error {
+	parentDir := filepath.Dir(path)
+	if mkdirErr := os.MkdirAll(parentDir, 0755); mkdirErr != nil {
+		return fmt.Errorf("failed to create directory for summary: %w", mkdirErr)
+	}
+
+	encoded, marshalErr := json.MarshalIndent(summary, "", "  ")
+	if marshalErr != nil {
+		return fmt.Errorf("failed to marshal summary: %w", marshalErr)
+	}
+
+	writeErr := os.WriteFile(path, encoded, 0644)
+	if writeErr != nil {
+		return fmt.Errorf("failed to write summary: %w", writeErr)
+	}
+	return nil
+}
--- a/tools/ods/internal/s3/sync.go
+++ b/tools/ods/internal/s3/sync.go
@@ -0,0 +1,49 @@
+package s3
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+
+	log "github.com/sirupsen/logrus"
+)
+
+// SyncDown downloads an S3 prefix to a local directory using AWS CLI.
+// destDir is created (mode 0755) if it does not exist yet.
+// This is equivalent to: aws s3 sync <s3url> <destDir>
+//
+// It returns an error if the directory cannot be created, if the AWS CLI is
+// not installed, or if the sync command exits unsuccessfully.
+func SyncDown(s3url string, destDir string) error {
+	if err := os.MkdirAll(destDir, 0755); err != nil {
+		return fmt.Errorf("failed to create destination directory: %w", err)
+	}
+
+	log.Infof("Downloading from %s to %s ...", s3url, destDir)
+	return runAWSS3Sync(s3url, destDir)
+}
+
+// SyncUp uploads a local directory to an S3 prefix using AWS CLI.
+// If deleteExtra is true, files in S3 that don't exist locally are removed.
+// This is equivalent to: aws s3 sync <srcDir> <s3url> [--delete]
+//
+// The flag is named deleteExtra rather than delete so that it does not shadow
+// Go's builtin delete function inside this function.
+func SyncUp(srcDir string, s3url string, deleteExtra bool) error {
+	log.Infof("Uploading from %s to %s ...", srcDir, s3url)
+	if deleteExtra {
+		return runAWSS3Sync(srcDir, s3url, "--delete")
+	}
+	return runAWSS3Sync(srcDir, s3url)
+}
+
+// runAWSS3Sync runs "aws s3 sync <src> <dst> [extraArgs...]", streaming the
+// CLI's stdout and stderr to this process. A missing aws executable is
+// reported on its own, so that it is not mistaken for an authentication
+// problem; any other failure keeps the authentication hint.
+func runAWSS3Sync(src string, dst string, extraArgs ...string) error {
+	awsPath, err := exec.LookPath("aws")
+	if err != nil {
+		return fmt.Errorf("aws CLI not found: %w\n\nInstall the AWS CLI and make sure it is on your PATH", err)
+	}
+
+	args := append([]string{"s3", "sync", src, dst}, extraArgs...)
+	cmd := exec.Command(awsPath, args...)
+	cmd.Stdout = os.Stdout
+	cmd.Stderr = os.Stderr
+
+	if err := cmd.Run(); err != nil {
+		return fmt.Errorf("aws s3 sync failed: %w\n\nTo authenticate, run:\n  aws sso login\n\nOr configure AWS credentials with:\n  aws configure sso", err)
+	}
+
+	return nil
+}
--- a/uv.lock
+++ b/uv.lock
@@ -4711,7 +4711,7 @@ requires-dist = [
    { name = "numpy", marker = "extra == 'model-server'", specifier = "==2.4.1" },
    { name = "oauthlib", marker = "extra == 'backend'", specifier = "==3.2.2" },
    { name = "office365-rest-python-client", marker = "extra == 'backend'", specifier = "==2.5.9" },
-    { name = "onyx-devtools", marker = "extra == 'dev'", specifier = "==0.5.3" },
+    { name = "onyx-devtools", marker = "extra == 'dev'", specifier = "==0.5.7" },
    { name = "openai", specifier = "==2.14.0" },
    { name = "openapi-generator-cli", marker = "extra == 'dev'", specifier = "==7.17.0" },
    { name = "openinference-instrumentation", marker = "extra == 'backend'", specifier = "==0.1.42" },
@@ -4816,20 +4816,20 @@ requires-dist = [{ name = "onyx", extras = ["backend", "dev", "ee"], editable =

 [[package]]
 name = "onyx-devtools"
-version = "0.5.3"
+version = "0.5.7"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "fastapi" },
    { name = "openapi-generator-cli" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/85/39/87e770afccf123cd72ca8c58178bc08a9b04cb6198f265213012a6a71f21/onyx_devtools-0.5.3-py3-none-any.whl", hash = "sha256:6b61dff779a5839032fb282f8db62aa3d640c09fa0d7d2ed7f8a23fd38fa84df", size = 2894984, upload-time = "2026-02-11T23:05:50.739Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/c5/9a7516398af4183f3247a668b710da344c002586e9be668cb690b8566d8a/onyx_devtools-0.5.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:268c57ffb08322bd9671d1b8444199607bc1eaf7e2c25300de98ba272c716c3e", size = 2913582, upload-time = "2026-02-11T23:05:33.582Z" },
-    { url = "https://files.pythonhosted.org/packages/70/58/86895464d02e2ae0a22a0bcc48cfd5e7cb647ee117a1a0620850f03e21e5/onyx_devtools-0.5.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e440d14ecad26ea3c85ae00a95cc1731214de6c6c71b90b08ab3608d99ecdd58", size = 2717143, upload-time = "2026-02-11T23:05:32.673Z" },
-    { url = "https://files.pythonhosted.org/packages/10/95/c8ea6a27afde2c29b108a0988aa4f44963d7124bfe04322217c7003129b9/onyx_devtools-0.5.3-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:49136baf0427aa6a5dde57457e4c963d86be4cd59bb6d02837609dd470de6a6b", size = 2625948, upload-time = "2026-02-11T23:05:48.147Z" },
-    { url = "https://files.pythonhosted.org/packages/85/cc/aabfb4599ce42aac88bdb1082696e3dde0a34a7739df61035e77e01cbca3/onyx_devtools-0.5.3-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2c327d258943f80b9860268fa69fde3a6f707d2aaed9362385cd6acd255d11cc", size = 2895001, upload-time = "2026-02-11T23:05:50.509Z" },
-    { url = "https://files.pythonhosted.org/packages/17/3c/d3af3a49464d15ebb0a8cf371169158bb99a14be859ac7468c73ecf055cd/onyx_devtools-0.5.3-py3-none-win_amd64.whl", hash = "sha256:fa5e7b779ede887f7c2e2da2442048cc9b626a9d8007b34c3b617e40dfd8d5bd", size = 2977738, upload-time = "2026-02-11T23:05:30.592Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/27/8844e7c4ee06453b57be55644e572206b7b79e3685351f80afd8b7056327/onyx_devtools-0.5.3-py3-none-win_arm64.whl", hash = "sha256:2542fc3b1ee27d0695aef8e17819879a0eeaed10e2855e31145cbfa6267fcf6c", size = 2688564, upload-time = "2026-02-11T23:05:34.968Z" },
+    { url = "https://files.pythonhosted.org/packages/23/7d/a9135044e220b6ef6a0752be826c6c758a1fc8b59d545306938aa43e8976/onyx_devtools-0.5.7-py3-none-any.whl", hash = "sha256:47c5cdefb525523a9860ed134366f30a0d2ad30e055b2350c1da577d1059654b", size = 3769892, upload-time = "2026-02-12T20:06:02.937Z" },
+    { url = "https://files.pythonhosted.org/packages/e7/63/26dbfc35f62d0617e4c46b508e106f155990c37c851d8eb44bc331b2e933/onyx_devtools-0.5.7-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:c7ce707d9e27733e7300b2be3686e3fd76d62b9b1c20c9bd02dac707f4eac1d5", size = 3815888, upload-time = "2026-02-12T20:06:07.024Z" },
+    { url = "https://files.pythonhosted.org/packages/82/55/4498e74af5f115355127c966e326f9ae430460170d1f1d50c2f150f53a00/onyx_devtools-0.5.7-py3-none-macosx_11_0_arm64.whl", hash = "sha256:0d02a0c1c48a33bd85b251a2288d94a00effc2139b6e2b7018362cba8cf717e1", size = 3562190, upload-time = "2026-02-12T20:06:00.998Z" },
+    { url = "https://files.pythonhosted.org/packages/18/70/fc1490420bd690bc6b3ebc3a6da68347636cb1a31afa07801fba9f77def4/onyx_devtools-0.5.7-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:fe3ae04f06e1b421f1297e70d2c14013d85941afa85210bfd96db30abb391989", size = 3425118, upload-time = "2026-02-12T20:05:59.192Z" },
+    { url = "https://files.pythonhosted.org/packages/b3/46/76b44234d7cd4cf5c73b897f6dd1864c867c63cc871fd73f8901592c9248/onyx_devtools-0.5.7-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:ecf5f525c773d8db0b58bef3a02b00df31e7a9ade16213b4220eb2baffffd8e2", size = 3769913, upload-time = "2026-02-12T20:06:03.405Z" },
+    { url = "https://files.pythonhosted.org/packages/7b/e5/9ef8d3265dfc82dbd9d27653d981ccb67c779882807ef1bd7fcecbe1c68a/onyx_devtools-0.5.7-py3-none-win_amd64.whl", hash = "sha256:f84368da19311acc246d511c5b2874b14ca1c9e53675198ba6ccabefbe57d648", size = 3863558, upload-time = "2026-02-12T20:06:01.995Z" },
+    { url = "https://files.pythonhosted.org/packages/30/ad/f23ace3e049017e9cfcc06302005fd476b44357b6f4ade521febd8393599/onyx_devtools-0.5.7-py3-none-win_arm64.whl", hash = "sha256:edb1dcd3901f7532114d40fbc903ba60c528bdad397425c174dc5841b5b8de43", size = 3486869, upload-time = "2026-02-12T20:06:03.719Z" },
 ]

 [[package]]
--- a/web/.gitignore
+++ b/web/.gitignore
@@ -41,6 +41,7 @@ next-env.d.ts
 /user_auth.json
 /build-archive.log
 /test-results
+/output/

 # generated clients ... in particular, the API to the Onyx backend itself!
 /src/lib/generated
--- a/web/README.md
+++ b/web/README.md
@@ -61,27 +61,26 @@ Bring up the entire application.

 0. Install playwright dependencies

-```cd web
+```bash
 npx playwright install
 ```

 1. Run playwright

-```
-cd web
+```bash
 npx playwright test
 ```

 To run a single test:

-```
+```bash
 npx playwright test landing-page.spec.ts
 ```

 If running locally, interactive options can help you see exactly what is happening in
 the test.

-```
+```bash
 npx playwright test --ui
 npx playwright test --headed
 ```
@@ -90,6 +89,17 @@ npx playwright test --headed

 By default, playwright.config.ts is configured to output the results to:

+```text
+web/output/playwright/
 ```
-web/test-results
+
+3. Visual regression screenshots
+
+Screenshots are captured automatically during test runs and saved to `web/output/screenshots/`.
+To compare screenshots across CI runs, use:
+
+```bash
+ods screenshot-diff compare --project admin
 ```
+
+For more information, see [tools/ods/README.md](https://github.com/onyx-dot-app/onyx/blob/main/tools/ods/README.md#screenshot-diff---visual-regression-testing).
--- a/web/playwright.config.ts
+++ b/web/playwright.config.ts
@@ -8,6 +8,12 @@ export default defineConfig({
  timeout: 100000, // 100 seconds timeout
  expect: {
    timeout: 15000, // 15 seconds timeout for all assertions to reduce flakiness
+    toHaveScreenshot: {
+      // Allow up to 1% of pixels to differ (accounts for anti-aliasing, subpixel rendering)
+      maxDiffPixelRatio: 0.01,
+      // Per-pixel threshold (0-1): acceptable perceived color difference (YIQ color
+      // space) between the same pixel in both images before it counts as changed
+      threshold: 0.2,
+    },
  },
  retries: process.env.CI ? 2 : 0, // Retry failed tests 2 times in CI, 0 locally

@@ -20,7 +26,7 @@ export default defineConfig({
  reporter: [["list"]],
  // Only run Playwright tests from tests/e2e directory (ignore Jest tests in src/)
  testMatch: /.*\/tests\/e2e\/.*\.spec\.ts/,
-  outputDir: "test-results",
+  outputDir: "output/playwright",
  use: {
    // Base URL for the application, can be overridden via BASE_URL environment variable
    baseURL: process.env.BASE_URL || "http://localhost:3000",
--- a/web/tests/e2e/admin_pages.spec.ts
+++ b/web/tests/e2e/admin_pages.spec.ts
@@ -0,0 +1,199 @@
+import { test, expect } from "@playwright/test";
+import type { Page } from "@playwright/test";
+import { expectScreenshot } from "./utils/visualRegression";
+
+// Load browser storage state from admin_auth.json for every test in this file
+// (presumably a saved admin login session — confirm against the auth setup).
+test.use({ storageState: "admin_auth.json" });
+// Allow the tests declared in this file to run in parallel with each other.
+test.describe.configure({ mode: "parallel" });
+
+/** One admin page to visit, verify and screenshot. */
+interface AdminPageSnapshot {
+  /** Human-readable label; used in the test title (`Admin - <name>`). */
+  name: string;
+  /** Route relative to `/admin/`; also used to build the screenshot name. */
+  path: string;
+  /** Text expected in the element labelled `admin-page-title`. */
+  pageTitle: string;
+  /** Optional extra checks, forwarded to `verifyAdminPageNavigation`. */
+  options?: {
+    /** Expected text of the first `p.text-sm` element on the page. */
+    paragraphText?: string | RegExp;
+    /** Accessible name of a button that must be present exactly once. */
+    buttonName?: string;
+    /** NOTE(review): not read by `verifyAdminPageNavigation` in this file, so it currently has no effect. */
+    subHeaderText?: string;
+  };
+}
+
+// Admin pages covered by this spec. Each entry produces one test per theme:
+// the page at /admin/<path> is opened, its title (and any optional checks in
+// `options`) verified, and a screenshot captured.
+const ADMIN_PAGES: AdminPageSnapshot[] = [
+  {
+    name: "Document Management - Explorer",
+    path: "documents/explorer",
+    pageTitle: "Document Explorer",
+  },
+  {
+    name: "Connectors - Add Connector",
+    path: "add-connector",
+    pageTitle: "Add Connector",
+  },
+  {
+    name: "Custom Assistants - Assistants",
+    path: "assistants",
+    pageTitle: "Assistants",
+    options: {
+      paragraphText:
+        "Assistants are a way to build custom search/question-answering experiences for different use cases.",
+    },
+  },
+  {
+    name: "Configuration - Document Processing",
+    path: "configuration/document-processing",
+    pageTitle: "Document Processing",
+  },
+  {
+    name: "Document Management - Document Sets",
+    path: "documents/sets",
+    pageTitle: "Document Sets",
+    options: {
+      paragraphText:
+        "Document Sets allow you to group logically connected documents into a single bundle. These can then be used as a filter when performing searches to control the scope of information Onyx searches over.",
+    },
+  },
+  {
+    name: "Custom Assistants - Slack Bots",
+    path: "bots",
+    pageTitle: "Slack Bots",
+    options: {
+      paragraphText:
+        "Setup Slack bots that connect to Onyx. Once setup, you will be able to ask questions to Onyx directly from Slack. Additionally, you can:",
+    },
+  },
+  {
+    name: "Custom Assistants - Standard Answers",
+    path: "standard-answer",
+    pageTitle: "Standard Answers",
+  },
+  {
+    name: "Performance - Usage Statistics",
+    path: "performance/usage",
+    pageTitle: "Usage Statistics",
+  },
+  {
+    name: "Document Management - Feedback",
+    path: "documents/feedback",
+    pageTitle: "Document Feedback",
+  },
+  {
+    name: "Configuration - LLM",
+    path: "configuration/llm",
+    pageTitle: "LLM Setup",
+  },
+  {
+    name: "Connectors - Existing Connectors",
+    path: "indexing/status",
+    pageTitle: "Existing Connectors",
+  },
+  {
+    name: "User Management - Groups",
+    path: "groups",
+    pageTitle: "Manage User Groups",
+  },
+  {
+    name: "Appearance & Theming",
+    path: "theme",
+    pageTitle: "Appearance & Theming",
+  },
+  {
+    name: "Configuration - Search Settings",
+    path: "configuration/search",
+    pageTitle: "Search Settings",
+  },
+  {
+    name: "Custom Assistants - MCP Actions",
+    path: "actions/mcp",
+    pageTitle: "MCP Actions",
+  },
+  {
+    name: "Custom Assistants - OpenAPI Actions",
+    path: "actions/open-api",
+    pageTitle: "OpenAPI Actions",
+  },
+  {
+    name: "User Management - Token Rate Limits",
+    path: "token-rate-limits",
+    pageTitle: "Token Rate Limits",
+    options: {
+      paragraphText:
+        "Token rate limits enable you control how many tokens can be spent in a given time period. With token rate limits, you can:",
+      buttonName: "Create a Token Rate Limit",
+    },
+  },
+];
+
+/**
+ * Open `/admin/<path>` and assert that the expected admin page rendered.
+ *
+ * The page title is always checked (10 s timeout). When provided,
+ * `options.paragraphText` is matched against the first `p.text-sm` element and
+ * `options.buttonName` must identify exactly one button.
+ * `options.subHeaderText` is accepted but not checked here.
+ *
+ * If the title check fails, the failure and the full page HTML are logged
+ * before the original error is rethrown.
+ */
+async function verifyAdminPageNavigation(
+  page: Page,
+  path: string,
+  pageTitle: string,
+  options?: {
+    paragraphText?: string | RegExp;
+    buttonName?: string;
+    subHeaderText?: string;
+  }
+) {
+  await page.goto(`/admin/${path}`);
+
+  const titleLocator = page.locator('[aria-label="admin-page-title"]');
+  try {
+    await expect(titleLocator).toHaveText(pageTitle, { timeout: 10000 });
+  } catch (error) {
+    console.error(
+      `Failed to find admin-page title with text "${pageTitle}" for path "${path}"`
+    );
+    // NOTE: This is a temporary measure for debugging the issue
+    console.error(await page.content());
+    throw error;
+  }
+
+  const { paragraphText, buttonName } = options ?? {};
+
+  if (paragraphText) {
+    const firstParagraph = page.locator("p.text-sm").nth(0);
+    await expect(firstParagraph).toHaveText(paragraphText);
+  }
+
+  if (buttonName) {
+    const matchingButtons = page.getByRole("button", { name: buttonName });
+    await expect(matchingButtons).toHaveCount(1);
+  }
+}
+
+// Themes to cover; every admin page is verified and captured once per theme.
+const THEMES = ["light", "dark"] as const;
+
+THEMES.forEach((theme) => {
+  test.describe(`Admin pages (${theme} mode)`, () => {
+    // Seed localStorage with the theme via an init script, so the value is
+    // already present on each navigation and next-themes picks it up on
+    // first render.
+    test.beforeEach(async ({ page }) => {
+      await page.addInitScript((t: string) => {
+        localStorage.setItem("theme", t);
+      }, theme);
+    });
+
+    ADMIN_PAGES.forEach(({ name, path, pageTitle, options }) => {
+      test(`Admin - ${name}`, async ({ page }) => {
+        await verifyAdminPageNavigation(page, path, pageTitle, options);
+
+        // Let outstanding network requests settle before capturing.
+        await page.waitForLoadState("networkidle");
+
+        // Flatten the route into a file-name-friendly token and prefix it
+        // with the theme so light and dark baselines stay separate.
+        const flattenedPath = path.split("/").join("-");
+        await expectScreenshot(page, {
+          name: `admin-${theme}-${flattenedPath}`,
+        });
+      });
+    });
+  });
+});
--- a/web/tests/e2e/utils/visualRegression.ts
+++ b/web/tests/e2e/utils/visualRegression.ts
@@ -0,0 +1,125 @@
+import type { Page, PageScreenshotOptions } from "@playwright/test";
+import { expect } from "@playwright/test";
+
+/**
+ * Whether visual regression assertions are enabled.
+ *
+ * When `VISUAL_REGRESSION=true` is set, `expectScreenshot()` calls
+ * `toHaveScreenshot()` which will fail if the screenshot differs from the
+ * stored baseline.
+ *
+ * When disabled (the default), screenshots are still captured and saved but
+ * mismatches do NOT fail the test — this lets CI collect screenshots for later
+ * review without gating on them.
+ *
+ * The value is compared case-insensitively (`TRUE`, `True`, ... also enable
+ * it); any other value, or an unset variable, leaves assertions disabled.
+ * NOTE: in capture-only mode `expectScreenshot()` writes a file only when it
+ * is given a `name`.
+ */
+const VISUAL_REGRESSION_ENABLED =
+  process.env.VISUAL_REGRESSION?.toLowerCase() === "true";
+
+/**
+ * Default selectors to mask across all screenshots so that dynamic content
+ * (timestamps, avatars, etc.) doesn't cause spurious diffs.
+ *
+ * Currently empty, so only the selectors passed per call via `mask` apply.
+ */
+const DEFAULT_MASK_SELECTORS: string[] = [
+  // Add selectors for dynamic content that should be masked, e.g.:
+  // '[data-testid="timestamp"]',
+  // '[data-testid="user-avatar"]',
+];
+
+/** Options accepted by `expectScreenshot()`. */
+interface ScreenshotOptions {
+  /**
+   * Name for the screenshot file (without extension; `.png` is appended).
+   * If omitted in assert mode, Playwright auto-generates one from the test
+   * title. If omitted in capture-only mode, no file is written.
+   */
+  name?: string;
+
+  /**
+   * Additional CSS selectors to mask (on top of the defaults).
+   * Masked areas are replaced with a pink box so they don't cause diffs.
+   */
+  mask?: string[];
+
+  /**
+   * If true, capture the full scrollable page instead of just the viewport.
+   * Defaults to false.
+   */
+  fullPage?: boolean;
+
+  /**
+   * Override the max diff pixel ratio for this specific screenshot.
+   * Only used in assert mode (`VISUAL_REGRESSION=true`).
+   */
+  maxDiffPixelRatio?: number;
+
+  /**
+   * Override the per-pixel color-difference threshold (0-1, perceived
+   * difference in the YIQ color space) for this specific screenshot.
+   * Only used in assert mode (`VISUAL_REGRESSION=true`).
+   */
+  threshold?: number;
+
+  /**
+   * Additional Playwright screenshot options.
+   * Only applied in capture-only mode, where they are spread last and
+   * therefore take precedence over the computed `path`, `fullPage` and `mask`.
+   */
+  screenshotOptions?: PageScreenshotOptions;
+}
+
+/**
+ * Take a screenshot and optionally assert it matches the stored baseline.
+ *
+ * Behavior depends on the `VISUAL_REGRESSION` environment variable:
+ * - `VISUAL_REGRESSION=true`  → assert via `toHaveScreenshot()` (fails on diff)
+ * - Otherwise                 → capture and save the screenshot for review only
+ *
+ * Capture-only mode details:
+ * - The file is written to `output/screenshots/<name>.png`, relative to the
+ *   working directory. Without a `name` nothing is written to disk.
+ * - Animations are disabled by default, as `toHaveScreenshot()` does, so that
+ *   captures taken for later diffing are not affected by in-flight animations.
+ * - `options.screenshotOptions` is spread last and can override any of the
+ *   above. It is ignored in assert mode.
+ *
+ * `maxDiffPixelRatio` and `threshold` only apply in assert mode.
+ *
+ * Usage:
+ * ```ts
+ * import { expectScreenshot } from "@tests/e2e/utils/visualRegression";
+ *
+ * test("admin page looks right", async ({ page }) => {
+ *   await page.goto("/admin/settings");
+ *   await expectScreenshot(page, { name: "admin-settings" });
+ * });
+ * ```
+ *
+ * @param page - Playwright page to capture.
+ * @param options - Screenshot name, masks and per-call overrides.
+ */
+export async function expectScreenshot(
+  page: Page,
+  options: ScreenshotOptions = {}
+): Promise<void> {
+  const {
+    name,
+    mask = [],
+    fullPage = false,
+    maxDiffPixelRatio,
+    threshold,
+    screenshotOptions,
+  } = options;
+
+  // Combine default masks with per-call masks
+  const maskLocators = [...DEFAULT_MASK_SELECTORS, ...mask].map((selector) =>
+    page.locator(selector)
+  );
+  const maskOption = maskLocators.length > 0 ? maskLocators : undefined;
+
+  // Accept names with or without the extension so that "foo.png" does not
+  // turn into "foo.png.png".
+  const fileName = name
+    ? name.endsWith(".png")
+      ? name
+      : `${name}.png`
+    : undefined;
+
+  if (VISUAL_REGRESSION_ENABLED) {
+    // Assert mode — fail the test if the screenshot differs from baseline
+    const assertOpts = {
+      fullPage,
+      mask: maskOption,
+      ...(maxDiffPixelRatio !== undefined && { maxDiffPixelRatio }),
+      ...(threshold !== undefined && { threshold }),
+    };
+
+    if (fileName) {
+      // Playwright accepts the name as an array of path segments
+      await expect(page).toHaveScreenshot([fileName], assertOpts);
+    } else {
+      await expect(page).toHaveScreenshot(assertOpts);
+    }
+    return;
+  }
+
+  // Capture-only mode — save the screenshot without asserting
+  await page.screenshot({
+    path: fileName ? `output/screenshots/${fileName}` : undefined,
+    fullPage,
+    mask: maskOption,
+    // page.screenshot() leaves animations running unless told otherwise
+    animations: "disabled",
+    ...screenshotOptions,
+  });
+}