chore(playwright): replace chromatic with builtin screenshots

parallel and wait for networkidle what did claude cook use prebuilt ods no assume-role onyx-playwright-artifacts grief rm baseline upload on non-main XXX Revert "XXX" This reverts commit f810e37c38. dark coverage XXX still griefing ux ods 0.5.5 fix slider zoom nit Revert "XXX" This reverts commit 6782bd17a6. upgrade zizmor nit rename screenshot-diff claude my goat
2026-04-08 08:22:42 +00:00 · 2026-02-12 12:02:05 -08:00
17 changed files with 2242 additions and 22 deletions
--- a/.github/workflows/pr-playwright-tests.yml
+++ b/.github/workflows/pr-playwright-tests.yml
@@ -52,6 +52,9 @@ env:
  MCP_SERVER_PUBLIC_HOST: host.docker.internal
  MCP_SERVER_PUBLIC_URL: http://host.docker.internal:8004/mcp

+  # Visual regression S3 bucket (shared across all jobs)
+  PLAYWRIGHT_S3_BUCKET: onyx-playwright-artifacts
+
 jobs:
  build-web-image:
    runs-on:
@@ -239,6 +242,9 @@ jobs:
  playwright-tests:
    needs: [build-web-image, build-backend-image, build-model-server-image]
    name: Playwright Tests (${{ matrix.project }})
+    permissions:
+      id-token: write # Required for OIDC-based AWS credential exchange (S3 access)
+      contents: read
    runs-on:
      - runs-on
      - runner=8cpu-linux-arm64
@@ -428,8 +434,6 @@ jobs:
        env:
          PROJECT: ${{ matrix.project }}
        run: |
-          # Create test-results directory to ensure it exists for artifact upload
-          mkdir -p test-results
          npx playwright test --project ${PROJECT}

      - uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
@@ -437,9 +441,112 @@ jobs:
        with:
          # Includes test results and trace.zip files
          name: playwright-test-results-${{ matrix.project }}-${{ github.run_id }}
-          path: ./web/test-results/
+          path: ./web/output/playwright/
          retention-days: 30

+      - name: Upload screenshots
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        if: always()
+        with:
+          name: playwright-screenshots-${{ matrix.project }}-${{ github.run_id }}
+          path: ./web/output/screenshots/
+          retention-days: 30
+
+      # --- Visual Regression Diff ---
+      - name: Install the latest version of uv
+        if: always()
+        uses: astral-sh/setup-uv@61cb8a9741eeb8a550a1b8544337180c0fc8476b # ratchet:astral-sh/setup-uv@v7
+        with:
+          enable-cache: false
+          version: "0.9.9"
+
+      - name: Determine baseline revision
+        if: always()
+        id: baseline-rev
+        env:
+          EVENT_NAME: ${{ github.event_name }}
+          BASE_REF: ${{ github.event.pull_request.base.ref }}
+          REF_NAME: ${{ github.ref_name }}
+        run: |
+          # Ref names are handed to the script through env instead of being
+          # expanded into the script text, so a crafted branch or tag name
+          # cannot inject shell commands.
+          if [ "${EVENT_NAME}" = "pull_request" ]; then
+            # PRs compare against the base branch (e.g. main, release/2.5)
+            REV="${BASE_REF}"
+          else
+            # Push and tag builds compare against their own branch or tag name
+            REV="${REF_NAME}"
+          fi
+          echo "rev=${REV}" >> "$GITHUB_OUTPUT"
+
+      - name: Generate screenshot diff report
+        if: always()
+        env:
+          PROJECT: ${{ matrix.project }}
+          PLAYWRIGHT_S3_BUCKET: ${{ env.PLAYWRIGHT_S3_BUCKET }}
+          BASELINE_REV: ${{ steps.baseline-rev.outputs.rev }}
+        run: |
+          uv run --no-sync --with onyx-devtools ods screenshot-diff compare \
+            --project "${PROJECT}" \
+            --rev "${BASELINE_REV}"
+
+      - name: Upload visual diff report to S3
+        if: always()
+        env:
+          PROJECT: ${{ matrix.project }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          RUN_ID: ${{ github.run_id }}
+        run: |
+          SUMMARY_FILE="web/output/screenshot-diff/${PROJECT}/summary.json"
+          if [ ! -f "${SUMMARY_FILE}" ]; then
+            echo "No summary file found — skipping S3 upload."
+            exit 0
+          fi
+
+          HAS_DIFF=$(jq -r '.has_differences' "${SUMMARY_FILE}")
+          if [ "${HAS_DIFF}" != "true" ]; then
+            echo "No visual differences for ${PROJECT} — skipping S3 upload."
+            exit 0
+          fi
+
+          aws s3 sync "web/output/screenshot-diff/${PROJECT}/" \
+            "s3://${PLAYWRIGHT_S3_BUCKET}/reports/pr-${PR_NUMBER}/${RUN_ID}/${PROJECT}/"
+
+      - name: Upload visual diff summary
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        if: always()
+        with:
+          name: screenshot-diff-summary-${{ matrix.project }}
+          path: ./web/output/screenshot-diff/${{ matrix.project }}/summary.json
+          if-no-files-found: ignore
+          retention-days: 5
+
+      - name: Upload visual diff report artifact
+        uses: actions/upload-artifact@b7c566a772e6b6bfb58ed0dc250532a479d7789f
+        if: always()
+        with:
+          name: screenshot-diff-report-${{ matrix.project }}-${{ github.run_id }}
+          path: ./web/output/screenshot-diff/${{ matrix.project }}/
+          if-no-files-found: ignore
+          retention-days: 30
+
+      - name: Update S3 baselines
+        if: >-
+          success() && (
+            github.ref == 'refs/heads/main' ||
+            startsWith(github.ref, 'refs/heads/release/') ||
+            startsWith(github.ref, 'refs/tags/v')
+          )
+        env:
+          PROJECT: ${{ matrix.project }}
+          PLAYWRIGHT_S3_BUCKET: ${{ env.PLAYWRIGHT_S3_BUCKET }}
+          BASELINE_REV: ${{ steps.baseline-rev.outputs.rev }}
+        run: |
+          if [ -d "web/output/screenshots/" ] && [ "$(ls -A web/output/screenshots/)" ]; then
+            uv run --no-sync --with onyx-devtools ods screenshot-diff upload-baselines \
+              --project "${PROJECT}" \
+              --rev "${BASELINE_REV}" \
+              --delete
+          else
+            echo "No screenshots to upload for ${PROJECT} — skipping baseline update."
+          fi
+
      # save before stopping the containers so the logs can be captured
      - name: Save Docker logs
        if: success() || failure()
@@ -457,6 +564,95 @@ jobs:
          name: docker-logs-${{ matrix.project }}-${{ github.run_id }}
          path: ${{ github.workspace }}/docker-compose.log

+  # Post a single combined visual regression comment after all matrix jobs finish
+  visual-regression-comment:
+    needs: [playwright-tests]
+    if: always() && github.event_name == 'pull_request'
+    runs-on: ubuntu-slim
+    timeout-minutes: 5
+    permissions:
+      pull-requests: write
+    steps:
+      - name: Download visual diff summaries
+        uses: actions/download-artifact@95815c38cf2ff2164869cbab79da8d1f422bc89e # ratchet:actions/download-artifact@v4
+        with:
+          pattern: screenshot-diff-summary-*
+          path: summaries/
+
+      - name: Post combined PR comment
+        env:
+          GH_TOKEN: ${{ github.token }}
+          PR_NUMBER: ${{ github.event.pull_request.number }}
+          RUN_ID: ${{ github.run_id }}
+          REPO: ${{ github.repository }}
+          S3_BUCKET: ${{ env.PLAYWRIGHT_S3_BUCKET }}
+        run: |
+          # Hidden marker used to recognise the comment this job owns
+          MARKER="<!-- visual-regression-report -->"
+
+          # Build the markdown table from all summary files
+          TABLE_HEADER="| Project | Changed | Added | Removed | Unchanged | Report |"
+          TABLE_DIVIDER="|---------|---------|-------|---------|-----------|--------|"
+          TABLE_ROWS=""
+          HAS_ANY_SUMMARY=false
+
+          for SUMMARY_DIR in summaries/screenshot-diff-summary-*/; do
+            SUMMARY_FILE="${SUMMARY_DIR}summary.json"
+            if [ ! -f "${SUMMARY_FILE}" ]; then
+              continue
+            fi
+
+            HAS_ANY_SUMMARY=true
+            PROJECT=$(jq -r '.project' "${SUMMARY_FILE}")
+            CHANGED=$(jq -r '.changed' "${SUMMARY_FILE}")
+            ADDED=$(jq -r '.added' "${SUMMARY_FILE}")
+            REMOVED=$(jq -r '.removed' "${SUMMARY_FILE}")
+            UNCHANGED=$(jq -r '.unchanged' "${SUMMARY_FILE}")
+            TOTAL=$(jq -r '.total' "${SUMMARY_FILE}")
+            HAS_DIFF=$(jq -r '.has_differences' "${SUMMARY_FILE}")
+
+            if [ "${TOTAL}" = "0" ]; then
+              REPORT_LINK="_No screenshots_"
+            elif [ "${HAS_DIFF}" = "true" ]; then
+              REPORT_URL="https://${S3_BUCKET}.s3.us-east-2.amazonaws.com/reports/pr-${PR_NUMBER}/${RUN_ID}/${PROJECT}/index.html"
+              REPORT_LINK="[View Report](${REPORT_URL})"
+            else
+              REPORT_LINK="✅ No changes"
+            fi
+
+            TABLE_ROWS="${TABLE_ROWS}| \`${PROJECT}\` | ${CHANGED} | ${ADDED} | ${REMOVED} | ${UNCHANGED} | ${REPORT_LINK} |\n"
+          done
+
+          if [ "${HAS_ANY_SUMMARY}" = "false" ]; then
+            echo "No visual diff summaries found — skipping PR comment."
+            exit 0
+          fi
+
+          BODY=$(printf '%s\n' \
+            "${MARKER}" \
+            "### 🖼️ Visual Regression Report" \
+            "" \
+            "${TABLE_HEADER}" \
+            "${TABLE_DIVIDER}" \
+            "$(printf '%b' "${TABLE_ROWS}")")
+
+          # Upsert: find existing comment with the marker, or create a new one.
+          # --paginate walks every page of comments; without it only the first
+          # page is searched, so a PR with many comments would get a duplicate
+          # report comment instead of an update.
+          EXISTING_COMMENT_ID=$(gh api --paginate \
+            "repos/${REPO}/issues/${PR_NUMBER}/comments" \
+            --jq ".[] | select(.body | startswith(\"${MARKER}\")) | .id" \
+            2>/dev/null | head -1)
+
+          if [ -n "${EXISTING_COMMENT_ID}" ]; then
+            gh api \
+              --method PATCH \
+              "repos/${REPO}/issues/comments/${EXISTING_COMMENT_ID}" \
+              -f body="${BODY}"
+          else
+            gh api \
+              --method POST \
+              "repos/${REPO}/issues/${PR_NUMBER}/comments" \
+              -f body="${BODY}"
+          fi
+
  playwright-required:
    # NOTE: Github-hosted runners have about 20s faster queue times and are preferred here.
    runs-on: ubuntu-slim
--- a/backend/requirements/dev.txt
+++ b/backend/requirements/dev.txt
@@ -317,7 +317,7 @@ oauthlib==3.2.2
    # via
    #   kubernetes
    #   requests-oauthlib
-onyx-devtools==0.5.3
+onyx-devtools==0.5.6
    # via onyx
 openai==2.14.0
    # via
--- a/pyproject.toml
+++ b/pyproject.toml
@@ -144,7 +144,7 @@ dev = [
    "matplotlib==3.10.8",
    "mypy-extensions==1.0.0",
    "mypy==1.13.0",
-    "onyx-devtools==0.5.3",
+    "onyx-devtools==0.5.6",
    "openapi-generator-cli==7.17.0",
    "pandas-stubs~=2.3.3",
    "pre-commit==3.2.2",
--- a/tools/ods/README.md
+++ b/tools/ods/README.md
@@ -29,6 +29,10 @@ Some commands require external tools to be installed and configured:
  - Install from [cli.github.com](https://cli.github.com/)
  - Authenticate with `gh auth login`

+- **AWS CLI** - Required for `screenshot-diff` commands (S3 baseline sync)
+  - Install from [aws.amazon.com/cli](https://aws.amazon.com/cli/)
+  - Authenticate with `aws sso login` or `aws configure`
+
 ### Autocomplete

 `ods` provides autocomplete for `bash`, `fish`, `powershell` and `zsh` shells.
@@ -239,6 +243,100 @@ ods cherry-pick abc123 --release 2.5 --release 2.6
 ods cherry-pick abc123 def456 ghi789 --release 2.5
 ```

+### `screenshot-diff` - Visual Regression Testing
+
+Compare Playwright screenshots against baselines and generate visual diff reports.
+Baselines are stored per-project and per-revision in S3:
+
+```
+s3://<bucket>/baselines/<project>/<rev>/
+```
+
+This allows storing baselines for `main`, release branches (`release/2.5`), and
+version tags (`v2.0.0`) side-by-side. Any `/` in a revision name is replaced with
+`-` when building the S3 path (e.g. `release/2.5` → `release-2.5`).
+
+```shell
+ods screenshot-diff <subcommand>
+```
+
+**Subcommands:**
+
+- `compare` - Compare screenshots against baselines and generate a diff report
+- `upload-baselines` - Upload screenshots to S3 as new baselines
+
+The `--project` flag provides sensible defaults so you don't need to specify every path.
+When set, the following defaults are applied:
+
+| Flag | Default |
+|------|---------|
+| `--baseline` | `s3://onyx-playwright-artifacts/baselines/<project>/<rev>/` |
+| `--current` | `web/output/screenshots/` |
+| `--output` | `web/output/screenshot-diff/<project>/index.html` |
+| `--rev` | `main` |
+
+The S3 bucket defaults to `onyx-playwright-artifacts` and can be overridden with the
+`PLAYWRIGHT_S3_BUCKET` environment variable.
+
+**`compare` Flags:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--project` | | Project name (e.g. `admin`); sets sensible defaults |
+| `--rev` | `main` | Revision baseline to compare against |
+| `--from-rev` | | Source (older) revision for cross-revision comparison |
+| `--to-rev` | | Target (newer) revision for cross-revision comparison |
+| `--baseline` | | Baseline directory or S3 URL (`s3://...`) |
+| `--current` | | Current screenshots directory or S3 URL (`s3://...`) |
+| `--output` | `screenshot-diff/index.html` | Output path for the HTML report |
+| `--threshold` | `0.2` | Per-channel pixel difference threshold (0.0–1.0) |
+| `--max-diff-ratio` | `0.01` | Max diff pixel ratio before marking as changed (informational; `compare` does not currently apply it) |
+
+**`upload-baselines` Flags:**
+
+| Flag | Default | Description |
+|------|---------|-------------|
+| `--project` | | Project name (e.g. `admin`); sets sensible defaults |
+| `--rev` | `main` | Revision to store the baseline under |
+| `--dir` | | Local directory containing screenshots to upload |
+| `--dest` | | S3 destination URL (`s3://...`) |
+| `--delete` | `false` | Delete S3 files not present locally |
+
+**Examples:**
+
+```shell
+# Compare local screenshots against the main baseline (default)
+ods screenshot-diff compare --project admin
+
+# Compare against a release branch baseline
+ods screenshot-diff compare --project admin --rev release/2.5
+
+# Compare two revisions directly (both sides fetched from S3)
+ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+# Compare with explicit paths
+ods screenshot-diff compare \
+  --baseline ./baselines \
+  --current ./web/output/screenshots/ \
+  --output ./report/index.html
+
+# Upload baselines for main (default)
+ods screenshot-diff upload-baselines --project admin
+
+# Upload baselines for a release branch
+ods screenshot-diff upload-baselines --project admin --rev release/2.5
+
+# Upload baselines for a version tag
+ods screenshot-diff upload-baselines --project admin --rev v2.0.0
+
+# Upload with delete (remove old baselines not in current set)
+ods screenshot-diff upload-baselines --project admin --delete
+```
+
+The `compare` subcommand writes a `summary.json` alongside the report with aggregate
+counts (changed, added, removed, unchanged). The HTML report is only generated when
+visual differences are detected.
+
 ### Testing Changes Locally (Dry Run)

 Both `run-ci` and `cherry-pick` support `--dry-run` to test without making remote changes:
--- a/tools/ods/cmd/root.go
+++ b/tools/ods/cmd/root.go
@@ -49,6 +49,7 @@ func NewRootCommand() *cobra.Command {
 	cmd.AddCommand(NewLogsCommand())
 	cmd.AddCommand(NewPullCommand())
 	cmd.AddCommand(NewRunCICommand())
+	cmd.AddCommand(NewScreenshotDiffCommand())

 	return cmd
 }
--- a/tools/ods/cmd/screenshot_diff.go
+++ b/tools/ods/cmd/screenshot_diff.go
@@ -0,0 +1,500 @@
+package cmd
+
+import (
+	"fmt"
+	"os"
+	"path/filepath"
+	"strings"
+
+	log "github.com/sirupsen/logrus"
+	"github.com/spf13/cobra"
+
+	"github.com/onyx-dot-app/onyx/tools/ods/internal/imgdiff"
+	"github.com/onyx-dot-app/onyx/tools/ods/internal/s3"
+)
+
+// DefaultS3Bucket names the S3 bucket that holds Playwright visual regression
+// artifacts when no override is configured.
+const DefaultS3Bucket = "onyx-playwright-artifacts"
+
+// DefaultScreenshotDir is where captured screenshots are expected on disk,
+// expressed relative to the repository root.
+const DefaultScreenshotDir = "web/output/screenshots"
+
+// DefaultOutputDir is the base directory for screenshot diff output,
+// expressed relative to the repository root.
+const DefaultOutputDir = "web/output/screenshot-diff"
+
+// DefaultRev is the revision assumed when --rev is not supplied.
+const DefaultRev = "main"
+
+// getS3Bucket resolves which S3 bucket to use: a non-empty
+// PLAYWRIGHT_S3_BUCKET environment variable wins, otherwise DefaultS3Bucket.
+func getS3Bucket() string {
+	bucket := os.Getenv("PLAYWRIGHT_S3_BUCKET")
+	if bucket == "" {
+		bucket = DefaultS3Bucket
+	}
+	return bucket
+}
+
+// sanitizeRev turns a git ref into a single S3 path segment by swapping every
+// "/" for "-" (for example "release/2.5" becomes "release-2.5").
+func sanitizeRev(rev string) string {
+	segments := strings.Split(rev, "/")
+	return strings.Join(segments, "-")
+}
+
+// ScreenshotDiffCompareOptions holds options for the compare subcommand.
+// Each field is bound to the command-line flag of the corresponding name.
+type ScreenshotDiffCompareOptions struct {
+	// --project: project name (e.g. admin); enables defaults for Baseline, Current and Output
+	Project      string
+	Rev          string // revision whose baseline to compare against (default: "main")
+	FromRev      string // cross-revision mode: source (older) revision
+	ToRev        string // cross-revision mode: target (newer) revision
+	// --baseline: baseline directory or S3 URL (s3://...)
+	Baseline     string
+	// --current: current screenshots directory or S3 URL (s3://...)
+	Current      string
+	// --output: output path for the HTML report
+	Output       string
+	// --threshold: per-channel pixel difference threshold (0.0-1.0)
+	Threshold    float64
+	// --max-diff-ratio: max diff pixel ratio; the flag help marks it as informational
+	MaxDiffRatio float64
+}
+
+// ScreenshotDiffUploadOptions holds options for the upload-baselines subcommand.
+// Each field is bound to the command-line flag of the corresponding name.
+type ScreenshotDiffUploadOptions struct {
+	// --project: project name (e.g. admin); enables defaults for Dir and Dest
+	Project string
+	Rev     string // revision to store the baseline under (default: "main")
+	// --dir: local directory containing screenshots to upload
+	Dir     string
+	// --dest: S3 destination URL (s3://...)
+	Dest    string
+	// --delete: delete S3 files not present locally
+	Delete  bool
+}
+
+// NewScreenshotDiffCommand builds the "screenshot-diff" parent command and
+// attaches its "compare" and "upload-baselines" subcommands.
+func NewScreenshotDiffCommand() *cobra.Command {
+	const longHelp = `Compare Playwright screenshots against baselines and generate visual diff reports.
+
+Supports comparing local directories and downloading baselines from S3.
+The generated HTML report is self-contained (images base64-inlined) and can
+be opened locally or hosted on S3.
+
+Baselines are stored per-project and per-revision in S3:
+
+  s3://<bucket>/baselines/<project>/<rev>/
+
+The --project flag provides sensible defaults so you don't need to specify
+every path. For example:
+
+  # Compare local screenshots against the "main" baseline (default)
+  ods screenshot-diff compare --project admin
+
+  # Compare against a release branch baseline
+  ods screenshot-diff compare --project admin --rev release/2.5
+
+  # Compare two revisions directly (no local screenshots needed)
+  ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+  # Upload new baselines for the "admin" project on main
+  ods screenshot-diff upload-baselines --project admin
+
+  # Upload baselines for a release branch
+  ods screenshot-diff upload-baselines --project admin --rev release/2.5
+
+You can override any default with explicit flags:
+
+  ods screenshot-diff compare --baseline ./my-baselines --current ./my-screenshots`
+
+	root := &cobra.Command{
+		Use:   "screenshot-diff",
+		Short: "Visual regression testing for Playwright screenshots",
+		Long:  longHelp,
+		// Invoked without a subcommand: just show the help text.
+		Run: func(c *cobra.Command, _ []string) {
+			_ = c.Help()
+		},
+	}
+	root.AddCommand(newCompareCommand(), newUploadBaselinesCommand())
+
+	return root
+}
+
+// newCompareCommand builds the "compare" subcommand: it binds the flags to a
+// fresh ScreenshotDiffCompareOptions and hands that to runCompare when run.
+func newCompareCommand() *cobra.Command {
+	const longHelp = `Compare current screenshots against baseline screenshots and produce
+a self-contained HTML visual diff report with a JSON summary.
+
+Baselines are stored per-revision in S3:
+
+  s3://<bucket>/baselines/<project>/<rev>/
+
+When --project is specified, the following defaults are applied:
+  --baseline  → s3://<bucket>/baselines/<project>/<rev>/
+  --current   → web/output/screenshots/
+  --output    → web/output/screenshot-diff/<project>/index.html
+  --rev       → main
+
+The bucket defaults to "onyx-playwright-artifacts" and can be overridden
+with the PLAYWRIGHT_S3_BUCKET environment variable.
+
+A summary.json file is always written next to the HTML report. If there
+are no visual differences, the HTML report is skipped.
+
+CROSS-REVISION MODE:
+
+Use --from-rev and --to-rev to compare two stored revisions directly.
+Both sides are downloaded from S3 — no local screenshots are needed.
+
+  ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+Examples:
+
+  # Compare local screenshots against main (default)
+  ods screenshot-diff compare --project admin
+
+  # Compare against a specific revision
+  ods screenshot-diff compare --project admin --rev release/2.5
+
+  # Compare two revisions
+  ods screenshot-diff compare --project admin --from-rev v1.0.0 --to-rev v2.0.0
+
+  # Override specific flags
+  ods screenshot-diff compare --project admin --current ./custom-dir/
+
+  # Fully manual (no project flag)
+  ods screenshot-diff compare \
+    --baseline s3://my-bucket/baselines/admin/main/ \
+    --current ./web/output/screenshots/ \
+    --output ./web/output/screenshot-diff/admin/index.html`
+
+	opts := new(ScreenshotDiffCompareOptions)
+	compareCmd := &cobra.Command{
+		Use:   "compare",
+		Short: "Compare screenshots against baselines and generate a diff report",
+		Long:  longHelp,
+		Run: func(_ *cobra.Command, _ []string) {
+			runCompare(opts)
+		},
+	}
+
+	// Flag registration order is kept as-is.
+	flags := compareCmd.Flags()
+	flags.StringVar(&opts.Project, "project", "", "Project name (e.g. admin); sets sensible defaults for baseline, current, and output")
+	flags.StringVar(&opts.Rev, "rev", "", "Revision to compare against (default: main). Ignored when --from-rev/--to-rev are set")
+	flags.StringVar(&opts.FromRev, "from-rev", "", "Source (older) revision for cross-revision comparison")
+	flags.StringVar(&opts.ToRev, "to-rev", "", "Target (newer) revision for cross-revision comparison")
+	flags.StringVar(&opts.Baseline, "baseline", "", "Baseline directory or S3 URL (s3://...)")
+	flags.StringVar(&opts.Current, "current", "", "Current screenshots directory or S3 URL (s3://...)")
+	flags.StringVar(&opts.Output, "output", "", "Output path for the HTML report")
+	flags.Float64Var(&opts.Threshold, "threshold", 0.2, "Per-channel pixel difference threshold (0.0-1.0)")
+	flags.Float64Var(&opts.MaxDiffRatio, "max-diff-ratio", 0.01, "Max diff pixel ratio before marking as changed (informational)")
+
+	return compareCmd
+}
+
+// newUploadBaselinesCommand builds the "upload-baselines" subcommand: it binds
+// the flags to a fresh ScreenshotDiffUploadOptions and hands that to
+// runUploadBaselines when run.
+func newUploadBaselinesCommand() *cobra.Command {
+	const longHelp = `Upload a local directory of screenshots to S3 to serve as the new
+baseline for future comparisons. Typically run after tests pass on the
+main branch or a release branch.
+
+Baselines are stored per-revision in S3:
+
+  s3://<bucket>/baselines/<project>/<rev>/
+
+When --project is specified, the following defaults are applied:
+  --dir   → web/output/screenshots/
+  --dest  → s3://<bucket>/baselines/<project>/<rev>/
+  --rev   → main
+
+Examples:
+
+  # Upload baselines for main (default)
+  ods screenshot-diff upload-baselines --project admin
+
+  # Upload baselines for a release branch
+  ods screenshot-diff upload-baselines --project admin --rev release/2.5
+
+  # Upload baselines for a version tag
+  ods screenshot-diff upload-baselines --project admin --rev v2.0.0
+
+  # With delete (remove old baselines not in current set)
+  ods screenshot-diff upload-baselines --project admin --delete
+
+  # Fully manual
+  ods screenshot-diff upload-baselines \
+    --dir ./web/output/screenshots/ \
+    --dest s3://onyx-playwright-artifacts/baselines/admin/main/`
+
+	opts := new(ScreenshotDiffUploadOptions)
+	uploadCmd := &cobra.Command{
+		Use:   "upload-baselines",
+		Short: "Upload screenshots to S3 as new baselines",
+		Long:  longHelp,
+		Run: func(_ *cobra.Command, _ []string) {
+			runUploadBaselines(opts)
+		},
+	}
+
+	// Flag registration order is kept as-is.
+	flags := uploadCmd.Flags()
+	flags.StringVar(&opts.Project, "project", "", "Project name (e.g. admin); sets sensible defaults for dir and dest")
+	flags.StringVar(&opts.Rev, "rev", "", "Revision to store the baseline under (default: main)")
+	flags.StringVar(&opts.Dir, "dir", "", "Local directory containing screenshots to upload")
+	flags.StringVar(&opts.Dest, "dest", "", "S3 destination URL (s3://...)")
+	flags.BoolVar(&opts.Delete, "delete", false, "Delete S3 files not present locally")
+
+	return uploadCmd
+}
+
+// resolveCompareDefaults populates any of Baseline, Current and Output that the
+// caller left empty. Baseline and Current only receive defaults when --project
+// is set; Output always ends up non-empty.
+func resolveCompareDefaults(opts *ScreenshotDiffCompareOptions) {
+	if opts.Project == "" {
+		// Without --project only the report path has a fallback.
+		if opts.Output == "" {
+			opts.Output = "screenshot-diff/index.html"
+		}
+		return
+	}
+
+	bucket := getS3Bucket()
+	baselineURL := func(rev string) string {
+		return fmt.Sprintf("s3://%s/baselines/%s/%s/",
+			bucket, opts.Project, sanitizeRev(rev))
+	}
+
+	var baselineRev, currentDefault string
+	if opts.FromRev != "" && opts.ToRev != "" {
+		// Cross-revision mode: both sides are stored baselines in S3.
+		baselineRev = opts.FromRev
+		currentDefault = baselineURL(opts.ToRev)
+	} else {
+		// Standard mode: local screenshots versus one stored revision.
+		baselineRev = opts.Rev
+		if baselineRev == "" {
+			baselineRev = DefaultRev
+		}
+		currentDefault = DefaultScreenshotDir
+	}
+
+	if opts.Baseline == "" {
+		opts.Baseline = baselineURL(baselineRev)
+	}
+	if opts.Current == "" {
+		opts.Current = currentDefault
+	}
+	if opts.Output == "" {
+		opts.Output = filepath.Join(DefaultOutputDir, opts.Project, "index.html")
+	}
+}
+
+// resolveUploadDefaults populates Dir and Dest when they are empty and
+// --project is set; without --project the options are left untouched.
+func resolveUploadDefaults(opts *ScreenshotDiffUploadOptions) {
+	if opts.Project == "" {
+		return
+	}
+
+	if opts.Dir == "" {
+		opts.Dir = DefaultScreenshotDir
+	}
+	if opts.Dest != "" {
+		return
+	}
+
+	targetRev := DefaultRev
+	if opts.Rev != "" {
+		targetRev = opts.Rev
+	}
+	opts.Dest = fmt.Sprintf("s3://%s/baselines/%s/%s/",
+		getS3Bucket(), opts.Project, sanitizeRev(targetRev))
+}
+
+// downloadS3Dir creates a fresh temporary directory (named using prefix),
+// syncs the contents of s3URL into it and returns its path. On success the
+// caller owns the directory and must remove it; on failure it is removed here.
+func downloadS3Dir(s3URL string, prefix string) (string, error) {
+	localDir, mkErr := os.MkdirTemp("", prefix)
+	if mkErr != nil {
+		return "", fmt.Errorf("failed to create temp directory: %w", mkErr)
+	}
+
+	syncErr := s3.SyncDown(s3URL, localDir)
+	if syncErr == nil {
+		return localDir, nil
+	}
+
+	// Best-effort removal of the partially populated directory.
+	_ = os.RemoveAll(localDir)
+	return "", fmt.Errorf("failed to download from S3 (%s): %w", s3URL, syncErr)
+}
+
+// runCompare is the entry point for "screenshot-diff compare". It delegates to
+// runCompareE and terminates the process with a fatal log entry on error.
+func runCompare(opts *ScreenshotDiffCompareOptions) {
+	// log.Fatal exits via os.Exit, which skips deferred calls. All the work is
+	// done in runCompareE so that its deferred temp-dir cleanup has already run
+	// by the time we exit here.
+	if err := runCompareE(opts); err != nil {
+		log.Fatal(err)
+	}
+}
+
+// runCompareE validates the options, resolves both screenshot sets (downloading
+// from S3 when given an s3:// URL), compares them, always writes summary.json
+// next to the report path, and writes the HTML report only when differences
+// exist. Temporary download directories are removed before it returns.
+//
+// Error messages keep the exact wording (including capitalisation) of the
+// fatal log messages they replace so the command's output is unchanged.
+func runCompareE(opts *ScreenshotDiffCompareOptions) error {
+	// Validate cross-revision flags are used together
+	if (opts.FromRev != "") != (opts.ToRev != "") {
+		return fmt.Errorf("--from-rev and --to-rev must be used together")
+	}
+
+	resolveCompareDefaults(opts)
+
+	// Validate required fields
+	if opts.Baseline == "" {
+		return fmt.Errorf("--baseline is required (or use --project to set defaults)")
+	}
+	if opts.Current == "" {
+		return fmt.Errorf("--current is required (or use --project to set defaults)")
+	}
+
+	// Project name recorded in the summary: the --project flag, or "default"
+	// when the flag was not given.
+	project := opts.Project
+	if project == "" {
+		project = "default"
+	}
+
+	// Track temp dirs for cleanup; runs on every return path of this function.
+	var tempDirs []string
+	defer func() {
+		for _, d := range tempDirs {
+			_ = os.RemoveAll(d)
+		}
+	}()
+
+	// Resolve baseline directory
+	baselineDir := opts.Baseline
+	if strings.HasPrefix(opts.Baseline, "s3://") {
+		dir, err := downloadS3Dir(opts.Baseline, "screenshot-baseline-*")
+		if err != nil {
+			return fmt.Errorf("Failed to download baselines: %w", err)
+		}
+		tempDirs = append(tempDirs, dir)
+		baselineDir = dir
+	}
+
+	// Resolve current directory (may also be S3 in cross-revision mode)
+	currentDir := opts.Current
+	if strings.HasPrefix(opts.Current, "s3://") {
+		dir, err := downloadS3Dir(opts.Current, "screenshot-current-*")
+		if err != nil {
+			return fmt.Errorf("Failed to download current screenshots: %w", err)
+		}
+		tempDirs = append(tempDirs, dir)
+		currentDir = dir
+	}
+
+	// Verify baseline directory exists
+	if _, err := os.Stat(baselineDir); os.IsNotExist(err) {
+		log.Warnf("Baseline directory does not exist: %s", baselineDir)
+		log.Warn("This may be the first run -- no baselines to compare against.")
+		// Create an empty dir so CompareDirectories works (all files will be "added")
+		if err := os.MkdirAll(baselineDir, 0755); err != nil {
+			return fmt.Errorf("Failed to create baseline directory: %w", err)
+		}
+	}
+
+	// Resolve the output path
+	outputPath := opts.Output
+	if !filepath.IsAbs(outputPath) {
+		cwd, err := os.Getwd()
+		if err != nil {
+			return fmt.Errorf("Failed to get working directory: %w", err)
+		}
+		outputPath = filepath.Join(cwd, outputPath)
+	}
+	summaryPath := filepath.Join(filepath.Dir(outputPath), "summary.json")
+
+	// If the current screenshots directory doesn't exist, write an empty summary and stop
+	if _, err := os.Stat(currentDir); os.IsNotExist(err) {
+		log.Warnf("Current screenshots directory does not exist: %s", currentDir)
+		log.Warn("No screenshots captured for this project — writing empty summary.")
+
+		summary := imgdiff.Summary{Project: project}
+		if err := imgdiff.WriteSummary(summary, summaryPath); err != nil {
+			return fmt.Errorf("Failed to write summary: %w", err)
+		}
+		log.Infof("Summary written to: %s", summaryPath)
+		return nil
+	}
+
+	log.Infof("Comparing screenshots...")
+	log.Infof("  Baseline: %s", opts.Baseline)
+	log.Infof("  Current:  %s", opts.Current)
+	log.Infof("  Threshold: %.2f", opts.Threshold)
+
+	results, err := imgdiff.CompareDirectories(baselineDir, currentDir, opts.Threshold)
+	if err != nil {
+		return fmt.Errorf("Comparison failed: %w", err)
+	}
+
+	// Print terminal summary
+	printSummary(results)
+
+	// Build and write JSON summary (always)
+	summary := imgdiff.BuildSummary(project, results)
+	if err := imgdiff.WriteSummary(summary, summaryPath); err != nil {
+		return fmt.Errorf("Failed to write summary: %w", err)
+	}
+	log.Infof("Summary written to: %s", summaryPath)
+
+	// Generate HTML report only if there are differences
+	if !summary.HasDifferences {
+		log.Infof("No visual differences detected — skipping report generation.")
+		return nil
+	}
+
+	log.Infof("Generating report: %s", outputPath)
+	if err := imgdiff.GenerateReport(results, outputPath); err != nil {
+		return fmt.Errorf("Failed to generate report: %w", err)
+	}
+	log.Infof("Report generated successfully: %s", outputPath)
+	return nil
+}
+
+// runUploadBaselines is the entry point for "screenshot-diff upload-baselines".
+// It resolves defaults, checks that a source directory and an s3:// destination
+// are available, then syncs the directory up. Any failure is fatal.
+func runUploadBaselines(opts *ScreenshotDiffUploadOptions) {
+	resolveUploadDefaults(opts)
+
+	// Required inputs, checked in order; each failure exits the process.
+	switch {
+	case opts.Dir == "":
+		log.Fatal("--dir is required (or use --project to set defaults)")
+	case opts.Dest == "":
+		log.Fatal("--dest is required (or use --project to set defaults)")
+	}
+
+	_, statErr := os.Stat(opts.Dir)
+	if os.IsNotExist(statErr) {
+		log.Fatalf("Screenshots directory does not exist: %s", opts.Dir)
+	}
+
+	isS3Dest := strings.HasPrefix(opts.Dest, "s3://")
+	if !isS3Dest {
+		log.Fatalf("Destination must be an S3 URL (s3://...): %s", opts.Dest)
+	}
+
+	log.Infof("Uploading baselines...")
+	log.Infof("  Source: %s", opts.Dir)
+	log.Infof("  Dest:   %s", opts.Dest)
+
+	uploadErr := s3.SyncUp(opts.Dir, opts.Dest, opts.Delete)
+	if uploadErr != nil {
+		log.Fatalf("Failed to upload baselines: %v", uploadErr)
+	}
+
+	log.Info("Baselines uploaded successfully.")
+}
+
+// printSummary writes a boxed table of per-status counts to stdout, followed
+// by one line per changed, added or removed screenshot when there are any.
+func printSummary(results []imgdiff.Result) {
+	// Tally by status. A status outside the four known values is tallied under
+	// its own key and never displayed, so it only shows up in the total.
+	tally := make(map[imgdiff.Status]int)
+	for i := range results {
+		tally[results[i].Status]++
+	}
+	changed := tally[imgdiff.StatusChanged]
+	added := tally[imgdiff.StatusAdded]
+	removed := tally[imgdiff.StatusRemoved]
+	unchanged := tally[imgdiff.StatusUnchanged]
+
+	fmt.Println()
+	fmt.Println("╔══════════════════════════════════════════════╗")
+	fmt.Println("║          Visual Regression Summary           ║")
+	fmt.Println("╠══════════════════════════════════════════════╣")
+	fmt.Printf("║  Changed:   %-32d ║\n", changed)
+	fmt.Printf("║  Added:     %-32d ║\n", added)
+	fmt.Printf("║  Removed:   %-32d ║\n", removed)
+	fmt.Printf("║  Unchanged: %-32d ║\n", unchanged)
+	fmt.Printf("║  Total:     %-32d ║\n", len(results))
+	fmt.Println("╚══════════════════════════════════════════════╝")
+	fmt.Println()
+
+	// Nothing differs: the table is the whole output.
+	if changed+added+removed == 0 {
+		return
+	}
+
+	for _, r := range results {
+		if r.Status == imgdiff.StatusChanged {
+			fmt.Printf("  ⚠ CHANGED  %s (%.2f%% diff)\n", r.Name, r.DiffPercent)
+		} else if r.Status == imgdiff.StatusAdded {
+			fmt.Printf("  ✚ ADDED    %s\n", r.Name)
+		} else if r.Status == imgdiff.StatusRemoved {
+			fmt.Printf("  ✖ REMOVED  %s\n", r.Name)
+		}
+	}
+	fmt.Println()
+}
--- a/tools/ods/internal/imgdiff/compare.go
+++ b/tools/ods/internal/imgdiff/compare.go
@@ -0,0 +1,321 @@
+package imgdiff
+
+import (
+	"fmt"
+	"image"
+	"image/color"
+	"image/png"
+	"math"
+	"os"
+	"path/filepath"
+	"sort"
+	"strings"
+)
+
+// Status classifies the outcome of comparing one screenshot against its baseline.
+type Status int
+
+const (
+	// StatusUnchanged means no pixel of the baseline/current pair differed beyond the threshold.
+	StatusUnchanged Status = iota
+	// StatusChanged means at least one pixel differed beyond the threshold.
+	StatusChanged
+	// StatusAdded means the image exists only in the current directory (no baseline).
+	StatusAdded
+	// StatusRemoved means the image exists only in the baseline directory (no current).
+	StatusRemoved
+)
+
+// String returns the lowercase label for s ("unchanged", "changed", "added",
+// or "removed"); any other value yields "unknown".
+func (s Status) String() string {
+	labels := [...]string{
+		StatusUnchanged: "unchanged",
+		StatusChanged:   "changed",
+		StatusAdded:     "added",
+		StatusRemoved:   "removed",
+	}
+	// Values outside the declared constants have no label.
+	if s < 0 || int(s) >= len(labels) {
+		return "unknown"
+	}
+	return labels[s]
+}
+
+// Result holds the comparison result for a single screenshot.
+type Result struct {
+	// Name is the file name of the screenshot (e.g. "admin-documents-explorer.png").
+	Name string
+
+	// Status is the comparison outcome for this screenshot.
+	Status Status
+
+	// DiffPercent is DiffPixels as a percentage of TotalPixels (0.0 to 100.0).
+	DiffPercent float64
+
+	// DiffPixels is the number of pixels that differ beyond the threshold.
+	DiffPixels int
+
+	// TotalPixels is the pixel count of the compared area (the larger of the two images per axis).
+	TotalPixels int
+
+	// BaselinePath is the path to the baseline image (empty if added).
+	BaselinePath string
+
+	// CurrentPath is the path to the current image (empty if removed).
+	CurrentPath string
+
+	// DiffImage is the diff overlay built by Compare (nil for added and removed entries).
+	DiffImage image.Image
+}
+
+// Compare decodes two PNG files and compares them pixel by pixel.
+//
+// threshold (0.0 to 1.0) controls per-channel sensitivity: a pixel counts as
+// different when any of its R, G, B, or A values differs by more than
+// threshold * 255. Images of different sizes are aligned at their top-left
+// corners and compared over the larger width and height; positions outside an
+// image are treated as all-zero (transparent) pixels.
+//
+// Result.DiffImage is set only when the status is StatusChanged. An error is
+// returned if either file cannot be opened or decoded.
+func Compare(baselinePath, currentPath string, threshold float64) (*Result, error) {
+	baseline, err := decodePNG(baselinePath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode baseline %s: %w", baselinePath, err)
+	}
+
+	current, err := decodePNG(currentPath)
+	if err != nil {
+		return nil, fmt.Errorf("failed to decode current %s: %w", currentPath, err)
+	}
+
+	baselineBounds := baseline.Bounds()
+	currentBounds := current.Bounds()
+
+	// Use the larger dimensions to ensure we compare the full area.
+	width := max(baselineBounds.Dx(), currentBounds.Dx())
+	height := max(baselineBounds.Dy(), currentBounds.Dy())
+	totalPixels := width * height
+
+	result := &Result{
+		Name:         filepath.Base(currentPath),
+		Status:       StatusUnchanged,
+		TotalPixels:  totalPixels,
+		BaselinePath: baselinePath,
+		CurrentPath:  currentPath,
+	}
+	if totalPixels == 0 {
+		// Nothing to compare; this also avoids dividing by zero below.
+		return result, nil
+	}
+
+	diffImage := image.NewRGBA(image.Rect(0, 0, width, height))
+	thresholdValue := threshold * 255.0
+
+	for y := 0; y < height; y++ {
+		for x := 0; x < width; x++ {
+			// Zero values stand in for pixels outside an image's bounds.
+			var br, bg, bb, ba uint32
+			var cr, cg, cb, ca uint32
+
+			if x < baselineBounds.Dx() && y < baselineBounds.Dy() {
+				br, bg, bb, ba = baseline.At(baselineBounds.Min.X+x, baselineBounds.Min.Y+y).RGBA()
+			}
+			if x < currentBounds.Dx() && y < currentBounds.Dy() {
+				cr, cg, cb, ca = current.At(currentBounds.Min.X+x, currentBounds.Min.Y+y).RGBA()
+			}
+
+			// RGBA() yields 16-bit channels; keep the high 8 bits.
+			br8 := float64(br >> 8)
+			bg8 := float64(bg >> 8)
+			bb8 := float64(bb >> 8)
+			ba8 := float64(ba >> 8)
+			cr8 := float64(cr >> 8)
+			cg8 := float64(cg >> 8)
+			cb8 := float64(cb >> 8)
+			ca8 := float64(ca >> 8)
+
+			// Check if channels differ beyond threshold
+			isDiff := math.Abs(br8-cr8) > thresholdValue ||
+				math.Abs(bg8-cg8) > thresholdValue ||
+				math.Abs(bb8-cb8) > thresholdValue ||
+				math.Abs(ba8-ca8) > thresholdValue
+
+			if isDiff {
+				result.DiffPixels++
+				// Highlight in magenta for diff overlay
+				diffImage.Set(x, y, color.RGBA{R: 255, G: 0, B: 255, A: 255})
+			} else {
+				// Dim matching pixels to 30% of the current image, with a floor of 50 on alpha.
+				diffImage.Set(x, y, color.RGBA{
+					R: uint8(cr8 * 0.3),
+					G: uint8(cg8 * 0.3),
+					B: uint8(cb8 * 0.3),
+					A: uint8(math.Max(ca8*0.3, 50)),
+				})
+			}
+		}
+	}
+
+	result.DiffPercent = float64(result.DiffPixels) / float64(totalPixels) * 100.0
+	if result.DiffPixels > 0 {
+		result.Status = StatusChanged
+		// Assign only here: storing a nil *image.RGBA would make DiffImage a non-nil interface.
+		result.DiffImage = diffImage
+	}
+
+	return result, nil
+}
+
+// CompareDirectories compares the PNG files found directly in two directories.
+// Files are matched by name: those only in baseline are "removed", those only
+// in current are "added", and files in both are compared with Compare. A
+// directory that does not exist is treated as empty.
+func CompareDirectories(baselineDir, currentDir string, threshold float64) ([]Result, error) {
+	baselineFiles, err := listPNGs(baselineDir)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list baseline directory: %w", err)
+	}
+
+	currentFiles, err := listPNGs(currentDir)
+	if err != nil {
+		return nil, fmt.Errorf("failed to list current directory: %w", err)
+	}
+
+	// Build maps for lookup
+	baselineMap := make(map[string]string, len(baselineFiles))
+	for _, f := range baselineFiles {
+		baselineMap[filepath.Base(f)] = f
+	}
+
+	currentMap := make(map[string]string, len(currentFiles))
+	for _, f := range currentFiles {
+		currentMap[filepath.Base(f)] = f
+	}
+
+	// Collect all unique names
+	allNames := make(map[string]struct{})
+	for name := range baselineMap {
+		allNames[name] = struct{}{}
+	}
+	for name := range currentMap {
+		allNames[name] = struct{}{}
+	}
+
+	var results []Result
+	for name := range allNames {
+		baselinePath, inBaseline := baselineMap[name]
+		currentPath, inCurrent := currentMap[name]
+		switch {
+		case inBaseline && inCurrent:
+			result, err := Compare(baselinePath, currentPath, threshold)
+			if err != nil {
+				return nil, fmt.Errorf("failed to compare %s: %w", name, err)
+			}
+			results = append(results, *result)
+
+		case inBaseline && !inCurrent:
+			results = append(results, Result{
+				Name:         name,
+				Status:       StatusRemoved,
+				BaselinePath: baselinePath,
+			})
+
+		case !inBaseline && inCurrent:
+			results = append(results, Result{
+				Name:        name,
+				Status:      StatusAdded,
+				CurrentPath: currentPath,
+			})
+		}
+	}
+
+	// Sort: changed first (by diff % descending), then added, removed, unchanged.
+	// Ties fall back to the name so the order never depends on map iteration.
+	sort.Slice(results, func(i, j int) bool {
+		if results[i].Status != results[j].Status {
+			return statusOrder(results[i].Status) < statusOrder(results[j].Status)
+		}
+		if results[i].Status == StatusChanged && results[i].DiffPercent != results[j].DiffPercent {
+			return results[i].DiffPercent > results[j].DiffPercent
+		}
+		return results[i].Name < results[j].Name
+	})
+
+	return results, nil
+}
+
+// SaveDiffImage writes img to path as a PNG, creating parent directories as needed.
+func SaveDiffImage(img image.Image, path string) error {
+	if err := os.MkdirAll(filepath.Dir(path), 0755); err != nil {
+		return fmt.Errorf("failed to create directory: %w", err)
+	}
+	f, err := os.Create(path)
+	if err != nil {
+		return fmt.Errorf("failed to create file: %w", err)
+	}
+	if err := png.Encode(f, img); err != nil {
+		_ = f.Close() // best effort; the encode error is the one worth reporting
+		return fmt.Errorf("failed to encode PNG: %w", err)
+	}
+	if err := f.Close(); err != nil { // a failed close may signal a write that never reached disk
+		return fmt.Errorf("failed to close file: %w", err)
+	}
+	return nil
+}
+
+// decodePNG opens the file at path and decodes its contents as a PNG image.
+func decodePNG(path string) (image.Image, error) {
+	file, openErr := os.Open(path)
+	if openErr != nil {
+		return nil, openErr
+	}
+	// Read-only handle: a close failure cannot lose data, so it is ignored.
+	defer func() { _ = file.Close() }()
+
+	decoded, decodeErr := png.Decode(file)
+	if decodeErr == nil {
+		return decoded, nil
+	}
+	return nil, decodeErr
+}
+
+// listPNGs returns the paths of the .png files (matched case-insensitively)
+// found directly in dir; subdirectories are not descended into. A missing
+// directory yields a nil slice and no error.
+func listPNGs(dir string) ([]string, error) {
+	entries, err := os.ReadDir(dir)
+	switch {
+	case err == nil:
+		// proceed to the scan below
+	case os.IsNotExist(err):
+		return nil, nil
+	default:
+		return nil, err
+	}
+	var found []string
+	for _, e := range entries {
+		name := e.Name()
+		if !e.IsDir() && strings.HasSuffix(strings.ToLower(name), ".png") {
+			found = append(found, filepath.Join(dir, name))
+		}
+	}
+	return found, nil
+}
+
+// statusOrder maps a status to its sort rank: changed, added, removed, then
+// unchanged; unrecognized values sort last.
+func statusOrder(s Status) int {
+	ranks := [...]int{
+		StatusChanged:   0,
+		StatusAdded:     1,
+		StatusRemoved:   2,
+		StatusUnchanged: 3,
+	}
+	if s >= 0 && int(s) < len(ranks) {
+		return ranks[s]
+	}
+	// Anything else ranks after every declared status.
+	return 4
+}
--- a/tools/ods/internal/imgdiff/compare_test.go
+++ b/tools/ods/internal/imgdiff/compare_test.go
@@ -0,0 +1,309 @@
+package imgdiff
+
+import (
+	"image"
+	"image/color"
+	"image/png"
+	"os"
+	"path/filepath"
+	"testing"
+)
+
+// createTestPNG writes a width x height PNG filled with the color c to path,
+// creating parent directories first. Any failure aborts the calling test.
+func createTestPNG(t *testing.T, path string, width, height int, c color.Color) {
+	t.Helper()
+	if mkErr := os.MkdirAll(filepath.Dir(path), 0755); mkErr != nil {
+		t.Fatalf("failed to create dir: %v", mkErr)
+	}
+	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
+	// Visit every pixel in row-major order.
+	for i := 0; i < width*height; i++ {
+		canvas.Set(i%width, i/width, c)
+	}
+	out, createErr := os.Create(path)
+	if createErr != nil {
+		t.Fatalf("failed to create file: %v", createErr)
+	}
+	defer func() { _ = out.Close() }()
+	if encErr := png.Encode(out, canvas); encErr != nil {
+		t.Fatalf("failed to encode PNG: %v", encErr)
+	}
+}
+
+// createTestPNGWithBlock writes a bg-colored PNG with a bw x bh rectangle of block color whose top-left corner is (bx, by).
+func createTestPNGWithBlock(t *testing.T, path string, width, height int, bg, block color.Color, bx, by, bw, bh int) {
+	t.Helper()
+	if mkErr := os.MkdirAll(filepath.Dir(path), 0755); mkErr != nil {
+		t.Fatalf("failed to create dir: %v", mkErr)
+	}
+	canvas := image.NewRGBA(image.Rect(0, 0, width, height))
+	for row := 0; row < height; row++ {
+		for col := 0; col < width; col++ {
+			fill := bg
+			if col >= bx && col < bx+bw && row >= by && row < by+bh {
+				fill = block
+			}
+			canvas.Set(col, row, fill)
+		}
+	}
+	out, createErr := os.Create(path)
+	if createErr != nil {
+		t.Fatalf("failed to create file: %v", createErr)
+	}
+	defer func() { _ = out.Close() }()
+	if encErr := png.Encode(out, canvas); encErr != nil {
+		t.Fatalf("failed to encode PNG: %v", encErr)
+	}
+}
+
+func TestCompare_IdenticalImages(t *testing.T) {
+	tmp := t.TempDir()
+	basePNG := filepath.Join(tmp, "baseline.png")
+	curPNG := filepath.Join(tmp, "current.png")
+
+	// Baseline and current hold the same solid white 100x100 square.
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	createTestPNG(t, basePNG, 100, 100, white)
+	createTestPNG(t, curPNG, 100, 100, white)
+
+	got, err := Compare(basePNG, curPNG, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+	if got.Status != StatusUnchanged {
+		t.Errorf("expected StatusUnchanged, got %s", got.Status)
+	}
+	if got.DiffPercent != 0.0 {
+		t.Errorf("expected 0%% diff, got %.2f%%", got.DiffPercent)
+	}
+	if got.DiffPixels != 0 {
+		t.Errorf("expected 0 diff pixels, got %d", got.DiffPixels)
+	}
+	if got.TotalPixels != 10000 {
+		t.Errorf("expected 10000 total pixels, got %d", got.TotalPixels)
+	}
+}
+
+func TestCompare_DifferentImages(t *testing.T) {
+	dir := t.TempDir()
+	baselinePath := filepath.Join(dir, "baseline.png")
+	currentPath := filepath.Join(dir, "current.png")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	red := color.RGBA{R: 255, G: 0, B: 0, A: 255}
+	// Baseline: all white
+	createTestPNG(t, baselinePath, 100, 100, white)
+	// Current: white with a 10x10 red block (100 pixels different)
+	createTestPNGWithBlock(t, currentPath, 100, 100, white, red, 0, 0, 10, 10)
+
+	result, err := Compare(baselinePath, currentPath, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	if result.Status != StatusChanged {
+		t.Errorf("expected StatusChanged, got %s", result.Status)
+	}
+	if result.DiffPixels != 100 {
+		t.Errorf("expected 100 diff pixels, got %d", result.DiffPixels)
+	}
+	// 100 of 10000 pixels is 1%; use a tolerance, since the value is computed in floating point.
+	if delta := result.DiffPercent - 1.0; delta > 1e-9 || delta < -1e-9 {
+		t.Errorf("expected 1.0%% diff, got %.2f%%", result.DiffPercent)
+	}
+	if result.DiffImage == nil {
+		t.Error("expected non-nil DiffImage")
+	}
+}
+
+func TestCompare_SubtleDifferenceBelowThreshold(t *testing.T) {
+	tmp := t.TempDir()
+	basePNG := filepath.Join(tmp, "baseline.png")
+	curPNG := filepath.Join(tmp, "current.png")
+
+	// Near-identical greys: only the red channel differs, and only by 10.
+	grey := color.RGBA{R: 200, G: 200, B: 200, A: 255}
+	nudged := color.RGBA{R: 210, G: 200, B: 200, A: 255}
+
+	createTestPNG(t, basePNG, 10, 10, grey)
+	createTestPNG(t, curPNG, 10, 10, nudged)
+
+	// A 0.2 threshold tolerates a channel delta of 51, so a delta of 10 must not register.
+	got, err := Compare(basePNG, curPNG, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	if got.Status != StatusUnchanged {
+		t.Errorf("expected StatusUnchanged (diff below threshold), got %s", got.Status)
+	}
+	if got.DiffPixels != 0 {
+		t.Errorf("expected 0 diff pixels (below threshold), got %d", got.DiffPixels)
+	}
+}
+
+func TestCompare_DifferentSizes(t *testing.T) {
+	tmp := t.TempDir()
+	basePNG := filepath.Join(tmp, "baseline.png")
+	curPNG := filepath.Join(tmp, "current.png")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	createTestPNG(t, basePNG, 100, 100, white)
+	createTestPNG(t, curPNG, 100, 120, white) // 20 rows taller than the baseline
+
+	got, err := Compare(basePNG, curPNG, 0.2)
+	if err != nil {
+		t.Fatalf("Compare failed: %v", err)
+	}
+
+	// Rows 100-119 exist only in current, so white is compared against all-zero pixels there.
+	if got.Status != StatusChanged {
+		t.Errorf("expected StatusChanged for different sizes, got %s", got.Status)
+	}
+	if got.TotalPixels != 12000 { // 100*120
+		t.Errorf("expected 12000 total pixels, got %d", got.TotalPixels)
+	}
+}
+
+func TestCompareDirectories(t *testing.T) {
+	baseDir := filepath.Join(t.TempDir(), "baseline")
+	curDir := filepath.Join(t.TempDir(), "current")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	red := color.RGBA{R: 255, G: 0, B: 0, A: 255}
+	blue := color.RGBA{R: 0, G: 0, B: 255, A: 255}
+
+	// Present on both sides with identical content.
+	createTestPNG(t, filepath.Join(baseDir, "shared-unchanged.png"), 10, 10, white)
+	createTestPNG(t, filepath.Join(curDir, "shared-unchanged.png"), 10, 10, white)
+
+	// Present on both sides with different content.
+	createTestPNG(t, filepath.Join(baseDir, "shared-changed.png"), 10, 10, white)
+	createTestPNG(t, filepath.Join(curDir, "shared-changed.png"), 10, 10, red)
+
+	// Present only in the baseline directory.
+	createTestPNG(t, filepath.Join(baseDir, "removed.png"), 10, 10, white)
+
+	// Present only in the current directory.
+	createTestPNG(t, filepath.Join(curDir, "added.png"), 10, 10, blue)
+
+	got, err := CompareDirectories(baseDir, curDir, 0.2)
+	if err != nil {
+		t.Fatalf("CompareDirectories failed: %v", err)
+	}
+
+	if len(got) != 4 {
+		t.Fatalf("expected 4 results, got %d", len(got))
+	}
+
+	// Every status should be represented exactly once.
+	tally := map[Status]int{}
+	for _, r := range got {
+		tally[r.Status]++
+	}
+
+	if tally[StatusChanged] != 1 {
+		t.Errorf("expected 1 changed, got %d", tally[StatusChanged])
+	}
+	if tally[StatusAdded] != 1 {
+		t.Errorf("expected 1 added, got %d", tally[StatusAdded])
+	}
+	if tally[StatusRemoved] != 1 {
+		t.Errorf("expected 1 removed, got %d", tally[StatusRemoved])
+	}
+	if tally[StatusUnchanged] != 1 {
+		t.Errorf("expected 1 unchanged, got %d", tally[StatusUnchanged])
+	}
+
+	// Changed entries sort ahead of every other status.
+	if got[0].Status != StatusChanged {
+		t.Errorf("expected first result to be changed, got %s", got[0].Status)
+	}
+}
+
+func TestCompareDirectories_EmptyBaseline(t *testing.T) {
+	baseDir := filepath.Join(t.TempDir(), "baseline")
+	curDir := filepath.Join(t.TempDir(), "current")
+
+	if mkErr := os.MkdirAll(baseDir, 0755); mkErr != nil {
+		t.Fatal(mkErr)
+	}
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	createTestPNG(t, filepath.Join(curDir, "new.png"), 10, 10, white)
+
+	got, err := CompareDirectories(baseDir, curDir, 0.2)
+	if err != nil {
+		t.Fatalf("CompareDirectories failed: %v", err)
+	}
+
+	if len(got) != 1 {
+		t.Fatalf("expected 1 result, got %d", len(got))
+	}
+	if got[0].Status != StatusAdded {
+		t.Errorf("expected StatusAdded, got %s", got[0].Status)
+	}
+}
+
+func TestGenerateReport(t *testing.T) {
+	tmp := t.TempDir()
+	baseDir := filepath.Join(tmp, "baseline")
+	curDir := filepath.Join(tmp, "current")
+
+	white := color.RGBA{R: 255, G: 255, B: 255, A: 255}
+	red := color.RGBA{R: 255, G: 0, B: 0, A: 255}
+
+	createTestPNG(t, filepath.Join(baseDir, "page.png"), 50, 50, white)
+	createTestPNG(t, filepath.Join(curDir, "page.png"), 50, 50, red)
+
+	results, err := CompareDirectories(baseDir, curDir, 0.2)
+	if err != nil {
+		t.Fatalf("CompareDirectories failed: %v", err)
+	}
+
+	reportPath := filepath.Join(tmp, "report", "index.html")
+	if err := GenerateReport(results, reportPath); err != nil {
+		t.Fatalf("GenerateReport failed: %v", err)
+	}
+
+	// The report must exist on disk and be non-empty.
+	stat, err := os.Stat(reportPath)
+	if err != nil {
+		t.Fatalf("report file not found: %v", err)
+	}
+	if stat.Size() == 0 {
+		t.Error("report file is empty")
+	}
+
+	// The rendered HTML must mention the title, an inlined image, the file, and its status.
+	raw, err := os.ReadFile(reportPath)
+	if err != nil {
+		t.Fatalf("failed to read report: %v", err)
+	}
+	page := string(raw)
+	wanted := []string{
+		"Visual Regression Report",
+		"data:image/png;base64,",
+		"page.png",
+		"changed",
+	}
+	for _, fragment := range wanted {
+		if !contains(page, fragment) {
+			t.Errorf("report missing expected content: %q", fragment)
+		}
+	}
+}
+
+// contains reports whether substr occurs anywhere in s. The length guard
+// rejects an over-long substr before searchString scans.
+func contains(s, substr string) bool { return len(substr) <= len(s) && searchString(s, substr) }
+
+// searchString reports whether substr occurs in s, trying each start offset in turn.
+func searchString(s, substr string) bool {
+	found := false
+	for start, end := 0, len(substr); !found && end <= len(s); start, end = start+1, end+1 {
+		found = s[start:end] == substr
+	}
+	return found
+}
--- a/tools/ods/internal/imgdiff/report.go
+++ b/tools/ods/internal/imgdiff/report.go
@@ -0,0 +1,345 @@
+package imgdiff
+
+import (
+	"bytes"
+	"encoding/base64"
+	"fmt"
+	"html/template"
+	"image"
+	"image/png"
+	"os"
+	"path/filepath"
+)
+
+// reportEntry holds data for a single screenshot in the HTML template.
+type reportEntry struct {
+	Name            string       // screenshot file name
+	Status          string       // Status.String() value; the template branches on it
+	DiffPercent     string       // preformatted percentage; left empty for added and removed
+	BaselineDataURI template.URL // base64 PNG data URI of the baseline image
+	CurrentDataURI  template.URL // base64 PNG data URI of the current image
+	DiffDataURI     template.URL // base64 PNG data URI of the diff overlay
+	HasBaseline     bool         // true when BaselineDataURI is populated
+	HasCurrent      bool         // true when CurrentDataURI is populated
+	HasDiff         bool         // true when DiffDataURI is populated
+}
+
+// reportData is the root value the HTML template is executed with.
+type reportData struct {
+	Entries        []reportEntry // one per result, in the order passed to GenerateReport
+	ChangedCount   int           // results with StatusChanged
+	AddedCount     int           // results with StatusAdded
+	RemovedCount   int           // results with StatusRemoved
+	UnchangedCount int           // results with StatusUnchanged
+	TotalCount     int           // number of results
+	HasDifferences bool          // true if anything was changed, added, or removed
+}
+
+// GenerateReport renders results into a self-contained HTML file at outputPath,
+// creating parent directories as needed. Images are inlined as base64 data URIs.
+// Unchanged entries appear in the report by name only, so their images are not
+// read or encoded.
+func GenerateReport(results []Result, outputPath string) error {
+	if err := os.MkdirAll(filepath.Dir(outputPath), 0755); err != nil {
+		return fmt.Errorf("failed to create output directory: %w", err)
+	}
+
+	data := reportData{TotalCount: len(results)}
+	for _, r := range results {
+		entry := reportEntry{Name: r.Name, Status: r.Status.String()}
+		switch r.Status {
+		case StatusChanged:
+			data.ChangedCount++
+			entry.DiffPercent = fmt.Sprintf("%.2f%%", r.DiffPercent)
+		case StatusAdded:
+			data.AddedCount++
+		case StatusRemoved:
+			data.RemovedCount++
+		case StatusUnchanged:
+			data.UnchangedCount++
+			entry.DiffPercent = "0.00%"
+			// The template lists unchanged entries by name only; skip the image work.
+			data.Entries = append(data.Entries, entry)
+			continue
+		}
+
+		if r.BaselinePath != "" {
+			uri, err := pngFileToDataURI(r.BaselinePath)
+			if err != nil {
+				return fmt.Errorf("failed to encode baseline %s: %w", r.Name, err)
+			}
+			entry.BaselineDataURI = template.URL(uri)
+			entry.HasBaseline = true
+		}
+
+		if r.CurrentPath != "" {
+			uri, err := pngFileToDataURI(r.CurrentPath)
+			if err != nil {
+				return fmt.Errorf("failed to encode current %s: %w", r.Name, err)
+			}
+			entry.CurrentDataURI = template.URL(uri)
+			entry.HasCurrent = true
+		}
+
+		if r.DiffImage != nil {
+			uri, err := imageToDataURI(r.DiffImage)
+			if err != nil {
+				return fmt.Errorf("failed to encode diff %s: %w", r.Name, err)
+			}
+			entry.DiffDataURI = template.URL(uri)
+			entry.HasDiff = true
+		}
+
+		data.Entries = append(data.Entries, entry)
+	}
+
+	data.HasDifferences = data.ChangedCount > 0 || data.AddedCount > 0 || data.RemovedCount > 0
+
+	tmpl, err := template.New("report").Parse(htmlTemplate)
+	if err != nil {
+		return fmt.Errorf("failed to parse template: %w", err)
+	}
+
+	f, err := os.Create(outputPath)
+	if err != nil {
+		return fmt.Errorf("failed to create output file: %w", err)
+	}
+	if err := tmpl.Execute(f, data); err != nil {
+		_ = f.Close() // best effort; the template error is the one worth reporting
+		return fmt.Errorf("failed to execute template: %w", err)
+	}
+	if err := f.Close(); err != nil { // a failed close may signal a write that never reached disk
+		return fmt.Errorf("failed to close output file: %w", err)
+	}
+	return nil
+}
+
+// pngFileToDataURI returns the bytes of the file at path as a base64 PNG data URI.
+func pngFileToDataURI(path string) (string, error) {
+	raw, readErr := os.ReadFile(path)
+	if readErr != nil {
+		return "", readErr
+	}
+	const prefix = "data:image/png;base64,"
+	return prefix + base64.StdEncoding.EncodeToString(raw), nil
+}
+
+// imageToDataURI PNG-encodes img in memory and returns it as a base64 data URI.
+func imageToDataURI(img image.Image) (string, error) {
+	encodedPNG := new(bytes.Buffer)
+	if encErr := png.Encode(encodedPNG, img); encErr != nil {
+		return "", encErr
+	}
+	const prefix = "data:image/png;base64,"
+	return prefix + base64.StdEncoding.EncodeToString(encodedPNG.Bytes()), nil
+}
+// htmlTemplate is the html/template source that GenerateReport executes with a reportData value; styles and scripts are inline.
+const htmlTemplate = `<!DOCTYPE html>
+<html lang="en">
+<head>
+<meta charset="UTF-8">
+<meta name="viewport" content="width=device-width, initial-scale=1.0">
+<title>Visual Regression Report</title>
+<style>
+  * { box-sizing: border-box; margin: 0; padding: 0; }
+  body { font-family: -apple-system, BlinkMacSystemFont, "Segoe UI", Roboto, sans-serif; background: #f5f5f5; color: #333; }
+  .header { background: #1a1a2e; color: #fff; padding: 24px 32px; }
+  .header h1 { font-size: 24px; font-weight: 600; }
+  .header p { margin-top: 8px; opacity: 0.8; font-size: 14px; }
+  .summary { display: flex; gap: 16px; padding: 20px 32px; background: #fff; border-bottom: 1px solid #e0e0e0; flex-wrap: wrap; }
+  .summary-card { padding: 12px 20px; border-radius: 8px; font-size: 14px; font-weight: 500; }
+  .summary-changed { background: #fff3e0; color: #e65100; }
+  .summary-added { background: #e8f5e9; color: #2e7d32; }
+  .summary-removed { background: #fce4ec; color: #c62828; }
+  .summary-unchanged { background: #e3f2fd; color: #1565c0; }
+  .content { padding: 24px 32px; max-width: 1400px; margin: 0 auto; }
+  .section-title { font-size: 18px; font-weight: 600; margin: 24px 0 16px; padding-bottom: 8px; border-bottom: 2px solid #e0e0e0; }
+  .no-changes { text-align: center; padding: 60px 20px; color: #666; }
+  .no-changes h2 { font-size: 24px; margin-bottom: 8px; color: #2e7d32; }
+  .card { background: #fff; border-radius: 12px; margin-bottom: 24px; box-shadow: 0 1px 3px rgba(0,0,0,0.1); overflow: hidden; }
+  .card-header { display: flex; justify-content: space-between; align-items: center; padding: 16px 20px; border-bottom: 1px solid #eee; }
+  .card-name { font-weight: 600; font-size: 15px; }
+  .card-badge { font-size: 12px; padding: 4px 10px; border-radius: 12px; font-weight: 500; }
+  .badge-changed { background: #fff3e0; color: #e65100; }
+  .badge-added { background: #e8f5e9; color: #2e7d32; }
+  .badge-removed { background: #fce4ec; color: #c62828; }
+  .tabs { display: flex; gap: 0; border-bottom: 1px solid #eee; }
+  .tab { padding: 10px 20px; cursor: pointer; font-size: 13px; font-weight: 500; color: #666; border-bottom: 2px solid transparent; transition: all 0.2s; }
+  .tab:hover { color: #333; background: #f9f9f9; }
+  .tab.active { color: #1a1a2e; border-bottom-color: #1a1a2e; }
+  .tab-content { display: none; padding: 20px; }
+  .tab-content.active { display: block; }
+  .slider-container { position: relative; overflow: hidden; cursor: ew-resize; user-select: none; border: 1px solid #eee; border-radius: 4px; }
+  .slider-container > img { display: block; width: 100%; height: auto; }
+  .slider-baseline { position: absolute; top: 0; left: 0; width: 100%; height: 100%; clip-path: inset(0 50% 0 0); }
+  .slider-baseline img { display: block; width: 100%; height: auto; }
+  .slider-divider { position: absolute; top: 0; width: 3px; height: 100%; background: #e65100; z-index: 10; cursor: ew-resize; }
+  .slider-divider::before { content: ""; position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); width: 32px; height: 32px; background: #e65100; border-radius: 50%; border: 2px solid #fff; box-shadow: 0 2px 8px rgba(0,0,0,0.3); }
+  .slider-divider::after { content: "\2194"; position: absolute; top: 50%; left: 50%; transform: translate(-50%, -50%); color: #fff; font-size: 16px; z-index: 1; }
+  .slider-label { position: absolute; top: 10px; padding: 4px 10px; background: rgba(0,0,0,0.6); color: #fff; font-size: 11px; border-radius: 4px; z-index: 5; pointer-events: none; }
+  .slider-label-left { left: 10px; }
+  .slider-label-right { right: 10px; }
+  .side-by-side { display: grid; grid-template-columns: 1fr 1fr; gap: 16px; }
+  .side-by-side .img-container { border: 1px solid #eee; border-radius: 4px; overflow: hidden; }
+  .side-by-side .img-label { font-size: 12px; font-weight: 500; padding: 8px 12px; background: #f5f5f5; color: #666; }
+  .side-by-side img { display: block; width: 100%; height: auto; }
+  .diff-overlay img { display: block; max-width: 100%; height: auto; border: 1px solid #eee; border-radius: 4px; }
+  .single-image img { display: block; max-width: 100%; height: auto; border: 1px solid #eee; border-radius: 4px; }
+  .unchanged-section { margin-top: 32px; }
+  .unchanged-toggle { cursor: pointer; font-size: 14px; color: #666; padding: 12px 0; }
+  .unchanged-toggle:hover { color: #333; }
+  .unchanged-list { display: none; }
+  .unchanged-list.open { display: block; }
+  .unchanged-item { padding: 8px 0; font-size: 13px; color: #888; border-bottom: 1px solid #f0f0f0; }
+</style>
+</head>
+<body>
+
+<div class="header">
+  <h1>Visual Regression Report</h1>
+  <p>{{.TotalCount}} screenshot{{if ne .TotalCount 1}}s{{end}} compared</p>
+</div>
+
+<div class="summary">
+  {{if gt .ChangedCount 0}}<div class="summary-card summary-changed">{{.ChangedCount}} Changed</div>{{end}}
+  {{if gt .AddedCount 0}}<div class="summary-card summary-added">{{.AddedCount}} Added</div>{{end}}
+  {{if gt .RemovedCount 0}}<div class="summary-card summary-removed">{{.RemovedCount}} Removed</div>{{end}}
+  <div class="summary-card summary-unchanged">{{.UnchangedCount}} Unchanged</div>
+</div>
+
+<div class="content">
+{{if not .HasDifferences}}
+  <div class="no-changes">
+    <h2>No visual changes detected</h2>
+    <p>All {{.TotalCount}} screenshots match their baselines.</p>
+  </div>
+{{end}}
+
+{{range .Entries}}
+{{if eq .Status "changed"}}
+<div class="card">
+  <div class="card-header">
+    <span class="card-name">{{.Name}}</span>
+    <span class="card-badge badge-changed">{{.DiffPercent}} changed</span>
+  </div>
+  <div class="tabs">
+    <div class="tab active" onclick="switchTab(this, 'slider')">Slider</div>
+    <div class="tab" onclick="switchTab(this, 'sidebyside')">Side by Side</div>
+    <div class="tab" onclick="switchTab(this, 'diff')">Diff Overlay</div>
+  </div>
+  <div class="tab-content active" data-tab="slider">
+    <div class="slider-container" onmousedown="startSlider(event, this)" onmousemove="moveSlider(event, this)" ontouchstart="startSlider(event, this)" ontouchmove="moveSlider(event, this)">
+      <img src="{{.CurrentDataURI}}" alt="Current" draggable="false">
+      <div class="slider-baseline">
+        <img src="{{.BaselineDataURI}}" alt="Baseline" draggable="false">
+      </div>
+      <div class="slider-divider" style="left: calc(50% - 1.5px);"></div>
+      <span class="slider-label slider-label-left">Baseline</span>
+      <span class="slider-label slider-label-right">Current</span>
+    </div>
+  </div>
+  <div class="tab-content" data-tab="sidebyside">
+    <div class="side-by-side">
+      <div class="img-container">
+        <div class="img-label">Baseline</div>
+        <img src="{{.BaselineDataURI}}" alt="Baseline">
+      </div>
+      <div class="img-container">
+        <div class="img-label">Current</div>
+        <img src="{{.CurrentDataURI}}" alt="Current">
+      </div>
+    </div>
+  </div>
+  <div class="tab-content" data-tab="diff">
+    <div class="diff-overlay">
+      {{if .HasDiff}}<img src="{{.DiffDataURI}}" alt="Diff overlay">{{end}}
+    </div>
+  </div>
+</div>
+{{end}}
+
+{{if eq .Status "added"}}
+<div class="card">
+  <div class="card-header">
+    <span class="card-name">{{.Name}}</span>
+    <span class="card-badge badge-added">added</span>
+  </div>
+  <div class="tab-content active" data-tab="single">
+    <div class="single-image">
+      {{if .HasCurrent}}<img src="{{.CurrentDataURI}}" alt="New screenshot">{{end}}
+    </div>
+  </div>
+</div>
+{{end}}
+
+{{if eq .Status "removed"}}
+<div class="card">
+  <div class="card-header">
+    <span class="card-name">{{.Name}}</span>
+    <span class="card-badge badge-removed">removed</span>
+  </div>
+  <div class="tab-content active" data-tab="single">
+    <div class="single-image">
+      {{if .HasBaseline}}<img src="{{.BaselineDataURI}}" alt="Removed screenshot">{{end}}
+    </div>
+  </div>
+</div>
+{{end}}
+{{end}}
+
+{{if gt .UnchangedCount 0}}
+<div class="unchanged-section">
+  <div class="unchanged-toggle" onclick="toggleUnchanged(this)">
+    &#9654; {{.UnchangedCount}} unchanged screenshot{{if ne .UnchangedCount 1}}s{{end}} (click to expand)
+  </div>
+  <div class="unchanged-list">
+    {{range .Entries}}{{if eq .Status "unchanged"}}<div class="unchanged-item">{{.Name}}</div>{{end}}{{end}}
+  </div>
+</div>
+{{end}}
+
+</div>
+
+<script>
+// Tab switching
+function switchTab(tabEl, tabName) {
+  const card = tabEl.closest('.card');
+  card.querySelectorAll('.tab').forEach(t => t.classList.remove('active'));
+  card.querySelectorAll('.tab-content').forEach(c => c.classList.remove('active'));
+  tabEl.classList.add('active');
+  card.querySelector('[data-tab="' + tabName + '"]').classList.add('active');
+}
+
+// Slider interaction
+let sliderActive = false;
+
+function startSlider(e, container) {
+  sliderActive = true;
+  moveSlider(e, container);
+  const stopSlider = function() { sliderActive = false; };
+  document.addEventListener('mouseup', stopSlider, { once: true });
+  document.addEventListener('touchend', stopSlider, { once: true });
+}
+
+function moveSlider(e, container) {
+  if (!sliderActive) return;
+  e.preventDefault();
+  const rect = container.getBoundingClientRect();
+  const clientX = e.touches ? e.touches[0].clientX : e.clientX;
+  let x = clientX - rect.left;
+  x = Math.max(0, Math.min(x, rect.width));
+  const percent = (x / rect.width) * 100;
+  const clipRight = 100 - percent;
+  container.querySelector('.slider-baseline').style.clipPath = 'inset(0 ' + clipRight + '% 0 0)';
+  container.querySelector('.slider-divider').style.left = 'calc(' + percent + '% - 1.5px)';
+}
+
+// Unchanged section toggle
+function toggleUnchanged(el) {
+  const list = el.nextElementSibling;
+  const isOpen = list.classList.toggle('open');
+  el.innerHTML = (isOpen ? '&#9660;' : '&#9654;') + ' {{.UnchangedCount}} unchanged screenshot{{if ne .UnchangedCount 1}}s{{end}} (click to ' + (isOpen ? 'collapse' : 'expand') + ')';
+}
+</script>
+</body>
+</html>`
--- a/tools/ods/internal/imgdiff/summary.go
+++ b/tools/ods/internal/imgdiff/summary.go
@@ -0,0 +1,60 @@
+package imgdiff
+
+import (
+	"encoding/json"
+	"fmt"
+	"os"
+	"path/filepath"
+)
+
+// Summary holds aggregate comparison results in a JSON-friendly format.
+// It is written alongside the HTML report so that CI pipelines can read it
+// without parsing HTML.
+type Summary struct {
+	Project        string `json:"project"`
+	Changed        int    `json:"changed"`
+	Added          int    `json:"added"`
+	Removed        int    `json:"removed"`
+	Unchanged      int    `json:"unchanged"`
+	Total          int    `json:"total"`           // number of results, whatever their status
+	HasDifferences bool   `json:"has_differences"` // true when anything changed, was added, or was removed
+}
+
+// BuildSummary tallies results by status and flags whether anything differs.
+func BuildSummary(project string, results []Result) Summary {
+	summary := Summary{Project: project, Total: len(results)}
+	for i := range results {
+		switch results[i].Status {
+		case StatusUnchanged:
+			summary.Unchanged++
+		case StatusChanged:
+			summary.Changed++
+		case StatusAdded:
+			summary.Added++
+		case StatusRemoved:
+			summary.Removed++
+		}
+	}
+	differing := summary.Changed + summary.Added + summary.Removed
+	summary.HasDifferences = differing > 0
+	return summary
+}
+
+// WriteSummary serializes summary as indented JSON and stores it at path.
+// Any missing parent directories are created first.
+func WriteSummary(summary Summary, path string) error {
+	if mkdirErr := os.MkdirAll(filepath.Dir(path), 0755); mkdirErr != nil {
+		return fmt.Errorf("failed to create directory for summary: %w", mkdirErr)
+	}
+
+	encoded, marshalErr := json.MarshalIndent(summary, "", "  ")
+	if marshalErr != nil {
+		return fmt.Errorf("failed to marshal summary: %w", marshalErr)
+	}
+
+	writeErr := os.WriteFile(path, encoded, 0644)
+	if writeErr != nil {
+		return fmt.Errorf("failed to write summary: %w", writeErr)
+	}
+	return nil
+}
--- a/tools/ods/internal/s3/sync.go
+++ b/tools/ods/internal/s3/sync.go
@@ -0,0 +1,49 @@
+package s3
+
+import (
+	"fmt"
+	"os"
+	"os/exec"
+
+	log "github.com/sirupsen/logrus"
+)
+
+// SyncDown mirrors an S3 prefix into a local directory by shelling out to the
+// AWS CLI (aws s3 sync <s3url> <destDir>), creating destDir if it is missing.
+func SyncDown(s3url string, destDir string) error {
+	if mkdirErr := os.MkdirAll(destDir, 0755); mkdirErr != nil {
+		return fmt.Errorf("failed to create destination directory: %w", mkdirErr)
+	}
+
+	log.Infof("Downloading from %s to %s ...", s3url, destDir)
+	// Forward the CLI's stdout/stderr to this process's own streams.
+	syncCmd := exec.Command("aws", "s3", "sync", s3url, destDir)
+	syncCmd.Stdout, syncCmd.Stderr = os.Stdout, os.Stderr
+
+	runErr := syncCmd.Run()
+	if runErr == nil {
+		return nil
+	}
+	return fmt.Errorf("aws s3 sync failed: %w\n\nTo authenticate, run:\n  aws sso login\n\nOr configure AWS credentials with:\n  aws configure sso", runErr)
+}
+
+// SyncUp mirrors a local directory to an S3 prefix by shelling out to the AWS
+// CLI (aws s3 sync <srcDir> <s3url> [--delete]). When delete is true, files in
+// S3 that don't exist locally are removed.
+func SyncUp(srcDir string, s3url string, delete bool) error {
+	cliArgs := []string{"s3", "sync", srcDir, s3url}
+	if delete {
+		cliArgs = append(cliArgs, "--delete")
+	}
+
+	log.Infof("Uploading from %s to %s ...", srcDir, s3url)
+	// Forward the CLI's stdout/stderr to this process's own streams.
+	syncCmd := exec.Command("aws", cliArgs...)
+	syncCmd.Stdout, syncCmd.Stderr = os.Stdout, os.Stderr
+
+	runErr := syncCmd.Run()
+	if runErr == nil {
+		return nil
+	}
+	return fmt.Errorf("aws s3 sync failed: %w\n\nTo authenticate, run:\n  aws sso login\n\nOr configure AWS credentials with:\n  aws configure sso", runErr)
+}
--- a/uv.lock
+++ b/uv.lock
@@ -4711,7 +4711,7 @@ requires-dist = [
    { name = "numpy", marker = "extra == 'model-server'", specifier = "==2.4.1" },
    { name = "oauthlib", marker = "extra == 'backend'", specifier = "==3.2.2" },
    { name = "office365-rest-python-client", marker = "extra == 'backend'", specifier = "==2.5.9" },
-    { name = "onyx-devtools", marker = "extra == 'dev'", specifier = "==0.5.3" },
+    { name = "onyx-devtools", marker = "extra == 'dev'", specifier = "==0.5.6" },
    { name = "openai", specifier = "==2.14.0" },
    { name = "openapi-generator-cli", marker = "extra == 'dev'", specifier = "==7.17.0" },
    { name = "openinference-instrumentation", marker = "extra == 'backend'", specifier = "==0.1.42" },
@@ -4816,20 +4816,20 @@ requires-dist = [{ name = "onyx", extras = ["backend", "dev", "ee"], editable =

 [[package]]
 name = "onyx-devtools"
-version = "0.5.3"
+version = "0.5.6"
 source = { registry = "https://pypi.org/simple" }
 dependencies = [
    { name = "fastapi" },
    { name = "openapi-generator-cli" },
 ]
 wheels = [
-    { url = "https://files.pythonhosted.org/packages/85/39/87e770afccf123cd72ca8c58178bc08a9b04cb6198f265213012a6a71f21/onyx_devtools-0.5.3-py3-none-any.whl", hash = "sha256:6b61dff779a5839032fb282f8db62aa3d640c09fa0d7d2ed7f8a23fd38fa84df", size = 2894984, upload-time = "2026-02-11T23:05:50.739Z" },
-    { url = "https://files.pythonhosted.org/packages/ef/c5/9a7516398af4183f3247a668b710da344c002586e9be668cb690b8566d8a/onyx_devtools-0.5.3-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:268c57ffb08322bd9671d1b8444199607bc1eaf7e2c25300de98ba272c716c3e", size = 2913582, upload-time = "2026-02-11T23:05:33.582Z" },
-    { url = "https://files.pythonhosted.org/packages/70/58/86895464d02e2ae0a22a0bcc48cfd5e7cb647ee117a1a0620850f03e21e5/onyx_devtools-0.5.3-py3-none-macosx_11_0_arm64.whl", hash = "sha256:e440d14ecad26ea3c85ae00a95cc1731214de6c6c71b90b08ab3608d99ecdd58", size = 2717143, upload-time = "2026-02-11T23:05:32.673Z" },
-    { url = "https://files.pythonhosted.org/packages/10/95/c8ea6a27afde2c29b108a0988aa4f44963d7124bfe04322217c7003129b9/onyx_devtools-0.5.3-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:49136baf0427aa6a5dde57457e4c963d86be4cd59bb6d02837609dd470de6a6b", size = 2625948, upload-time = "2026-02-11T23:05:48.147Z" },
-    { url = "https://files.pythonhosted.org/packages/85/cc/aabfb4599ce42aac88bdb1082696e3dde0a34a7739df61035e77e01cbca3/onyx_devtools-0.5.3-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:2c327d258943f80b9860268fa69fde3a6f707d2aaed9362385cd6acd255d11cc", size = 2895001, upload-time = "2026-02-11T23:05:50.509Z" },
-    { url = "https://files.pythonhosted.org/packages/17/3c/d3af3a49464d15ebb0a8cf371169158bb99a14be859ac7468c73ecf055cd/onyx_devtools-0.5.3-py3-none-win_amd64.whl", hash = "sha256:fa5e7b779ede887f7c2e2da2442048cc9b626a9d8007b34c3b617e40dfd8d5bd", size = 2977738, upload-time = "2026-02-11T23:05:30.592Z" },
-    { url = "https://files.pythonhosted.org/packages/3f/27/8844e7c4ee06453b57be55644e572206b7b79e3685351f80afd8b7056327/onyx_devtools-0.5.3-py3-none-win_arm64.whl", hash = "sha256:2542fc3b1ee27d0695aef8e17819879a0eeaed10e2855e31145cbfa6267fcf6c", size = 2688564, upload-time = "2026-02-11T23:05:34.968Z" },
+    { url = "https://files.pythonhosted.org/packages/9e/2b/13ef8d1a2bea4aff37d2ad8bd11eb1037563bfbcf70a1983e69b7f019670/onyx_devtools-0.5.6-py3-none-any.whl", hash = "sha256:d2e19801265c7b3707880d40446e4cf08b9e8f19459722b84cff2ac46b4ae1ef", size = 3765683, upload-time = "2026-02-12T19:21:08.526Z" },
+    { url = "https://files.pythonhosted.org/packages/8e/c5/be5cb5c707b1cf954a7b18f85ba2b86018e8e5f372a78ba0d5bd6603137c/onyx_devtools-0.5.6-py3-none-macosx_10_12_x86_64.whl", hash = "sha256:1d329d46d4faa78bb49565f109fd2e232e4d3ec0d4b8108273f7468203ba1967", size = 3812726, upload-time = "2026-02-12T19:21:14.738Z" },
+    { url = "https://files.pythonhosted.org/packages/b6/b0/f8609e18886d2c65b22179f276f9e469b10c44c44e592e0d037dfafffc1d/onyx_devtools-0.5.6-py3-none-macosx_11_0_arm64.whl", hash = "sha256:27b34b6c1cc134525a22ea6cb91275edce5200eabae257e4aebd8d34a5927820", size = 3559212, upload-time = "2026-02-12T19:21:07.633Z" },
+    { url = "https://files.pythonhosted.org/packages/07/59/c53a8a4b99e14b69292558904e1cc38559e67859012c2cbad79af40f3c30/onyx_devtools-0.5.6-py3-none-manylinux_2_17_aarch64.whl", hash = "sha256:35eb1ef93287d8e38266cb39039fe98fdbb25fe0c2bde730cbf8ed0900fb140f", size = 3420474, upload-time = "2026-02-12T19:21:08.214Z" },
+    { url = "https://files.pythonhosted.org/packages/ea/83/a59669b21b4d76ad6b3aafa2023848d8ae3397b9f390699a8c977c8ac03e/onyx_devtools-0.5.6-py3-none-manylinux_2_17_x86_64.whl", hash = "sha256:af99e72568f813dd10bf19f37fa8fffba5bce900abbf03737d41c8d94acd8442", size = 3765694, upload-time = "2026-02-12T19:21:07.838Z" },
+    { url = "https://files.pythonhosted.org/packages/49/95/f5e272afe94299054a28996e2bb63e74957c5b4b65066ea6cc750a7dbeaf/onyx_devtools-0.5.6-py3-none-win_amd64.whl", hash = "sha256:96abd45800d0ddfad19b259e7c94f3fd741351b6c4e14c79798a781d3a6136bb", size = 3858988, upload-time = "2026-02-12T19:21:04.93Z" },
+    { url = "https://files.pythonhosted.org/packages/47/1a/aae4a7f900e08f53264ec0dc301a7ea2743e078913366e8f5b458d4d708b/onyx_devtools-0.5.6-py3-none-win_arm64.whl", hash = "sha256:ed1f824c67f5d2af07530ef017d99d09c2af05bc2734d7035d9b5b826de4c6ba", size = 3483383, upload-time = "2026-02-12T19:21:06.339Z" },
 ]

 [[package]]
--- a/web/.gitignore
+++ b/web/.gitignore
@@ -41,6 +41,7 @@ next-env.d.ts
 /user_auth.json
 /build-archive.log
 /test-results
+/output/

 # generated clients ... in particular, the API to the Onyx backend itself!
 /src/lib/generated
--- a/web/README.md
+++ b/web/README.md
@@ -61,35 +61,46 @@ Bring up the entire application.

 0. Install playwright dependencies

-```cd web
+```bash
 npx playwright install
 ```

 1. Run playwright

-```
-cd web
+```bash
 npx playwright test
 ```

 To run a single test:

-```
+```bash
 npx playwright test landing-page.spec.ts
 ```

 If running locally, interactive options can help you see exactly what is happening in
 the test.

-```
+```bash
 npx playwright test --ui
 npx playwright test --headed
 ```

 2. Inspect results
 
 By default, playwright.config.ts is configured to output the results to:
 
+```bash
+web/output/playwright/
 ```
-web/test-results
+
+3. Visual regression screenshots
+
+Screenshots are captured on every test run and saved under `web/output/screenshots/`.
+When `VISUAL_REGRESSION=true` is set, they are instead compared against the baselines in
+`web/tests/e2e/__screenshots__/`, which can be updated with `npx playwright test --update-snapshots`.
+
+To compare screenshots across CI runs, use:
+
+```bash
+ods screenshot-diff compare --project admin
 ```
--- a/web/playwright.config.ts
+++ b/web/playwright.config.ts
@@ -8,6 +8,12 @@ export default defineConfig({
  timeout: 100000, // 100 seconds timeout
  expect: {
    timeout: 15000, // 15 seconds timeout for all assertions to reduce flakiness
+    toHaveScreenshot: {
+      // Allow up to 1% of pixels to differ (accounts for anti-aliasing, subpixel rendering)
+      maxDiffPixelRatio: 0.01,
+      // Acceptable perceived color difference per pixel (YIQ space), from 0 (strict) to 1 (lax)
+      threshold: 0.2,
+    },
  },
  retries: process.env.CI ? 2 : 0, // Retry failed tests 2 times in CI, 0 locally

@@ -20,7 +26,7 @@ export default defineConfig({
  reporter: [["list"]],
  // Only run Playwright tests from tests/e2e directory (ignore Jest tests in src/)
  testMatch: /.*\/tests\/e2e\/.*\.spec\.ts/,
-  outputDir: "test-results",
+  outputDir: "output/playwright",
  use: {
    // Base URL for the application, can be overridden via BASE_URL environment variable
    baseURL: process.env.BASE_URL || "http://localhost:3000",
--- a/web/tests/e2e/admin_pages.spec.ts
+++ b/web/tests/e2e/admin_pages.spec.ts
@@ -0,0 +1,199 @@
+import { test, expect } from "@playwright/test";
+import type { Page } from "@playwright/test";
+import { expectScreenshot } from "./utils/visualRegression";
+
+test.use({ storageState: "admin_auth.json" });
+test.describe.configure({ mode: "parallel" });
+
+interface AdminPageSnapshot {
+  name: string; // human-readable label used in the test title
+  path: string; // route relative to /admin/; also used to build the screenshot name
+  pageTitle: string; // expected text of the [aria-label="admin-page-title"] element
+  options?: {
+    paragraphText?: string | RegExp; // expected text of the first p.text-sm element
+    buttonName?: string; // accessible name of a button that must appear exactly once
+    subHeaderText?: string; // not read by verifyAdminPageNavigation
+  };
+}
+// Admin pages under test; `path` is relative to /admin/ (see verifyAdminPageNavigation).
+const ADMIN_PAGES: AdminPageSnapshot[] = [
+  {
+    name: "Document Management - Explorer",
+    path: "documents/explorer",
+    pageTitle: "Document Explorer",
+  },
+  {
+    name: "Connectors - Add Connector",
+    path: "add-connector",
+    pageTitle: "Add Connector",
+  },
+  {
+    name: "Custom Assistants - Assistants",
+    path: "assistants",
+    pageTitle: "Assistants",
+    options: {
+      paragraphText:
+        "Assistants are a way to build custom search/question-answering experiences for different use cases.",
+    },
+  },
+  {
+    name: "Configuration - Document Processing",
+    path: "configuration/document-processing",
+    pageTitle: "Document Processing",
+  },
+  {
+    name: "Document Management - Document Sets",
+    path: "documents/sets",
+    pageTitle: "Document Sets",
+    options: {
+      paragraphText:
+        "Document Sets allow you to group logically connected documents into a single bundle. These can then be used as a filter when performing searches to control the scope of information Onyx searches over.",
+    },
+  },
+  {
+    name: "Custom Assistants - Slack Bots",
+    path: "bots",
+    pageTitle: "Slack Bots",
+    options: {
+      paragraphText:
+        "Setup Slack bots that connect to Onyx. Once setup, you will be able to ask questions to Onyx directly from Slack. Additionally, you can:",
+    },
+  },
+  {
+    name: "Custom Assistants - Standard Answers",
+    path: "standard-answer",
+    pageTitle: "Standard Answers",
+  },
+  {
+    name: "Performance - Usage Statistics",
+    path: "performance/usage",
+    pageTitle: "Usage Statistics",
+  },
+  {
+    name: "Document Management - Feedback",
+    path: "documents/feedback",
+    pageTitle: "Document Feedback",
+  },
+  {
+    name: "Configuration - LLM",
+    path: "configuration/llm",
+    pageTitle: "LLM Setup",
+  },
+  {
+    name: "Connectors - Existing Connectors",
+    path: "indexing/status",
+    pageTitle: "Existing Connectors",
+  },
+  {
+    name: "User Management - Groups",
+    path: "groups",
+    pageTitle: "Manage User Groups",
+  },
+  {
+    name: "Appearance & Theming",
+    path: "theme",
+    pageTitle: "Appearance & Theming",
+  },
+  {
+    name: "Configuration - Search Settings",
+    path: "configuration/search",
+    pageTitle: "Search Settings",
+  },
+  {
+    name: "Custom Assistants - MCP Actions",
+    path: "actions/mcp",
+    pageTitle: "MCP Actions",
+  },
+  {
+    name: "Custom Assistants - OpenAPI Actions",
+    path: "actions/open-api",
+    pageTitle: "OpenAPI Actions",
+  },
+  {
+    name: "User Management - Token Rate Limits",
+    path: "token-rate-limits",
+    pageTitle: "Token Rate Limits",
+    options: {
+      paragraphText:
+        "Token rate limits enable you control how many tokens can be spent in a given time period. With token rate limits, you can:",
+      buttonName: "Create a Token Rate Limit",
+    },
+  },
+];
+
+/**
+ * Navigates to /admin/<path> and asserts that the page rendered as expected.
+ *
+ * Always checks the admin-page-title text; when given, also checks the first
+ * "p.text-sm" paragraph and that exactly one button is named `buttonName`.
+ * `options.subHeaderText` is accepted but not asserted here.
+ * On a title mismatch the page HTML is logged before the error is rethrown.
+ */
+async function verifyAdminPageNavigation(
+  page: Page,
+  path: string,
+  pageTitle: string,
+  options?: AdminPageSnapshot["options"]
+) {
+  await page.goto(`/admin/${path}`);
+
+  const title = page.locator('[aria-label="admin-page-title"]');
+  try {
+    await expect(title).toHaveText(pageTitle, { timeout: 10000 });
+  } catch (error) {
+    console.error(
+      `Failed to find admin-page title with text "${pageTitle}" for path "${path}"`
+    );
+    // NOTE: Temporary debugging aid; it must never mask the original error.
+    console.error(await page.content().catch(() => "<content unavailable>"));
+    throw error;
+  }
+
+  if (options?.paragraphText) {
+    await expect(page.locator("p.text-sm").nth(0)).toHaveText(
+      options.paragraphText
+    );
+  }
+
+  if (options?.buttonName) {
+    await expect(
+      page.getByRole("button", { name: options.buttonName })
+    ).toHaveCount(1);
+  }
+}
+
+// Color themes under test; every admin page is visited once per theme.
+const THEMES = ["light", "dark"] as const;
+
+THEMES.forEach((theme) => {
+  test.describe(`Admin pages (${theme} mode)`, () => {
+    // Seed localStorage before every page load so that next-themes sees the
+    // requested theme on its very first render.
+    test.beforeEach(async ({ page }) => {
+      await page.addInitScript((value: string) => {
+        localStorage.setItem("theme", value);
+      }, theme);
+    });
+
+    ADMIN_PAGES.forEach((snapshot) => {
+      test(`Admin - ${snapshot.name}`, async ({ page }) => {
+        await verifyAdminPageNavigation(
+          page,
+          snapshot.path,
+          snapshot.pageTitle,
+          snapshot.options
+        );
+
+        // Let outstanding network requests settle before capturing.
+        await page.waitForLoadState("networkidle");
+
+        // Screenshot for visual regression review. Slashes in the route are
+        // flattened to dashes, and the theme is part of the name so light and
+        // dark baselines never collide.
+        const slug = snapshot.path.split("/").join("-");
+        const screenshotName = `admin-${theme}-${slug}`;
+        await expectScreenshot(page, { name: screenshotName });
+      });
+    });
+  });
+});
--- a/web/tests/e2e/utils/visualRegression.ts
+++ b/web/tests/e2e/utils/visualRegression.ts
@@ -0,0 +1,124 @@
+import type { Page, PageScreenshotOptions } from "@playwright/test";
+import { expect } from "@playwright/test";
+
+/**
+ * Whether visual regression assertions are enabled.
+ *
+ * When `VISUAL_REGRESSION` is set to `true` (compared case-insensitively),
+ * `expectScreenshot()` calls `toHaveScreenshot()`, which fails the test if the
+ * screenshot differs from the stored baseline.
+ *
+ * When disabled (the default), named screenshots are still captured and saved,
+ * but nothing is compared against a baseline — this lets CI collect screenshots
+ * for later review without gating on them.
+ */
+const VISUAL_REGRESSION_ENABLED =
+  process.env.VISUAL_REGRESSION?.toLowerCase() === "true";
+
+/**
+ * Selectors that `expectScreenshot()` masks by default so that dynamic content
+ * (timestamps, avatars, etc.) doesn't cause spurious diffs. Currently empty.
+ */
+const DEFAULT_MASK_SELECTORS: string[] = [
+  // Add selectors for dynamic content that should be masked, e.g.:
+  // '[data-testid="timestamp"]',
+  // '[data-testid="user-avatar"]',
+];
+
+interface ScreenshotOptions {
+  /**
+   * Name for the screenshot file (without the `.png` extension). If omitted,
+   * assert mode lets Playwright auto-name it; capture-only mode saves no file.
+   */
+  name?: string;
+
+  /**
+   * Additional CSS selectors to mask (on top of the defaults).
+   * Masked areas are replaced with a pink box so they don't cause diffs.
+   */
+  mask?: string[];
+
+  /**
+   * If true, capture the full scrollable page instead of just the viewport.
+   * Defaults to false.
+   */
+  fullPage?: boolean;
+
+  /**
+   * Override the max diff pixel ratio for this screenshot (assert mode only).
+   */
+  maxDiffPixelRatio?: number;
+
+  /**
+   * Override the comparison threshold for this screenshot (assert mode only).
+   */
+  threshold?: number;
+
+  /**
+   * Additional Playwright screenshot options (used in capture-only mode only).
+   */
+  screenshotOptions?: PageScreenshotOptions;
+}
+
+/**
+ * Take a screenshot and optionally assert it matches the stored baseline.
+ * Masks (the defaults plus `options.mask`) are applied in both modes.
+ *
+ * `VISUAL_REGRESSION=true` → assert via `toHaveScreenshot()` (fails on diff);
+ * otherwise → save to `output/screenshots/<name>.png` for review only.
+ *
+ * Usage:
+ * ```ts
+ * import { expectScreenshot } from "@tests/e2e/utils/visualRegression";
+ *
+ * test("admin page looks right", async ({ page }) => {
+ *   await page.goto("/admin/settings");
+ *   await expectScreenshot(page, { name: "admin-settings" });
+ * });
+ * ```
+ */
+export async function expectScreenshot(
+  page: Page,
+  options: ScreenshotOptions = {}
+): Promise<void> {
+  const {
+    name,
+    mask = [],
+    fullPage = false,
+    maxDiffPixelRatio,
+    threshold,
+  } = options;
+
+  // Combine default masks with per-call masks
+  const allMaskSelectors = [...DEFAULT_MASK_SELECTORS, ...mask];
+  const maskLocators = allMaskSelectors.map((selector) =>
+    page.locator(selector)
+  );
+
+  // Build the screenshot name array (Playwright expects string[])
+  const nameArg = name ? [name + ".png"] : undefined;
+
+  if (VISUAL_REGRESSION_ENABLED) {
+    // Assert mode — fail the test if the screenshot differs from baseline
+    const screenshotOpts = {
+      fullPage,
+      mask: maskLocators.length > 0 ? maskLocators : undefined,
+      ...(maxDiffPixelRatio !== undefined && { maxDiffPixelRatio }),
+      ...(threshold !== undefined && { threshold }),
+    };
+
+    if (nameArg) {
+      await expect(page).toHaveScreenshot(nameArg, screenshotOpts);
+    } else {
+      await expect(page).toHaveScreenshot(screenshotOpts);
+    }
+  } else {
+    // Capture-only mode — save the (masked) screenshot without asserting
+    await page.screenshot({
+      path: name ? `output/screenshots/${name}.png` : undefined,
+      fullPage,
+      ...(maskLocators.length > 0 && { mask: maskLocators }),
+      ...options.screenshotOptions,
+    });
+  }
+}