From ec32a84bb922727637adbc0e9365e3a96866f4ad Mon Sep 17 00:00:00 2001
From: abose <arun@core.ai>
Date: Fri, 20 Feb 2026 15:56:05 +0530
Subject: [PATCH] feat: add MCP test runner support for run_tests and
 get_test_results

Adds MCP tools to control the Phoenix test runner remotely: run test
suites by category/spec and poll structured results. Includes WS
protocol handlers, test-runner-side MCP script, and updated CLAUDE.md
with accurate suite naming guidance.
---
 CLAUDE.md                                   |  45 +++++
 phoenix-builder-mcp/mcp-tools.js            |  75 ++++++++
 phoenix-builder-mcp/ws-control-server.js    |  98 ++++++++++
 src/phoenix-builder/phoenix-builder-boot.js |   3 +-
 test/SpecRunner.html                        |   4 +
 test/SpecRunner.js                          |   1 +
 test/phoenix-test-runner-mcp.js             | 188 ++++++++++++++++++++
 7 files changed, 413 insertions(+), 1 deletion(-)
 create mode 100644 test/phoenix-test-runner-mcp.js

diff --git a/CLAUDE.md b/CLAUDE.md
index 224ead6eb..34cddb4b6 100644
--- a/CLAUDE.md
+++ b/CLAUDE.md
@@ -24,3 +24,48 @@ Use `exec_js` to run JS in the Phoenix browser runtime. jQuery `$()` is global.
 **Click AI chat buttons:** `$('.ai-edit-restore-btn:contains("Undo")').click();`
 
 **Check logs:** `get_browser_console_logs` with `filter` regex (e.g. `"AI UI"`, `"error"`) and `tail` — includes both browser console and Node.js (PhNode) logs. Use `get_terminal_logs` for Electron process output (only available if Phoenix was launched via `start_phoenix`).
+
+## Running Tests via MCP
+
+The test runner must be open as a separate Phoenix instance (it shows up as `phoenix-test-runner-*` in `get_phoenix_status`). Use `run_tests` to trigger test runs and `get_test_results` to poll for results. `take_screenshot` also works on the test runner.
+
+### Test categories
+- **unit** — Fast, no UI. Safe to run all at once (`run_tests category=unit`).
+- **integration** — Spawns a Phoenix iframe inside the test runner. Some specs require window focus and will hang if the test runner window isn't focused.
+- **LegacyInteg** — Like integration but uses the legacy test harness. Also spawns an embedded Phoenix instance.
+- **livepreview**, **mainview** — Specialized integration tests.
+- **Do NOT use:** `all`, `performance`, `extension`, `individualrun` — not actively supported.
+
+### Hierarchy: Category → Suite → Test
+- **Category** — top-level grouping: `unit`, `integration`, `LegacyInteg`, etc. Safe to run an entire category.
+- **Suite** — a group of related tests within a category (e.g. `integration: FileFilters` has ~20 tests). The suite's name is the value you pass as the `spec` parameter.
+- **Test** — a single test within a suite.
+
+### Running all tests in a category
+```
+run_tests(category="unit")
+```
+
+### Running a single suite
+Pass the exact suite name as the `spec` parameter. **Suite names do NOT always have a category prefix.** Many suites are registered with just their plain name (e.g. `"CSS Parsing"`, `"Editor"`, `"JSUtils"`), while others include a prefix (e.g. `"unit:Phoenix Platform Tests"`, `"integration: FileFilters"`, `"LegacyInteg:ExtensionLoader"`). If the suite name is wrong, the test runner will show a blank page with 0 specs and appear stuck.
+
+**To discover the exact suite name**, run this in `exec_js` on the test runner instance:
+```js
+return jasmine.getEnv().topSuite().children.map(s => s.description);
+```
+
+Examples:
+```
+run_tests(category="unit", spec="CSS Parsing")
+run_tests(category="unit", spec="unit:Phoenix Platform Tests")
+run_tests(category="integration", spec="integration: FileFilters")
+run_tests(category="LegacyInteg", spec="LegacyInteg:ExtensionLoader")
+```
+
+### Running individual tests
+You can pass a specific test's full name as `spec` to run just that one test; running a single test is perfectly valid. However, if a test fails when run on its own, re-run its full suite to confirm — suites sometimes execute tests in order with shared state, so a test may fail in isolation yet pass within its suite. If the test passes as part of the full suite, treat the isolated failure as an ordering artifact rather than a real failure.
+
+### Gotchas
+- **Instance name changes on reload:** The test runner gets a new random instance name each time the page reloads. Always check `get_phoenix_status` after a `run_tests` call to get the current instance name.
+- **Integration tests may hang:** Specs labeled "needs window focus" will hang indefinitely if the test runner doesn't have OS-level window focus. If `get_test_results` starts timing out, the event loop is likely blocked by a stuck spec — use `force_reload_phoenix` to recover.
+- **LegacyInteg/integration tests spawn an iframe:** These tests open an embedded Phoenix instance inside the test runner, so they are slower and more resource-intensive than unit tests.
diff --git a/phoenix-builder-mcp/mcp-tools.js b/phoenix-builder-mcp/mcp-tools.js
index 7c466d3dd..e63128a01 100644
--- a/phoenix-builder-mcp/mcp-tools.js
+++ b/phoenix-builder-mcp/mcp-tools.js
@@ -383,6 +383,81 @@ export function registerTools(server, processManager, wsControlServer, phoenixDe
         }
     );
 
+    server.tool( // registers the run_tests MCP tool: name, description, parameter schema, handler
+        "run_tests",
+        "Run tests in the Phoenix test runner (SpecRunner.html). Reloads the test runner with the specified " +
+        "category and optional spec filter. The test runner must already be open in a browser with MCP enabled. " +
+        "Supported categories: unit, integration, LegacyInteg, livepreview, mainview. " +
+        "WARNING: Do NOT use 'all', 'performance', 'extension', or 'individualrun' categories — they are " +
+        "not actively supported and the full 'all' suite should never be run. " +
+        "To run all tests in a category, omit the spec parameter. " +
+        "To run a single suite, pass its exact registered name as spec (e.g. spec='CSS Parsing'). " +
+        "Suite names do NOT always have a category prefix: some are plain (e.g. 'Editor'), others are prefixed (e.g. 'integration: FileFilters'); a wrong name runs 0 specs and the runner appears stuck. " +
+        "You can also run individual specs by passing the full spec name, but note that individual specs " +
+        "may fail when run alone because suites often run tests in order with shared state — prefer " +
+        "running the full suite instead of individual specs. " +
+        "After calling run_tests, use get_test_results to poll for results.",
+        {
+            category: z.string().describe("Test category to run: unit, integration, LegacyInteg, livepreview, or mainview."),
+            spec: z.string().optional().describe("Optional suite or spec name to run within the category. " +
+                "Use the exact registered name, which may or may not include a category prefix (e.g. 'CSS Parsing', 'unit:Phoenix Platform Tests'). " +
+                "Prefer running full suites over individual specs, as specs may depend on suite execution order. " +
+                "Omit to run all tests in the category."),
+            instance: z.string().optional().describe("Target a specific test runner instance by name. Required when multiple instances are connected.")
+        },
+        async ({ category, spec, instance }) => { // never throws: failures are returned as a JSON { error } payload
+            try {
+                const result = await wsControlServer.requestRunTests(category, spec, instance);
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({
+                            success: true,
+                            message: result.message || "Test runner is reloading with category=" + category // fallback text when the response carries no message
+                        })
+                    }]
+                };
+            } catch (err) { // rejection from requestRunTests (lookup failure, disconnected client, timeout, failed response)
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({ error: err.message })
+                    }]
+                };
+            }
+        }
+    );
+
+    server.tool( // registers the get_test_results MCP tool: name, description, parameter schema, handler
+        "get_test_results",
+        "Get structured test results from the Phoenix test runner. Returns running status, pass/fail counts, " +
+        "failure details, and the currently executing spec. The test runner must already be open with MCP enabled.",
+        {
+            instance: z.string().optional().describe("Target a specific test runner instance by name. Required when multiple instances are connected.")
+        },
+        async ({ instance }) => { // never throws: failures are returned as a JSON { error } payload
+            try {
+                const result = await wsControlServer.requestTestResults(instance);
+                // Strip the WS envelope fields (type, id) so only the test data is serialized below
+                delete result.type;
+                delete result.id;
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify(result, null, 2) // pretty-printed with 2-space indentation
+                    }]
+                };
+            } catch (err) { // rejection from requestTestResults (lookup failure, disconnected client, timeout)
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({ error: err.message })
+                    }]
+                };
+            }
+        }
+    );
+
     server.tool(
         "get_phoenix_status",
         "Check the status of the Phoenix process and WebSocket connection.",
diff --git a/phoenix-builder-mcp/ws-control-server.js b/phoenix-builder-mcp/ws-control-server.js
index 452a2ac86..1ed071a63 100644
--- a/phoenix-builder-mcp/ws-control-server.js
+++ b/phoenix-builder-mcp/ws-control-server.js
@@ -109,6 +109,28 @@ export function createWSControlServer(port) {
                     break;
                 }
 
+                case "run_tests_response": {
+                    const pendingRt = pendingRequests.get(msg.id);
+                    if (pendingRt) {
+                        pendingRequests.delete(msg.id);
+                        if (msg.success) {
+                            pendingRt.resolve({ success: true, message: msg.message });
+                        } else {
+                            pendingRt.reject(new Error(msg.message || "run_tests failed"));
+                        }
+                    }
+                    break;
+                }
+
+                case "get_test_results_response": {
+                    const pendingTr = pendingRequests.get(msg.id);
+                    if (pendingTr) {
+                        pendingRequests.delete(msg.id);
+                        pendingTr.resolve(msg);
+                    }
+                    break;
+                }
+
                 case "reload_response": {
                     const pending3 = pendingRequests.get(msg.id);
                     if (pending3) {
@@ -390,6 +412,80 @@ export function createWSControlServer(port) {
         });
     }
 
+    function requestRunTests(category, spec, instanceName) { // sends a run_tests_request to one client; returns a Promise settled via pendingRequests or the 30s timeout
+        return new Promise((resolve, reject) => {
+            const resolved = _resolveClient(instanceName);
+            if (resolved.error) { // client lookup failed: reject with the lookup's error text
+                reject(new Error(resolved.error));
+                return;
+            }
+
+            const { client } = resolved;
+            if (client.ws.readyState !== 1) { // 1 is the WebSocket OPEN state
+                reject(new Error("Phoenix client \"" + resolved.name + "\" is not connected"));
+                return;
+            }
+
+            const id = ++requestIdCounter; // keys the pendingRequests entry and is sent along with the request
+            const timeout = setTimeout(() => { // after 30s, drop the pending entry and reject
+                pendingRequests.delete(id);
+                reject(new Error("run_tests request timed out (30s)"));
+            }, 30000);
+
+            pendingRequests.set(id, { // both wrappers cancel the timeout before settling the promise
+                resolve: (data) => {
+                    clearTimeout(timeout);
+                    resolve(data);
+                },
+                reject: (err) => {
+                    clearTimeout(timeout);
+                    reject(err);
+                }
+            });
+
+            const msg = { type: "run_tests_request", id, category };
+            if (spec) { // spec is optional; the field is left out of the message when falsy
+                msg.spec = spec;
+            }
+            client.ws.send(JSON.stringify(msg));
+        });
+    }
+
+    function requestTestResults(instanceName) { // sends a get_test_results_request to one client; returns a Promise settled via pendingRequests or the 30s timeout
+        return new Promise((resolve, reject) => {
+            const resolved = _resolveClient(instanceName);
+            if (resolved.error) { // client lookup failed: reject with the lookup's error text
+                reject(new Error(resolved.error));
+                return;
+            }
+
+            const { client } = resolved;
+            if (client.ws.readyState !== 1) { // 1 is the WebSocket OPEN state
+                reject(new Error("Phoenix client \"" + resolved.name + "\" is not connected"));
+                return;
+            }
+
+            const id = ++requestIdCounter; // keys the pendingRequests entry and is sent along with the request
+            const timeout = setTimeout(() => { // after 30s, drop the pending entry and reject
+                pendingRequests.delete(id);
+                reject(new Error("get_test_results request timed out (30s)"));
+            }, 30000);
+
+            pendingRequests.set(id, { // both wrappers cancel the timeout before settling the promise
+                resolve: (data) => {
+                    clearTimeout(timeout);
+                    resolve(data);
+                },
+                reject: (err) => {
+                    clearTimeout(timeout);
+                    reject(err);
+                }
+            });
+
+            client.ws.send(JSON.stringify({ type: "get_test_results_request", id })); // the request carries only its type and id
+        });
+    }
+
     function getBrowserLogs(sinceLast, instanceName) {
         const resolved = _resolveClient(instanceName);
         if (resolved.error) {
@@ -442,6 +538,8 @@ export function createWSControlServer(port) {
         requestLogs,
         requestExecJs,
         requestExecJsLivePreview,
+        requestRunTests,
+        requestTestResults,
         getBrowserLogs,
         clearBrowserLogs,
         isClientConnected,
diff --git a/src/phoenix-builder/phoenix-builder-boot.js b/src/phoenix-builder/phoenix-builder-boot.js
index c577720ec..94116ec1f 100644
--- a/src/phoenix-builder/phoenix-builder-boot.js
+++ b/src/phoenix-builder/phoenix-builder-boot.js
@@ -90,7 +90,8 @@
         let name = sessionStorage.getItem(INSTANCE_NAME_KEY);
         if (!name) {
             const hex = Math.floor(Math.random() * 0x10000).toString(16).padStart(4, "0");
-            name = "phoenix-" + _getPlatformTag() + "-" + hex;
+            const prefix = window._phoenixBuilderNamePrefix || "phoenix";
+            name = prefix + "-" + _getPlatformTag() + "-" + hex;
             sessionStorage.setItem(INSTANCE_NAME_KEY, name);
         }
         return name;
diff --git a/test/SpecRunner.html b/test/SpecRunner.html
index 1665f48c2..095bcb4e5 100644
--- a/test/SpecRunner.html
+++ b/test/SpecRunner.html
@@ -394,6 +394,10 @@
     }());
   </script>
 
+  <script>window._phoenixBuilderNamePrefix = "phoenix-test-runner";</script>
+  <script src="../src/phoenix-builder/phoenix-builder-boot.js"></script>
+  <script src="phoenix-test-runner-mcp.js"></script>
+
   <script src="../src/phoenix/shell.js" type="module"></script>
   <script src="virtual-server-loader.js" type="module"></script>
   <script src="../src/node-loader.js" defer></script>
diff --git a/test/SpecRunner.js b/test/SpecRunner.js
index 0c17e0ace..99bb6e82d 100644
--- a/test/SpecRunner.js
+++ b/test/SpecRunner.js
@@ -484,6 +484,7 @@ define(function (require, exports, module) {
             // Create the reporter, which is really a model class that just gathers
             // spec and performance data.
             reporter = new UnitTestReporter(jasmineEnv, params.get("spec"), selectedCategories);
+            window._unitTestReporter = reporter;
             SpecRunnerUtils.setUnitTestReporter(reporter);
 
             // Optionally emit JUnit XML file for automated runs
diff --git a/test/phoenix-test-runner-mcp.js b/test/phoenix-test-runner-mcp.js
new file mode 100644
index 000000000..1284a49f4
--- /dev/null
+++ b/test/phoenix-test-runner-mcp.js
@@ -0,0 +1,188 @@
+/*
+ * GNU AGPL-3.0 License
+ *
+ * Copyright (c) 2021 - present core.ai . All rights reserved.
+ *
+ * This program is free software: you can redistribute it and/or modify it
+ * under the terms of the GNU Affero General Public License as published by
+ * the Free Software Foundation, either version 3 of the License, or
+ * (at your option) any later version.
+ *
+ * This program is distributed in the hope that it will be useful, but WITHOUT
+ * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or
+ * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License
+ * for more details.
+ *
+ * You should have received a copy of the GNU Affero General Public License
+ * along with this program. If not, see https://opensource.org/licenses/AGPL-3.0.
+ *
+ */
+
+// Test-runner-specific MCP WebSocket handlers.
+// Loaded as a plain script (non-AMD) in SpecRunner.html after phoenix-builder-boot.js.
+// Registers handlers for run_tests_request and get_test_results_request.
+
+(function () {
+
+    var builder = window._phoenixBuilder;
+    if (!builder) {
+        // MCP not enabled — nothing to do
+        return;
+    }
+
+    // --- screenshot_request ---
+    // Handles screenshot capture in the test runner window; replies with "screenshot_response" (base64 data) or "error", echoing msg.id.
+    // Reuses Phoenix.app.screenShotBinary which works on Tauri, Electron, and browser (with extension).
+    builder.registerHandler("screenshot_request", function (msg) {
+        if (typeof Phoenix === "undefined" || !Phoenix || !Phoenix.app || !Phoenix.app.screenShotBinary) { // typeof guard: a bare `!Phoenix` throws ReferenceError while the global is still undeclared
+            builder.sendMessage({
+                type: "error",
+                id: msg.id,
+                message: "Screenshot API not available"
+            });
+            return;
+        }
+
+        Phoenix.app.screenShotBinary(msg.selector || undefined) // a falsy selector is passed on as undefined
+            .then(function (bytes) {
+                var binary = "";
+                var chunkSize = 8192; // convert in chunks — presumably to keep apply() under the engine's argument-count limit
+                for (var i = 0; i < bytes.length; i += chunkSize) {
+                    var chunk = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
+                    binary += String.fromCharCode.apply(null, chunk);
+                }
+                var base64 = btoa(binary); // btoa takes the binary string built above, one char per byte
+                builder.sendMessage({
+                    type: "screenshot_response",
+                    id: msg.id,
+                    data: base64
+                });
+            })
+            .catch(function (err) {
+                builder.sendMessage({
+                    type: "error",
+                    id: msg.id,
+                    message: (err && err.message) || "Screenshot failed" // tolerate a null/undefined rejection reason so a reply is always sent
+                });
+            });
+    });
+
+    // --- run_tests_request ---
+    // Acknowledges the request, then navigates SpecRunner to its own URL with new category and spec params.
+    builder.registerHandler("run_tests_request", function (msg) {
+        var category = msg.category || "unit"; // falls back to "unit" when the message has no category
+        var spec = msg.spec || null;
+
+        // Respond before reloading so the MCP server gets the ack
+        builder.sendMessage({
+            type: "run_tests_response",
+            id: msg.id,
+            success: true,
+            message: "Reloading test runner with category=" + category + (spec ? ", spec=" + spec : "")
+        });
+
+        // Build the new URL and reload.
+        // Construct the query string manually with encodeURIComponent so spaces
+        // become %20 (not +). The SpecRunner UrlParams parser uses decodeURIComponent
+        // which only decodes %20, not +.
+        var base = window.location.href.split("?")[0]; // current URL up to (not including) the first "?"
+        var qs = "category=" + encodeURIComponent(category) +
+            "&spec=" + encodeURIComponent(spec || "all"); // spec=all is sent when no spec was requested
+
+        setTimeout(function () { // navigate after 100ms — presumably so the ack above can go out first
+            window.location.href = base + "?" + qs;
+        }, 100);
+    });
+
+    // --- get_test_results_request ---
+    // Replies with the snapshot built by _gatherTestResults(), tagged with the response type and the request id.
+    builder.registerHandler("get_test_results_request", function (msg) {
+        var results = _gatherTestResults();
+        results.type = "get_test_results_response"; // envelope fields are added directly onto the results object
+        results.id = msg.id; // echo the request id back with the response
+        builder.sendMessage(results);
+    });
+
+    function _gatherTestResults() { // builds a plain-object snapshot of test state from window globals, the reporter and the DOM
+        var testResults = window.testResults || {};
+        var completed = !!window.playWrightRunComplete; // a truthy global marks the run as complete
+
+        // Read the reporter from the window._unitTestReporter global, if it has been set.
+        // When it is missing, the counts stay 0, failures stay empty, and `passed` falls back
+        // to window.testResults.passed.
+        var reporter = window._unitTestReporter || null;
+
+        var totalSpecCount = 0;
+        var totalPassedCount = 0;
+        var totalFailedCount = 0;
+        var activeSpecCompleteCount = 0;
+        var currentSpec = "";
+        var activeSuite = null;
+        var categories = [];
+        var running = false;
+        var passed = !!testResults.passed; // overwritten below from reporter.passed when a reporter exists
+        var failures = [];
+
+        if (reporter) {
+            totalSpecCount = reporter.totalSpecCount || 0;
+            totalPassedCount = reporter.totalPassedCount || 0;
+            totalFailedCount = reporter.totalFailedCount || 0;
+            activeSpecCompleteCount = reporter.activeSpecCompleteCount || 0;
+            activeSuite = reporter.activeSuite || null;
+            categories = reporter.selectedCategories || [];
+            passed = !!reporter.passed;
+
+            // If tests started but haven't completed, they're running
+            running = !!activeSuite && !completed;
+
+            // Current spec from the info element
+            var infoEl = document.querySelector(".alert-info");
+            if (infoEl && infoEl.textContent && infoEl.textContent.indexOf("Running ") === 0) { // only when the text starts with "Running "
+                currentSpec = infoEl.textContent.substring("Running ".length); // keep what follows the "Running " prefix
+            }
+
+            // Gather failures from reporter suites
+            var suiteNames = Object.keys(reporter.suites || {});
+            for (var i = 0; i < suiteNames.length; i++) {
+                var suite = reporter.suites[suiteNames[i]];
+                if (suite && suite.specs) {
+                    for (var j = 0; j < suite.specs.length; j++) {
+                        var spec = suite.specs[j];
+                        if (spec && !spec.passed) { // every spec whose `passed` is falsy is reported as a failure
+                            var msgs = [];
+                            if (spec.messages && spec.messages.length) {
+                                for (var k = 0; k < spec.messages.length; k++) {
+                                    var m = spec.messages[k];
+                                    msgs.push(m.message || String(m)); // use the entry's message text, else its string form
+                                }
+                            }
+                            failures.push({
+                                suite: suite.name,
+                                spec: spec.name,
+                                messages: msgs
+                            });
+                        }
+                    }
+                }
+            }
+        } else {
+            // No reporter yet — tests haven't loaded
+            running = false;
+        }
+
+        return {
+            running: running,
+            completed: completed,
+            passed: passed,
+            totalSpecCount: totalSpecCount,
+            totalPassedCount: totalPassedCount,
+            totalFailedCount: totalFailedCount,
+            activeSpecCompleteCount: activeSpecCompleteCount,
+            failures: failures,
+            currentSpec: currentSpec,
+            categories: categories,
+            activeSuite: activeSuite
+        };
+    }
+
+}());