From ec32a84bb922727637adbc0e9365e3a96866f4ad Mon Sep 17 00:00:00 2001 From: abose Date: Fri, 20 Feb 2026 15:56:05 +0530 Subject: [PATCH] feat: add MCP test runner support for run_tests and get_test_results Adds MCP tools to control the Phoenix test runner remotely: run test suites by category/spec and poll structured results. Includes WS protocol handlers, test-runner-side MCP script, and updated CLAUDE.md with accurate suite naming guidance. --- CLAUDE.md | 45 +++++ phoenix-builder-mcp/mcp-tools.js | 75 ++++++++ phoenix-builder-mcp/ws-control-server.js | 98 ++++++++++ src/phoenix-builder/phoenix-builder-boot.js | 3 +- test/SpecRunner.html | 4 + test/SpecRunner.js | 1 + test/phoenix-test-runner-mcp.js | 188 ++++++++++++++++++++ 7 files changed, 413 insertions(+), 1 deletion(-) create mode 100644 test/phoenix-test-runner-mcp.js diff --git a/CLAUDE.md b/CLAUDE.md index 224ead6eb..34cddb4b6 100644 --- a/CLAUDE.md +++ b/CLAUDE.md @@ -24,3 +24,48 @@ Use `exec_js` to run JS in the Phoenix browser runtime. jQuery `$()` is global. **Click AI chat buttons:** `$('.ai-edit-restore-btn:contains("Undo")').click();` **Check logs:** `get_browser_console_logs` with `filter` regex (e.g. `"AI UI"`, `"error"`) and `tail` — includes both browser console and Node.js (PhNode) logs. Use `get_terminal_logs` for Electron process output (only available if Phoenix was launched via `start_phoenix`). + +## Running Tests via MCP + +The test runner must be open as a separate Phoenix instance (it shows up as `phoenix-test-runner-*` in `get_phoenix_status`). Use `run_tests` to trigger test runs and `get_test_results` to poll for results. `take_screenshot` also works on the test runner. + +### Test categories +- **unit** — Fast, no UI. Safe to run all at once (`run_tests category=unit`). +- **integration** — Spawns a Phoenix iframe inside the test runner. Some specs require window focus and will hang if the test runner window isn't focused. 
+- **LegacyInteg** — Like integration but uses the legacy test harness. Also spawns an embedded Phoenix instance. +- **livepreview**, **mainview** — Specialized integration tests. +- **Do NOT use:** `all`, `performance`, `extension`, `individualrun` — not actively supported. + +### Hierarchy: Category → Suite → Test +- **Category** — top-level grouping: `unit`, `integration`, `LegacyInteg`, etc. Safe to run an entire category. +- **Suite** — a group of related tests within a category (e.g. `integration: FileFilters` has ~20 tests). This is the `spec` parameter value. +- **Test** — a single test within a suite. + +### Running all tests in a category +``` +run_tests(category="unit") +``` + +### Running a single suite +Pass the exact suite name as the `spec` parameter. **Suite names do NOT always have a category prefix.** Many suites are registered with just their plain name (e.g. `"CSS Parsing"`, `"Editor"`, `"JSUtils"`), while others include a prefix (e.g. `"unit:Phoenix Platform Tests"`, `"integration: FileFilters"`, `"LegacyInteg:ExtensionLoader"`). If the suite name is wrong, the test runner will show a blank page with 0 specs and appear stuck. + +**To discover the exact suite name**, run this in `exec_js` on the test runner instance: +```js +return jasmine.getEnv().topSuite().children.map(s => s.description); +``` + +Examples: +``` +run_tests(category="unit", spec="CSS Parsing") +run_tests(category="unit", spec="unit:Phoenix Platform Tests") +run_tests(category="integration", spec="integration: FileFilters") +run_tests(category="LegacyInteg", spec="LegacyInteg:ExtensionLoader") +``` + +### Running individual tests +You can pass a specific test's full name as `spec` to run just that one test. It is perfectly valid to run a single test. However, if a single test fails, re-run the full suite to confirm — suites sometimes execute tests in order with shared state, so an individual test may fail in isolation but pass within its suite. 
If the suite passes, the test is valid.
+
+### Gotchas
+- **Instance name changes on reload:** The test runner gets a new random instance name each time the page reloads. Always check `get_phoenix_status` after a `run_tests` call to get the current instance name.
+- **Integration tests may hang:** Specs labeled "needs window focus" will hang indefinitely if the test runner doesn't have OS-level window focus. If `get_test_results` starts timing out, the event loop is likely blocked by a stuck spec — use `force_reload_phoenix` to recover.
+- **LegacyInteg/integration tests spawn an iframe:** These tests open an embedded Phoenix instance inside the test runner, so they are slower and more resource-intensive than unit tests.
diff --git a/phoenix-builder-mcp/mcp-tools.js b/phoenix-builder-mcp/mcp-tools.js
index 7c466d3dd..e63128a01 100644
--- a/phoenix-builder-mcp/mcp-tools.js
+++ b/phoenix-builder-mcp/mcp-tools.js
@@ -383,6 +383,81 @@ export function registerTools(server, processManager, wsControlServer, phoenixDe
         }
     );

+    server.tool(
+        "run_tests",
+        "Run tests in the Phoenix test runner (SpecRunner.html). Reloads the test runner with the specified " +
+        "category and optional spec filter. The test runner must already be open in a browser with MCP enabled. " +
+        "Supported categories: unit, integration, LegacyInteg, livepreview, mainview. " +
+        "WARNING: Do NOT use 'all', 'performance', 'extension', or 'individualrun' categories — they are " +
+        "not actively supported and the full 'all' suite should never be run. " +
+        "To run all tests in a category, omit the spec parameter. " +
+        "To run a single suite, pass its exact registered name as spec, e.g. spec='CSS Parsing' or " +
+        "spec='integration: FileFilters'. Not every suite name has a category prefix; a wrong name matches 0 specs. " +
+        "You can also run individual specs by passing the full spec name, but note that individual specs " +
+        "may fail when run alone because suites often run tests in order with shared state — prefer " +
+        "running the full suite instead of individual specs. " +
+        "After calling run_tests, use get_test_results to poll for results.",
+        {
+            category: z.string().describe("Test category to run: unit, integration, LegacyInteg, livepreview, or mainview."),
+            spec: z.string().optional().describe("Optional suite or spec name to run within the category. " +
+                "Use the exact registered name, e.g. 'CSS Parsing' or 'integration: FileFilters' (a prefix is not always present). " +
+                "Prefer running full suites over individual specs, as specs may depend on suite execution order. " +
+                "Omit to run all tests in the category."),
+            instance: z.string().optional().describe("Target a specific test runner instance by name. Required when multiple instances are connected.")
+        },
+        async ({ category, spec, instance }) => {
+            try {
+                const result = await wsControlServer.requestRunTests(category, spec, instance);
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({
+                            success: true,
+                            message: result.message || "Test runner is reloading with category=" + category
+                        })
+                    }]
+                };
+            } catch (err) {
+                return {
+                    content: [{
+                        type: "text",
+                        text: JSON.stringify({ error: err.message })
+                    }]
+                };
+            }
+        }
+    );
+
+    server.tool(
+        "get_test_results",
+        "Get structured test results from the Phoenix test runner. Returns running status, pass/fail counts, " +
+        "failure details, and the currently executing spec. The test runner must already be open with MCP enabled.",
+        {
+            instance: z.string().optional().describe("Target a specific test runner instance by name.
Required when multiple instances are connected.") + }, + async ({ instance }) => { + try { + const result = await wsControlServer.requestTestResults(instance); + // Remove internal WS fields + delete result.type; + delete result.id; + return { + content: [{ + type: "text", + text: JSON.stringify(result, null, 2) + }] + }; + } catch (err) { + return { + content: [{ + type: "text", + text: JSON.stringify({ error: err.message }) + }] + }; + } + } + ); + server.tool( "get_phoenix_status", "Check the status of the Phoenix process and WebSocket connection.", diff --git a/phoenix-builder-mcp/ws-control-server.js b/phoenix-builder-mcp/ws-control-server.js index 452a2ac86..1ed071a63 100644 --- a/phoenix-builder-mcp/ws-control-server.js +++ b/phoenix-builder-mcp/ws-control-server.js @@ -109,6 +109,28 @@ export function createWSControlServer(port) { break; } + case "run_tests_response": { + const pendingRt = pendingRequests.get(msg.id); + if (pendingRt) { + pendingRequests.delete(msg.id); + if (msg.success) { + pendingRt.resolve({ success: true, message: msg.message }); + } else { + pendingRt.reject(new Error(msg.message || "run_tests failed")); + } + } + break; + } + + case "get_test_results_response": { + const pendingTr = pendingRequests.get(msg.id); + if (pendingTr) { + pendingRequests.delete(msg.id); + pendingTr.resolve(msg); + } + break; + } + case "reload_response": { const pending3 = pendingRequests.get(msg.id); if (pending3) { @@ -390,6 +412,80 @@ export function createWSControlServer(port) { }); } + function requestRunTests(category, spec, instanceName) { + return new Promise((resolve, reject) => { + const resolved = _resolveClient(instanceName); + if (resolved.error) { + reject(new Error(resolved.error)); + return; + } + + const { client } = resolved; + if (client.ws.readyState !== 1) { + reject(new Error("Phoenix client \"" + resolved.name + "\" is not connected")); + return; + } + + const id = ++requestIdCounter; + const timeout = setTimeout(() => { + 
pendingRequests.delete(id); + reject(new Error("run_tests request timed out (30s)")); + }, 30000); + + pendingRequests.set(id, { + resolve: (data) => { + clearTimeout(timeout); + resolve(data); + }, + reject: (err) => { + clearTimeout(timeout); + reject(err); + } + }); + + const msg = { type: "run_tests_request", id, category }; + if (spec) { + msg.spec = spec; + } + client.ws.send(JSON.stringify(msg)); + }); + } + + function requestTestResults(instanceName) { + return new Promise((resolve, reject) => { + const resolved = _resolveClient(instanceName); + if (resolved.error) { + reject(new Error(resolved.error)); + return; + } + + const { client } = resolved; + if (client.ws.readyState !== 1) { + reject(new Error("Phoenix client \"" + resolved.name + "\" is not connected")); + return; + } + + const id = ++requestIdCounter; + const timeout = setTimeout(() => { + pendingRequests.delete(id); + reject(new Error("get_test_results request timed out (30s)")); + }, 30000); + + pendingRequests.set(id, { + resolve: (data) => { + clearTimeout(timeout); + resolve(data); + }, + reject: (err) => { + clearTimeout(timeout); + reject(err); + } + }); + + client.ws.send(JSON.stringify({ type: "get_test_results_request", id })); + }); + } + function getBrowserLogs(sinceLast, instanceName) { const resolved = _resolveClient(instanceName); if (resolved.error) { @@ -442,6 +538,8 @@ export function createWSControlServer(port) { requestLogs, requestExecJs, requestExecJsLivePreview, + requestRunTests, + requestTestResults, getBrowserLogs, clearBrowserLogs, isClientConnected, diff --git a/src/phoenix-builder/phoenix-builder-boot.js b/src/phoenix-builder/phoenix-builder-boot.js index c577720ec..94116ec1f 100644 --- a/src/phoenix-builder/phoenix-builder-boot.js +++ b/src/phoenix-builder/phoenix-builder-boot.js @@ -90,7 +90,8 @@ let name = sessionStorage.getItem(INSTANCE_NAME_KEY); if (!name) { const hex = Math.floor(Math.random() * 0x10000).toString(16).padStart(4, "0"); - name = "phoenix-" + 
_getPlatformTag() + "-" + hex; + const prefix = window._phoenixBuilderNamePrefix || "phoenix"; + name = prefix + "-" + _getPlatformTag() + "-" + hex; sessionStorage.setItem(INSTANCE_NAME_KEY, name); } return name; diff --git a/test/SpecRunner.html b/test/SpecRunner.html index 1665f48c2..095bcb4e5 100644 --- a/test/SpecRunner.html +++ b/test/SpecRunner.html @@ -394,6 +394,10 @@ }()); + + + + diff --git a/test/SpecRunner.js b/test/SpecRunner.js index 0c17e0ace..99bb6e82d 100644 --- a/test/SpecRunner.js +++ b/test/SpecRunner.js @@ -484,6 +484,7 @@ define(function (require, exports, module) { // Create the reporter, which is really a model class that just gathers // spec and performance data. reporter = new UnitTestReporter(jasmineEnv, params.get("spec"), selectedCategories); + window._unitTestReporter = reporter; SpecRunnerUtils.setUnitTestReporter(reporter); // Optionally emit JUnit XML file for automated runs diff --git a/test/phoenix-test-runner-mcp.js b/test/phoenix-test-runner-mcp.js new file mode 100644 index 000000000..1284a49f4 --- /dev/null +++ b/test/phoenix-test-runner-mcp.js @@ -0,0 +1,188 @@ +/* + * GNU AGPL-3.0 License + * + * Copyright (c) 2021 - present core.ai . All rights reserved. + * + * This program is free software: you can redistribute it and/or modify it + * under the terms of the GNU Affero General Public License as published by + * the Free Software Foundation, either version 3 of the License, or + * (at your option) any later version. + * + * This program is distributed in the hope that it will be useful, but WITHOUT + * ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or + * FITNESS FOR A PARTICULAR PURPOSE. See the GNU Affero General Public License + * for more details. + * + * You should have received a copy of the GNU Affero General Public License + * along with this program. If not, see https://opensource.org/licenses/AGPL-3.0. + * + */ + +// Test-runner-specific MCP WebSocket handlers. 
+// Loaded as a plain script (non-AMD) in SpecRunner.html after phoenix-builder-boot.js.
+// Registers handlers for screenshot_request, run_tests_request and get_test_results_request.
+
+(function () {
+
+    var builder = window._phoenixBuilder;
+    if (!builder) {
+        // MCP not enabled — nothing to do
+        return;
+    }
+
+    // --- screenshot_request ---
+    // Captures the test runner window with Phoenix.app.screenShotBinary (Tauri, Electron, browser with extension).
+    // typeof guard: a bare !Phoenix would throw a ReferenceError if the Phoenix global was never defined.
+    builder.registerHandler("screenshot_request", function (msg) {
+        if (typeof Phoenix === "undefined" || !Phoenix || !Phoenix.app || !Phoenix.app.screenShotBinary) {
+            builder.sendMessage({
+                type: "error",
+                id: msg.id,
+                message: "Screenshot API not available"
+            });
+            return;
+        }
+
+        Phoenix.app.screenShotBinary(msg.selector || undefined)
+            .then(function (bytes) {
+                var binary = "";
+                var chunkSize = 8192;
+                for (var i = 0; i < bytes.length; i += chunkSize) {
+                    var chunk = bytes.subarray(i, Math.min(i + chunkSize, bytes.length));
+                    binary += String.fromCharCode.apply(null, chunk);
+                }
+                var base64 = btoa(binary);
+                builder.sendMessage({
+                    type: "screenshot_response",
+                    id: msg.id,
+                    data: base64
+                });
+            })
+            .catch(function (err) {
+                builder.sendMessage({
+                    type: "error",
+                    id: msg.id,
+                    message: (err && err.message) || "Screenshot failed"
+                });
+            });
+    });
+
+    // --- run_tests_request ---
+    // Reloads SpecRunner with the requested category and/or spec URL params.
+    builder.registerHandler("run_tests_request", function (msg) {
+        var category = msg.category || "unit";
+        var spec = msg.spec || null;
+
+        // Respond before reloading so the MCP server gets the ack
+        builder.sendMessage({
+            type: "run_tests_response",
+            id: msg.id,
+            success: true,
+            message: "Reloading test runner with category=" + category + (spec ? ", spec=" + spec : "")
+        });
+
+        // Build the new URL and reload. Strip any existing query AND #fragment from the base:
+        // text after '#' is part of the fragment, so a query appended there would not cause a reload.
+        // Encode with encodeURIComponent so spaces become %20 (not +); the SpecRunner
+        // UrlParams parser uses decodeURIComponent, which only decodes %20, not +.
+        var base = window.location.href.split(/[?#]/)[0];
+        var qs = "category=" + encodeURIComponent(category) +
+            "&spec=" + encodeURIComponent(spec || "all");
+
+        setTimeout(function () {
+            window.location.href = base + "?" + qs;
+        }, 100);
+    });
+
+    // --- get_test_results_request ---
+    // Returns structured test results from the live reporter and window.testResults.
+    builder.registerHandler("get_test_results_request", function (msg) {
+        var results = _gatherTestResults();
+        results.type = "get_test_results_response";
+        results.id = msg.id;
+        builder.sendMessage(results);
+    });
+
+    function _gatherTestResults() {
+        var testResults = window.testResults || {};
+        var completed = !!window.playWrightRunComplete;
+
+        // window._unitTestReporter is published by SpecRunner.js when it creates the reporter.
+        // Until it exists every field below keeps its default, so the response shape is stable.
+        // The currently running spec name is read from the DOM (.alert-info), not from the reporter.
+        var reporter = window._unitTestReporter || null;
+
+        var totalSpecCount = 0;
+        var totalPassedCount = 0;
+        var totalFailedCount = 0;
+        var activeSpecCompleteCount = 0;
+        var currentSpec = "";
+        var activeSuite = null;
+        var categories = [];
+        var running = false;
+        var passed = !!testResults.passed;
+        var failures = [];
+
+        if (reporter) {
+            totalSpecCount = reporter.totalSpecCount || 0;
+            totalPassedCount = reporter.totalPassedCount || 0;
+            totalFailedCount = reporter.totalFailedCount || 0;
+            activeSpecCompleteCount = reporter.activeSpecCompleteCount || 0;
+            activeSuite = reporter.activeSuite || null;
+            categories = reporter.selectedCategories || [];
+            passed = !!reporter.passed;
+
+            // If tests started but haven't completed, they're running
+            running = !!activeSuite && !completed;
+
+            // Current spec from the info element
+            var infoEl = document.querySelector(".alert-info");
+            if (infoEl && infoEl.textContent && infoEl.textContent.indexOf("Running ") === 0) {
+                currentSpec = infoEl.textContent.substring("Running ".length);
+            }
+
+            // Gather failures from reporter suites
+            var suiteNames = Object.keys(reporter.suites || {});
+            for (var i = 0; i < suiteNames.length; i++) {
+                var suite = reporter.suites[suiteNames[i]];
+                if (suite && suite.specs) {
+                    for (var j = 0; j < suite.specs.length; j++) {
+                        var spec = suite.specs[j];
+                        if (spec && !spec.passed) {
+                            var msgs = [];
+                            if (spec.messages && spec.messages.length) {
+                                for (var k = 0; k < spec.messages.length; k++) {
+                                    var m = spec.messages[k];
+                                    msgs.push((m && m.message) || String(m));
+                                }
+                            }
+                            failures.push({
+                                suite: suite.name,
+                                spec: spec.name,
+                                messages: msgs
+                            });
+                        }
+                    }
+                }
+            }
+        } else {
+            // No reporter yet — tests haven't loaded
+            running = false;
+        }
+
+        return {
+            running: running,
+            completed: completed,
+            passed: passed,
+            totalSpecCount: totalSpecCount,
+            totalPassedCount: totalPassedCount,
+            totalFailedCount: totalFailedCount,
+            activeSpecCompleteCount: activeSpecCompleteCount,
+            failures: failures,
+            currentSpec: currentSpec,
+            categories: categories,
+            activeSuite: activeSuite
+        };
+    }
+
+}());