diff --git a/nodescraper/cli/cli.py b/nodescraper/cli/cli.py index 94a1f0f..04f0ef3 100644 --- a/nodescraper/cli/cli.py +++ b/nodescraper/cli/cli.py @@ -338,8 +338,10 @@ def process_args( else: cur_plugin = None for arg in plugin_args: - # Handle comma-separated plugin names (but not arguments) - if not arg.startswith("-") and "," in arg: + # Handle comma-separated plugin names (but not option values like JSON) + arg_stripped = arg.strip() + looks_like_value = arg_stripped.startswith("{") or arg_stripped.startswith("[") + if not arg.startswith("-") and "," in arg and not looks_like_value: # Split comma-separated plugin names for potential_plugin in arg.split(","): potential_plugin = potential_plugin.strip() diff --git a/nodescraper/plugins/inband/rocm/analyzer_args.py b/nodescraper/plugins/inband/rocm/analyzer_args.py index ff0751e..f10821e 100644 --- a/nodescraper/plugins/inband/rocm/analyzer_args.py +++ b/nodescraper/plugins/inband/rocm/analyzer_args.py @@ -34,6 +34,8 @@ class RocmAnalyzerArgs(AnalyzerArgs): exp_rocm: Union[str, list] = Field(default_factory=list) exp_rocm_latest: str = Field(default="") + # Key = sub-version name (e.g. version_rocm); value = expected string or list of allowed strings + exp_rocm_sub_versions: dict[str, Union[str, list]] = Field(default_factory=dict) @field_validator("exp_rocm", mode="before") @classmethod diff --git a/nodescraper/plugins/inband/rocm/rocm_analyzer.py b/nodescraper/plugins/inband/rocm/rocm_analyzer.py index ea17969..98ba6e3 100644 --- a/nodescraper/plugins/inband/rocm/rocm_analyzer.py +++ b/nodescraper/plugins/inband/rocm/rocm_analyzer.py @@ -23,7 +23,7 @@ # SOFTWARE. # ############################################################################### -from typing import Optional +from typing import List, Optional, Tuple, Union from nodescraper.enums import EventCategory, EventPriority, ExecutionStatus from nodescraper.interfaces import DataAnalyzer @@ -34,10 +34,62 @@ class RocmAnalyzer(DataAnalyzer[RocmDataModel, RocmAnalyzerArgs]): - """Check ROCm matches expected versions""" + """Check ROCm matches expected versions. + + The expected ROCm version (exp_rocm) can be a string or a list of allowed strings. + Sub-versions (exp_rocm_sub_versions) are a dict: each value can be a string or + a list of allowed strings for that key. + """ DATA_MODEL = RocmDataModel + def check_exp_rocm_sub_versions( + self, + data: RocmDataModel, + exp_rocm_sub_versions: dict[str, Union[str, list]], + ) -> bool: + """Check if ROCm sub-versions match expected versions when provided. + + Each expected value can be a single string (exact match) or a list of + allowed strings (actual must be in the list). + + Returns: + bool: True if all sub-versions match, False otherwise + """ + events: dict[str, Tuple[Union[str, List[str]], str]] = {} # {Version: (expected, actual)} + for exp_version_key, exp_rocm in exp_rocm_sub_versions.items(): + actual_version = data.rocm_sub_versions.get(exp_version_key) + if actual_version is None: + events[exp_version_key] = ( + exp_rocm, + "Not Available", + ) + continue + + if isinstance(exp_rocm, list) and actual_version not in exp_rocm: + events[exp_version_key] = ( + exp_rocm, + actual_version, + ) + elif actual_version != exp_rocm: + events[exp_version_key] = ( + exp_rocm, + actual_version, + ) + if events: + self._log_event( + category=EventCategory.SW_DRIVER, + description="ROCm sub-version mismatch!", + data={ + "expected": {k: r[0] for k, r in events.items()}, + "actual": {k: r[1] for k, r in events.items()}, + }, + priority=EventPriority.ERROR, + console_log=True, + ) + return False + return True + def analyze_data( self, data: RocmDataModel, args: Optional[RocmAnalyzerArgs] = None ) -> TaskResult: @@ -51,48 +103,65 @@ def analyze_data( Returns: TaskResult: Result of the analysis containing status and message. """ - # skip check if data not provided in config - if not args or not args.exp_rocm: + if args is None: + args = RocmAnalyzerArgs() + + exp_rocm = args.exp_rocm + exp_rocm_sub_versions = args.exp_rocm_sub_versions + exp_rocm_latest = args.exp_rocm_latest + + if not exp_rocm and not exp_rocm_sub_versions and not exp_rocm_latest: self.result.message = "Expected ROCm not provided" self.result.status = ExecutionStatus.NOT_RAN return self.result - for rocm_version in args.exp_rocm: - if data.rocm_version == rocm_version: - self.result.message = "ROCm version matches expected" - self.result.status = ExecutionStatus.OK - break - else: - # No matching version found - self.result.message = "ROCm version mismatch!" - self.result.status = ExecutionStatus.ERROR + errors: List[str] = [] + + if exp_rocm and data.rocm_version not in exp_rocm: self._log_event( category=EventCategory.SW_DRIVER, - description=f"ROCm version mismatch! Expected: {args.exp_rocm}, actual: {data.rocm_version}", - data={"expected": args.exp_rocm, "actual": data.rocm_version}, + description=f"ROCm version mismatch! Expected: {exp_rocm}, actual: {data.rocm_version}", + data={"expected": exp_rocm, "actual": data.rocm_version}, priority=EventPriority.CRITICAL, console_log=True, ) - return self.result + errors.append("ROCm version mismatch!") + + if exp_rocm_sub_versions and not self.check_exp_rocm_sub_versions( + data=data, + exp_rocm_sub_versions=exp_rocm_sub_versions, + ): + errors.append("ROCm sub-version mismatch") - # validate rocm_latest if provided in args - if args.exp_rocm_latest: - if data.rocm_latest_versioned_path != args.exp_rocm_latest: - self.result.message = "ROCm latest path mismatch!" - self.result.status = ExecutionStatus.ERROR - self._log_event( - category=EventCategory.SW_DRIVER, - description=f"ROCm latest path mismatch! Expected: {args.exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}", - data={ - "expected": args.exp_rocm_latest, - "actual": data.rocm_latest_versioned_path, - }, - priority=EventPriority.CRITICAL, - console_log=True, + if exp_rocm_latest and data.rocm_latest_versioned_path != exp_rocm_latest: + self._log_event( + category=EventCategory.SW_DRIVER, + description=f"ROCm latest path mismatch! Expected: {exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}", + data={ + "expected": exp_rocm_latest, + "actual": data.rocm_latest_versioned_path, + }, + priority=EventPriority.CRITICAL, + console_log=True, + ) + errors.append( + f"ROCm latest path mismatch! Expected: {exp_rocm_latest}, actual: {data.rocm_latest_versioned_path}" + ) + + if errors: + self.result.status = ExecutionStatus.ERROR + self.result.message = ". ".join(errors) + else: + self.result.status = ExecutionStatus.OK + success_parts = [] + if exp_rocm: + success_parts.append("ROCm version matches expected") + if exp_rocm_sub_versions: + success_parts.append("ROCm sub-versions match expected") + if exp_rocm_latest: + success_parts.append( + f"ROCm latest path validated: {data.rocm_latest_versioned_path}" ) - return self.result - else: - # Update message to include rocm_latest validation result - self.result.message = f"ROCm version matches expected. ROCm latest path validated: {data.rocm_latest_versioned_path}" + self.result.message = ". ".join(success_parts) return self.result diff --git a/nodescraper/plugins/inband/rocm/rocm_collector.py b/nodescraper/plugins/inband/rocm/rocm_collector.py index f7692e4..31ea149 100644 --- a/nodescraper/plugins/inband/rocm/rocm_collector.py +++ b/nodescraper/plugins/inband/rocm/rocm_collector.py @@ -44,6 +44,7 @@ class RocmCollector(InBandDataCollector[RocmDataModel, None]): "/opt/rocm/.info/version-rocm", "/opt/rocm/.info/version", ] + CMD_ROCM_SUB_VERSIONS = "grep . -r /opt/rocm/.info/*" CMD_ROCMINFO = "{rocm_path}/bin/rocminfo" CMD_ROCM_LATEST = "ls -v -d /opt/rocm-[3-7]* | tail -1" CMD_ROCM_DIRS = "ls -v -d /opt/rocm*" @@ -60,11 +61,27 @@ def collect_data(self, args=None) -> tuple[TaskResult, Optional[RocmDataModel]]: tuple[TaskResult, Optional[RocmDataModel]]: tuple containing the task result and ROCm data model if available. """ rocm_data = None + rocm_sub_versions = {} + + # First, try to collect all sub-versions + sub_versions_res = self._run_sut_cmd(self.CMD_ROCM_SUB_VERSIONS) + if sub_versions_res.exit_code == 0: + for line in sub_versions_res.stdout.splitlines(): + if ":" in line: + # Split the line into key and value (format: /path/to/file:version) + key, value = line.split(":", 1) + # Extract just the filename from the path + key = key.split("/")[-1] + rocm_sub_versions[key.strip()] = value.strip() + + # Determine the main ROCm version for path in self.CMD_VERSION_PATHS: res = self._run_sut_cmd(f"grep . {path}") if res.exit_code == 0: try: - rocm_data = RocmDataModel(rocm_version=res.stdout) + rocm_data = RocmDataModel( + rocm_version=res.stdout, rocm_sub_versions=rocm_sub_versions + ) self._log_event( category="ROCM_VERSION_READ", description="ROCm version data collected", diff --git a/nodescraper/plugins/inband/rocm/rocmdata.py b/nodescraper/plugins/inband/rocm/rocmdata.py index f0fb261..c7e7560 100644 --- a/nodescraper/plugins/inband/rocm/rocmdata.py +++ b/nodescraper/plugins/inband/rocm/rocmdata.py @@ -33,6 +33,7 @@ class RocmDataModel(DataModel): rocm_version: str + rocm_sub_versions: dict[str, str] = {} rocminfo: List[str] = [] rocm_latest_versioned_path: str = "" rocm_all_paths: List[str] = [] diff --git a/test/functional/fixtures/rocm_plugin_config.json b/test/functional/fixtures/rocm_plugin_config.json index 95665a6..f26ed95 100644 --- a/test/functional/fixtures/rocm_plugin_config.json +++ b/test/functional/fixtures/rocm_plugin_config.json @@ -3,7 +3,10 @@ "plugins": { "RocmPlugin": { "analysis_args": { - "exp_rocm": "7.0.0-38" + "exp_rocm": "7.0.0-38", + "exp_rocm_sub_versions": { + "version_rocm": "7.0.0-38" + } } } }, diff --git a/test/unit/plugin/test_rocm_analyzer.py b/test/unit/plugin/test_rocm_analyzer.py index 9ecc7fb..b831d47 100644 --- a/test/unit/plugin/test_rocm_analyzer.py +++ b/test/unit/plugin/test_rocm_analyzer.py @@ -45,6 +45,20 @@ def model_obj(): return RocmDataModel(rocm_version="6.2.0-66", rocm_latest_versioned_path="/opt/rocm-7.1.0") +@pytest.fixture +def model_obj_with_sub_versions(): + """Model with rocm_sub_versions populated (for sub-version tests).""" + return RocmDataModel( + rocm_version="6.2.0-66", + rocm_latest_versioned_path="/opt/rocm-7.1.0", + rocm_sub_versions={ + "version": "6.2.0-66", + "version-rocm": "6.2.0-66", + "version-hip-sdk": "6.2.0-66", + }, + ) + + @pytest.fixture def config(): return { @@ -109,3 +123,59 @@ def test_rocm_latest_path_mismatch(analyzer, model_obj): for event in result.events: assert event.priority == EventPriority.CRITICAL assert event.category == EventCategory.SW_DRIVER.value + + +def test_sub_versions_pass(analyzer, model_obj_with_sub_versions, config): + """Test that exp_rocm + exp_rocm_sub_versions matching yields OK.""" + args = RocmAnalyzerArgs( + exp_rocm=config["rocm_version"], + exp_rocm_latest=config["rocm_latest"], + exp_rocm_sub_versions={"version-rocm": "6.2.0-66"}, + ) + result = analyzer.analyze_data(model_obj_with_sub_versions, args) + assert result.status == ExecutionStatus.OK + assert "ROCm version matches expected" in result.message + assert "ROCm sub-versions match expected" in result.message + assert all( + event.priority not in {EventPriority.WARNING, EventPriority.ERROR, EventPriority.CRITICAL} + for event in result.events + ) + + +def test_sub_versions_mismatch(analyzer, model_obj_with_sub_versions): + """Test that exp_rocm_sub_versions value mismatch is detected.""" + # Main version matches; sub-version expected 6.2.0-65, actual 6.2.0-66 + model = copy.deepcopy(model_obj_with_sub_versions) + model.rocm_version = "6.2.0" + args = RocmAnalyzerArgs( + exp_rocm="6.2.0", + exp_rocm_sub_versions={"version-rocm": "6.2.0-65"}, + ) + result = analyzer.analyze_data(model, args) + assert result.status == ExecutionStatus.ERROR + assert "ROCm sub-version mismatch" in result.message + assert len(result.events) == 1 + for event in result.events: + assert event.priority == EventPriority.ERROR + assert event.category == EventCategory.SW_DRIVER.value + assert event.description == "ROCm sub-version mismatch!" + assert event.data["actual"] == {"version-rocm": "6.2.0-66"} + assert event.data["expected"] == {"version-rocm": "6.2.0-65"} + + +def test_sub_versions_not_bad_key(analyzer, model_obj_with_sub_versions, config): + """Test that a requested sub-version key not present in data logs 'Not Available'.""" + args = RocmAnalyzerArgs( + exp_rocm=config["rocm_version"], + exp_rocm_sub_versions={"versiodsalodn-rocm": "6.2.0-66"}, + ) + result = analyzer.analyze_data(model_obj_with_sub_versions, args) + assert result.status == ExecutionStatus.ERROR + assert "ROCm sub-version mismatch" in result.message + assert len(result.events) == 1 + for event in result.events: + assert event.priority == EventPriority.ERROR + assert event.category == EventCategory.SW_DRIVER.value + assert event.description == "ROCm sub-version mismatch!" + assert event.data["actual"] == {"versiodsalodn-rocm": "Not Available"} + assert event.data["expected"] == {"versiodsalodn-rocm": "6.2.0-66"} diff --git a/test/unit/plugin/test_rocm_collector.py b/test/unit/plugin/test_rocm_collector.py index 2b419ad..ea08c0a 100644 --- a/test/unit/plugin/test_rocm_collector.py +++ b/test/unit/plugin/test_rocm_collector.py @@ -286,3 +286,65 @@ def test_invalid_rocm_version_format(collector): assert result.status == ExecutionStatus.ERROR assert data is None assert len(result.events) >= 1 + + +def test_collect_rocm_sub_versions(collector): + """Test collection of ROCm version and multiple sub-versions (mirrors error-scraper test_run_new_version).""" + sub_versions_stdout = ( + "/opt/rocm/.info/version:6.4.0-47\n" + "/opt/rocm/.info/version-hip-libraries:6.4.0-47\n" + "/opt/rocm/.info/version-hiprt:6.4.0-47\n" + "/opt/rocm/.info/version-hiprt-devel:6.4.0-47\n" + "/opt/rocm/.info/version-hip-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-lrt:6.4.0-47\n" + "/opt/rocm/.info/version-ml-libraries:6.4.0-47\n" + "/opt/rocm/.info/version-ml-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-oclrt:6.4.0-47\n" + "/opt/rocm/.info/version-ocl-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-openmp-sdk:6.4.0-47\n" + "/opt/rocm/.info/version-rocm:6.4.0-47\n" + "/opt/rocm/.info/version-rocm-developer-tools:6.4.0-47\n" + "/opt/rocm/.info/version-utils:6.4.0-47\n" + ) + expected_sub_versions = { + "version": "6.4.0-47", + "version-hip-libraries": "6.4.0-47", + "version-hiprt": "6.4.0-47", + "version-hiprt-devel": "6.4.0-47", + "version-hip-sdk": "6.4.0-47", + "version-lrt": "6.4.0-47", + "version-ml-libraries": "6.4.0-47", + "version-ml-sdk": "6.4.0-47", + "version-oclrt": "6.4.0-47", + "version-ocl-sdk": "6.4.0-47", + "version-openmp-sdk": "6.4.0-47", + "version-rocm": "6.4.0-47", + "version-rocm-developer-tools": "6.4.0-47", + "version-utils": "6.4.0-47", + } + collector._run_sut_cmd = MagicMock( + side_effect=[ + # First: grep . -r /opt/rocm/.info/* (sub-versions) + MagicMock(exit_code=0, stdout=sub_versions_stdout), + # Second: grep . /opt/rocm/.info/version-rocm (main version) + MagicMock(exit_code=0, stdout="6.4.0-47"), + # Optional data (all fail for minimal test) + MagicMock(exit_code=1, stdout=""), # latest path + MagicMock(exit_code=1, stdout=""), # all paths + MagicMock(exit_code=1, stdout=""), # rocminfo + MagicMock(exit_code=1, stdout=""), # ld.so.conf + MagicMock(exit_code=1, stdout=""), # rocm_libs + MagicMock(exit_code=1, stdout=""), # env_vars + MagicMock(exit_code=1, stdout=""), # clinfo + MagicMock(exit_code=1, stdout=""), # kfd_proc + ] + ) + + result, data = collector.collect_data() + + assert result.status == ExecutionStatus.OK + assert data is not None + assert data.rocm_version == "6.4.0-47" + assert data.rocm_sub_versions == expected_sub_versions + assert any(event.category == "ROCM_VERSION_READ" for event in result.events) + assert "ROCm version: 6.4.0-47" in result.message