Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 75 additions & 17 deletions nodescraper/plugins/inband/amdsmi/amdsmi_collector.py
Original file line number Diff line number Diff line change
Expand Up @@ -67,10 +67,11 @@
StaticXgmiPlpd,
ValueUnit,
)
from nodescraper.plugins.inband.amdsmi.collector_args import AmdSmiCollectorArgs
from nodescraper.utils import get_exception_traceback


class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]):
class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, AmdSmiCollectorArgs]):
"""Class for collection of inband tool amd-smi data."""

AMD_SMI_EXE = "amd-smi"
Expand All @@ -87,6 +88,7 @@ class AmdSmiCollector(InBandDataCollector[AmdSmiDataModel, None]):
CMD_STATIC = "static -g all --json"
CMD_STATIC_GPU = "static -g {gpu_id} --json"
CMD_RAS = "ras --cper --folder={folder}"
CMD_RAS_AFID = "ras --afid --cper-file {cper_file}"

def _check_amdsmi_installed(self) -> bool:
"""Check if amd-smi is installed
Expand Down Expand Up @@ -331,7 +333,7 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
firmware = self.get_firmware()
gpu_list = self.get_gpu_list()
statics = self.get_static()
cper_data = self.get_cper_data()
cper_data, cper_afids = self.get_cper_data()
except Exception as e:
self._log_event(
category=EventCategory.APPLICATION,
Expand All @@ -352,6 +354,7 @@ def _get_amdsmi_data(self) -> Optional[AmdSmiDataModel]:
firmware=firmware,
static=statics,
cper_data=cper_data,
cper_afids=cper_afids,
)
except ValidationError as err:
self.logger.warning("Validation err: %s", err)
Expand Down Expand Up @@ -1173,11 +1176,12 @@ def _parse_clock_dict(self, data: dict) -> Optional[dict[str, Union[StaticClockD

return clock_dict if clock_dict else None

def get_cper_data(self) -> List[FileModel]:
"""Collect CPER data from amd-smi ras command
def get_cper_data(self) -> tuple[List[FileModel], Dict[str, int]]:
"""Collect CPER data from amd-smi ras command and extract AFID for each file

Returns:
list[FileModel]: List of CPER files or empty list if not supported/available
tuple[list[FileModel], dict[str, int]]: Tuple of (list of CPER files, dict mapping filenames to AFIDs)
Returns empty list and dict if not supported/available
"""
try:
AMD_SMI_CPER_FOLDER = "/tmp/amd_smi_cper"
Expand All @@ -1192,14 +1196,14 @@ def get_cper_data(self) -> List[FileModel]:
sudo=True,
)
if cper_cmd_ret.exit_code != 0:
# Command failed, return empty list
return []
# Command failed, return empty list and dict
return [], {}
cper_cmd = cper_cmd_ret.stdout
# search that a CPER is actually created here
regex_cper_search = re.findall(r"(\w+\.cper)", cper_cmd)
if not regex_cper_search:
# Early exit if no CPER files were created
return []
return [], {}
# tar the cper folder
self._run_sut_cmd(
f"tar -czf {AMD_SMI_CPER_FOLDER}.tar.gz -C {AMD_SMI_CPER_FOLDER} .",
Expand All @@ -1213,11 +1217,12 @@ def get_cper_data(self) -> List[FileModel]:
if hasattr(cper_zip, "contents"):
io_bytes = io.BytesIO(cper_zip.contents) # type: ignore[attr-defined]
else:
return []
return [], {}
del cper_zip # Free memory after reading the file
try:
with TarFile.open(fileobj=io_bytes, mode="r:gz") as tar_file:
cper_data = []
cper_afids = {}
for member in tar_file.getmembers():
if member.isfile() and member.name.endswith(".cper"):
file_content = tar_file.extractfile(member)
Expand All @@ -1230,16 +1235,25 @@ def get_cper_data(self) -> List[FileModel]:
cper_data.append(
FileModel(file_contents=file_content_bytes, file_name=member.name)
)

cper_file_path = f"{AMD_SMI_CPER_FOLDER}/{member.name}"
afid = self._get_cper_afid(cper_file_path)
if afid is not None:
cper_afids[member.name] = afid

# Since we do not log the cper data in the data model create an event informing the user if CPER created
if cper_data:
self._log_event(
category=EventCategory.APPLICATION,
description="CPER data has been extracted from amd-smi",
data={
"cper_count": len(cper_data),
"afid_count": len(cper_afids),
},
priority=EventPriority.INFO,
console_log=True,
)
return cper_data, cper_afids
except Exception as e:
self._log_event(
category=EventCategory.APPLICATION,
Expand All @@ -1250,11 +1264,8 @@ def get_cper_data(self) -> List[FileModel]:
priority=EventPriority.ERROR,
console_log=True,
)
return []
return cper_data
return [], {}
except Exception as e:
# If any unexpected error occurs during CPER collection, log it and return empty list
# This ensures CPER collection failures don't break the entire data collection
self._log_event(
category=EventCategory.APPLICATION,
description="Error collecting CPER data",
Expand All @@ -1264,19 +1275,61 @@ def get_cper_data(self) -> List[FileModel]:
priority=EventPriority.WARNING,
console_log=False,
)
return []
return [], {}

def _get_cper_afid(self, cper_file_path: str) -> Optional[int]:
    """Get AFID from a CPER file using amd-smi ras --afid --cper-file command

    The path is shell-quoted before it is substituted into ``CMD_RAS_AFID`` so that
    a path containing spaces or shell metacharacters reaches amd-smi as a single
    argument. Paths made only of ordinary path characters are left unchanged by
    the quoting, so the resulting command is identical for them.

    Args:
        cper_file_path (str): Path to the CPER file

    Returns:
        Optional[int]: AFID value, or None if ``_run_amd_smi`` returned None or
        its output could not be parsed as an integer
    """
    # Local import keeps this method self-contained without touching file-level imports.
    import shlex

    # NOTE(review): assumes _run_amd_smi hands the command string to a shell on the
    # target system (the helper is not visible here) - confirm before relying on
    # the quoting for paths with special characters.
    cmd = self.CMD_RAS_AFID.format(cper_file=shlex.quote(cper_file_path))
    result = self._run_amd_smi(cmd)

    if result is None:
        self._log_event(
            category=EventCategory.APPLICATION,
            description=f"Failed to get AFID from CPER file: {cper_file_path}",
            priority=EventPriority.ERROR,
            console_log=True,
        )
        return None

    try:
        # The command output is expected to be a bare integer, possibly padded
        # with whitespace; anything else is reported as a parse failure below.
        afid = int(result.strip())
    except ValueError:
        self._log_event(
            category=EventCategory.APPLICATION,
            description=f"Failed to parse AFID value from output: {result}",
            data={"output": result, "cper_file": cper_file_path},
            priority=EventPriority.ERROR,
            console_log=True,
        )
        return None

    self._log_event(
        category=EventCategory.APPLICATION,
        description=f"Successfully retrieved AFID from CPER file: {cper_file_path}",
        data={"afid": afid, "cper_file": cper_file_path},
        priority=EventPriority.INFO,
        console_log=True,
    )
    return afid

def collect_data(
self,
args: Any = None,
args: Optional[AmdSmiCollectorArgs] = None,
) -> tuple[TaskResult, Optional[AmdSmiDataModel]]:
"""Collect AmdSmi data from system

Args:
args (Any, optional): optional arguments for data collection. Defaults to None.
args: Optional collector arguments. If cper_file_path is provided,
AFID will be extracted and stored in cper_afids dict.

Returns:
tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and collected data model
tuple[TaskResult, Optional[AmdSmiDataModel]]: task result and data model
"""

if not self._check_amdsmi_installed():
Expand All @@ -1300,6 +1353,11 @@ def collect_data(
if amd_smi_data is None:
return self.result, None

if args and args.cper_file_path:
afid = self._get_cper_afid(args.cper_file_path)
if afid is not None:
amd_smi_data.cper_afids[args.cper_file_path] = afid

return self.result, amd_smi_data
except Exception as e:
self._log_event(
Expand Down
5 changes: 4 additions & 1 deletion nodescraper/plugins/inband/amdsmi/amdsmi_plugin.py
Original file line number Diff line number Diff line change
Expand Up @@ -29,15 +29,18 @@
from .amdsmi_collector import AmdSmiCollector
from .amdsmidata import AmdSmiDataModel
from .analyzer_args import AmdSmiAnalyzerArgs
from .collector_args import AmdSmiCollectorArgs


class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, None, AmdSmiAnalyzerArgs]):
class AmdSmiPlugin(InBandDataPlugin[AmdSmiDataModel, AmdSmiCollectorArgs, AmdSmiAnalyzerArgs]):
"""Plugin for collection and analysis of amdsmi data"""

DATA_MODEL = AmdSmiDataModel

COLLECTOR = AmdSmiCollector

COLLECTOR_ARGS = AmdSmiCollectorArgs

ANALYZER = AmdSmiAnalyzer

ANALYZER_ARGS = AmdSmiAnalyzerArgs
1 change: 1 addition & 0 deletions nodescraper/plugins/inband/amdsmi/amdsmidata.py
Original file line number Diff line number Diff line change
Expand Up @@ -954,6 +954,7 @@ class AmdSmiDataModel(DataModel):
xgmi_metric: Optional[list[XgmiMetrics]] = Field(default_factory=list)
xgmi_link: Optional[list[XgmiLinks]] = Field(default_factory=list)
cper_data: Optional[list[FileModel]] = Field(default_factory=list)
cper_afids: dict[str, int] = Field(default_factory=dict)
amdsmitst_data: AmdSmiTstData = Field(default_factory=AmdSmiTstData)

def get_list(self, gpu: int) -> Optional[AmdSmiListItem]:
Expand Down
34 changes: 34 additions & 0 deletions nodescraper/plugins/inband/amdsmi/collector_args.py
Original file line number Diff line number Diff line change
@@ -0,0 +1,34 @@
###############################################################################
#
# MIT License
#
# Copyright (c) 2026 Advanced Micro Devices, Inc.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in all
# copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
# SOFTWARE.
#
###############################################################################
from typing import Optional

from nodescraper.models import CollectorArgs


class AmdSmiCollectorArgs(CollectorArgs):
    """Optional arguments accepted by the amd-smi collector.

    Attributes:
        cper_file_path: Path to a CPER file, or None (the default) when no
            file is supplied. When set, the collector looks up the AFID for
            this file and records it in the data model's ``cper_afids``
            mapping, keyed by this path.
    """

    cper_file_path: Optional[str] = None
Loading