diff --git a/sos/report/mellanox_firmware_suite/collectors/collector_manager.py b/sos/report/mellanox_firmware_suite/collectors/collector_manager.py
index 2d5dda5b9c..88279a1b2b 100644
--- a/sos/report/mellanox_firmware_suite/collectors/collector_manager.py
+++ b/sos/report/mellanox_firmware_suite/collectors/collector_manager.py
@@ -1,6 +1,7 @@
 from .system_collector import SystemCollector
 from .firmware_collector import FirmwareCollector
 from .cable_collector import CableCollector
+from .pcc_collector import PccCollector
 
 
 class CollectorManager(object):
@@ -12,6 +13,7 @@ def collect_all(self):
         self.collect_system_info()
         self.collect_firmware_info()
         self.collect_cable_info()
+        self.collect_pcc_info()
 
     def collect_system_info(self):
         for ctx in self.device_contexts:
@@ -26,3 +28,9 @@ def collect_firmware_info(self):
     def collect_cable_info(self):
         for ctx in self.device_contexts:
             CableCollector().run(self.plugin, ctx)
+
+    def collect_pcc_info(self):
+        if not self.plugin.get_option("pcc", default=False):
+            return
+        for ctx in self.device_contexts:
+            PccCollector().run(self.plugin, ctx)
diff --git a/sos/report/mellanox_firmware_suite/collectors/pcc_collector.py b/sos/report/mellanox_firmware_suite/collectors/pcc_collector.py
new file mode 100644
index 0000000000..08005d981a
--- /dev/null
+++ b/sos/report/mellanox_firmware_suite/collectors/pcc_collector.py
@@ -0,0 +1,464 @@
+"""Collect PCC data by querying the PPCC register with mlxreg/mstreg.
+
+The algo-info array is read first to find populated algorithm slots;
+info, status, counters and parameters are then queried per slot.  Each
+query's output file is requested under the ``pcc_info`` subdirectory.
+"""
+
+import re
+from enum import Enum
+from typing import Dict, List, Optional, Tuple
+
+from .base_collector import Collector
+from ..tools import (
+    MftTools,
+    MstFlintTools,
+    get_tool,
+)
+
+# mlxreg/mstreg --op uses string cmd_type values from the PPCC register.
+PpccCommandOptions = Dict[str, str]
+
+
+class PpccCommand(str, Enum):
+    """``cmd_type`` values passed via ``--op`` on PPCC ``--get`` queries."""
+
+    GET_ALGO_INFO = "0x0"
+    GET_ALGO_STATUS = "0x3"
+    GET_NUM_PARAMS = "0x4"
+    GET_PARAM_INFO = "0x5"
+    GET_PARAM = "0x6"
+    BULK_GET_PARAMS = "0xA"
+    BULK_GET_COUNTERS = "0xC"
+    GET_NUM_COUNTERS = "0xE"
+    GET_COUNTER_INFO = "0xF"
+    ALGO_INFO_ARRAY = "0x10"
+
+
+class PccCollector(Collector):
+    """Query the PPCC register and save one output file per query."""
+
+    _BASE_REGISTER_INDEXES = "local_port=1,pnat=0,lp_msb=0"
+    _ALGO_SLOT_TEXT_INDEX_COUNT = 16
+    # Slots always queried, and only with GET_ALGO_STATUS.
+    _ALGO_SLOTS_COLLECT_STATUS_CMD_ONLY = frozenset({15})
+    _COMMAND_OUTPUT_LOG_MAX_CHARS = 4000
+    # Upper bound on counts parsed from tool output, so a bogus value
+    # cannot trigger an unbounded number of commands.
+    # NOTE(review): 1024 is assumed to be well above any real count;
+    # confirm against the width of the algo_param_index field.
+    _MAX_INDEXED_QUERIES_PER_SLOT = 1024
+
+    _TEXT_TABLE_LINE_PATTERN = re.compile(
+        r"^\s*text\[(\d+)\]\s*\|\s*0x([0-9a-fA-F]+)",
+        re.MULTILINE | re.IGNORECASE,
+    )
+    _VALUE_FIELD_PATTERN = re.compile(
+        r"^\s*value\s*\|\s*0x([0-9a-fA-F]+)",
+        re.MULTILINE | re.IGNORECASE,
+    )
+    _COUNTER_EN_FIELD_PATTERN = re.compile(
+        r"^\s*counter_en\s*\|\s*0x([0-9a-fA-F]+)",
+        re.MULTILINE | re.IGNORECASE,
+    )
+
+    @staticmethod
+    def _op_for_cmd_type(command: PpccCommand) -> PpccCommandOptions:
+        """Return the ``--op`` mapping that selects ``command``."""
+        return {"cmd_type": command.value}
+
+    @staticmethod
+    def _register_indexes_for_algo_slot(algo_slot_index: int) -> str:
+        """Return the base ``--indexes`` string plus ``algo_slot``."""
+        return (
+            f"{PccCollector._BASE_REGISTER_INDEXES},"
+            f"algo_slot={algo_slot_index}"
+        )
+
+    @staticmethod
+    def _make_filename_for_ppcc_get(
+        collection_file_prefix: str,
+        command_options: PpccCommandOptions,
+        register_indexes: str,
+    ) -> str:
+        """Build an output filename that encodes the op and indexes.
+
+        ``=`` and ``,`` in the index string, and the key/value split of
+        each op entry, are rendered as ``_``.
+        """
+        op_part = "_".join(
+            f"{key}_{value}" for key, value in command_options.items()
+        )
+        index_part = register_indexes.replace("=", "_").replace(",", "_")
+        return (
+            f"{collection_file_prefix}--reg_name_PPCC_--get_"
+            f"--op_{op_part}_--indexes_{index_part}"
+        )
+
+    @classmethod
+    def _get_algo_slot_indices(cls, mlxreg_output: str) -> List[int]:
+        """Return slot indexes whose ``text[N]`` value is non-zero.
+
+        Indexes at or above ``_ALGO_SLOT_TEXT_INDEX_COUNT`` are ignored;
+        when an index appears more than once the last value is used.
+        """
+        slot_count = cls._ALGO_SLOT_TEXT_INDEX_COUNT
+        values_per_slot = [0] * slot_count
+        for match in cls._TEXT_TABLE_LINE_PATTERN.finditer(mlxreg_output):
+            text_index = int(match.group(1))
+            if text_index >= slot_count:
+                continue
+            values_per_slot[text_index] = int(match.group(2), 16)
+        return [
+            text_index
+            for text_index, value in enumerate(values_per_slot)
+            if value != 0
+        ]
+
+    @classmethod
+    def _extract_value_field(cls, mlxreg_output: str) -> Optional[int]:
+        """Return the first ``value`` field as an int; None if missing."""
+        match = cls._VALUE_FIELD_PATTERN.search(mlxreg_output)
+        if not match:
+            return None
+        return int(match.group(1), 16)
+
+    @classmethod
+    def _counter_en_enabled(cls, mlxreg_output: str) -> Optional[bool]:
+        """LSB of counter_en from algo status dump; None if field missing."""
+        match = cls._COUNTER_EN_FIELD_PATTERN.search(mlxreg_output)
+        if not match:
+            return None
+        return bool(int(match.group(1), 16) & 1)
+
+    @classmethod
+    def _clip_command_output(cls, text: str) -> str:
+        """Return ``text`` stripped and shortened for a log message.
+
+        Blank output becomes ``(empty)``.  Output longer than
+        ``_COMMAND_OUTPUT_LOG_MAX_CHARS`` is cut and marked, so a reader
+        can tell the logged text is incomplete.
+        """
+        raw = (text or "").strip()
+        if not raw:
+            return "(empty)"
+        limit = cls._COMMAND_OUTPUT_LOG_MAX_CHARS
+        if len(raw) <= limit:
+            return raw
+        return f"{raw[:limit]}... [truncated {len(raw) - limit} chars]"
+
+    def _ppcc_get(
+        self,
+        plugin,
+        device_label: str,
+        tool,
+        collection_file_prefix: str,
+        output_subdir: str,
+        command_options: PpccCommandOptions,
+        register_indexes: str,
+    ) -> Tuple[int, str]:
+        """Run one PPCC ``--get`` through ``tool`` and log a failure.
+
+        Returns the ``(return_code, output)`` pair from
+        ``tool.ppcc_get`` unchanged.
+        """
+        return_code, output = tool.ppcc_get(
+            command_options,
+            register_indexes,
+            filename=self._make_filename_for_ppcc_get(
+                collection_file_prefix,
+                command_options,
+                register_indexes,
+            ),
+            subdir=output_subdir,
+        )
+        if return_code != 0:
+            op = command_options.get("cmd_type", "?")
+            plugin._log_info(
+                "PPCC command failed "
+                f"device={device_label} cmd_type={op} "
+                f"indexes={register_indexes!r} rc={return_code} "
+                f"output:\n{self._clip_command_output(output)}"
+            )
+        return return_code, output
+
+    def _ppcc_get_per_index(
+        self,
+        plugin,
+        device_label: str,
+        tool,
+        collection_file_prefix: str,
+        output_subdir: str,
+        command: PpccCommand,
+        register_indexes: str,
+        count: int,
+        what: str,
+    ) -> None:
+        """Run ``command`` for ``algo_param_index`` 0 .. ``count`` - 1.
+
+        ``count`` is clamped to ``_MAX_INDEXED_QUERIES_PER_SLOT``; the
+        clamp is logged, with ``what`` naming the kind of entry.
+        """
+        limit = self._MAX_INDEXED_QUERIES_PER_SLOT
+        if count > limit:
+            plugin._log_info(
+                f"PPCC {what} count {count} exceeds {limit} "
+                f"device={device_label} indexes={register_indexes!r}; "
+                f"querying the first {limit} only"
+            )
+            count = limit
+
+        command_options = self._op_for_cmd_type(command)
+        for index in range(count):
+            self._ppcc_get(
+                plugin,
+                device_label,
+                tool,
+                collection_file_prefix,
+                output_subdir,
+                command_options,
+                f"{register_indexes},algo_param_index={index}",
+            )
+
+    def _collect_counters_for_algo_slot(
+        self,
+        plugin,
+        tool,
+        collection_file_prefix: str,
+        output_subdir: str,
+        ctx,
+        register_indexes: str,
+    ) -> None:
+        """Collect per-counter info, then the bulk counters, for a slot."""
+        device_label = ctx.pci
+
+        return_code, output = self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.GET_NUM_COUNTERS),
+            register_indexes,
+        )
+        if return_code != 0:
+            return
+
+        counter_count = self._extract_value_field(output)
+        if not counter_count:
+            # Count field missing, or the slot reports no counters.
+            return
+
+        self._ppcc_get_per_index(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            PpccCommand.GET_COUNTER_INFO,
+            register_indexes,
+            counter_count,
+            "counter",
+        )
+        self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.BULK_GET_COUNTERS),
+            register_indexes,
+        )
+
+    def _collect_params_for_algo_slot(
+        self,
+        plugin,
+        tool,
+        collection_file_prefix: str,
+        output_subdir: str,
+        ctx,
+        register_indexes: str,
+    ) -> None:
+        """Collect per-parameter info and parameter values for a slot.
+
+        Values are read with one bulk query; only if that fails is each
+        parameter read individually.
+        """
+        device_label = ctx.pci
+
+        return_code, output = self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.GET_NUM_PARAMS),
+            register_indexes,
+        )
+        if return_code != 0:
+            return
+
+        param_count = self._extract_value_field(output)
+        if not param_count:
+            # Count field missing, or the slot reports no parameters.
+            return
+
+        self._ppcc_get_per_index(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            PpccCommand.GET_PARAM_INFO,
+            register_indexes,
+            param_count,
+            "parameter",
+        )
+
+        return_code, _ = self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.BULK_GET_PARAMS),
+            register_indexes,
+        )
+        if return_code == 0:
+            return
+
+        self._ppcc_get_per_index(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            PpccCommand.GET_PARAM,
+            register_indexes,
+            param_count,
+            "parameter",
+        )
+
+    def _collect_single_algo_slot(
+        self,
+        plugin,
+        tool,
+        collection_file_prefix: str,
+        output_subdir: str,
+        ctx,
+        algo_slot_index: int,
+    ) -> None:
+        """Collect everything available for one algorithm slot.
+
+        Status-only slots get just GET_ALGO_STATUS.  Other slots get
+        info and status; further collection stops if the status query
+        fails or reports a ``value`` other than 1.
+        """
+        register_indexes = self._register_indexes_for_algo_slot(
+            algo_slot_index
+        )
+        device_label = ctx.pci
+
+        if algo_slot_index in self._ALGO_SLOTS_COLLECT_STATUS_CMD_ONLY:
+            self._ppcc_get(
+                plugin,
+                device_label,
+                tool,
+                collection_file_prefix,
+                output_subdir,
+                self._op_for_cmd_type(PpccCommand.GET_ALGO_STATUS),
+                register_indexes,
+            )
+            return
+
+        self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.GET_ALGO_INFO),
+            register_indexes,
+        )
+
+        return_code, output = self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.GET_ALGO_STATUS),
+            register_indexes,
+        )
+        if return_code != 0:
+            return
+
+        # A missing ``value`` field does not stop collection.
+        algo_status = self._extract_value_field(output)
+        if algo_status is not None and algo_status != 1:
+            return
+
+        # Counters are queried only when counter_en's low bit is set.
+        if self._counter_en_enabled(output):
+            self._collect_counters_for_algo_slot(
+                plugin,
+                tool,
+                collection_file_prefix,
+                output_subdir,
+                ctx,
+                register_indexes,
+            )
+        self._collect_params_for_algo_slot(
+            plugin,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            ctx,
+            register_indexes,
+        )
+
+    def _collect_ppcc_data(self, plugin, tool, tool_name: str, ctx) -> None:
+        """Collect PPCC data for one device using ``tool``.
+
+        ``tool_name`` only prefixes the output filenames.
+        """
+        collection_file_prefix = f"{tool_name}_{ctx.bdf}_"
+        output_subdir = "pcc_info"
+        device_label = ctx.pci
+
+        return_code, output = self._ppcc_get(
+            plugin,
+            device_label,
+            tool,
+            collection_file_prefix,
+            output_subdir,
+            self._op_for_cmd_type(PpccCommand.ALGO_INFO_ARRAY),
+            self._BASE_REGISTER_INDEXES,
+        )
+        if return_code != 0:
+            return
+
+        present_algo_slots = sorted(
+            frozenset(self._get_algo_slot_indices(output))
+            | self._ALGO_SLOTS_COLLECT_STATUS_CMD_ONLY,
+        )
+
+        for algo_slot_index in present_algo_slots:
+            self._collect_single_algo_slot(
+                plugin,
+                tool,
+                collection_file_prefix,
+                output_subdir,
+                ctx,
+                algo_slot_index,
+            )
+
+    def _collect_with_mft(self, plugin, ctx):
+        """Collect PPCC data with the MFT ``mlxreg`` tool."""
+        mlxreg_tool = get_tool(MftTools.MLXREG, plugin, ctx)
+        self._collect_ppcc_data(plugin, mlxreg_tool, "mlxreg", ctx)
+
+    def _collect_with_mstflint(self, plugin, ctx):
+        """Collect PPCC data with the mstflint ``mstreg`` tool."""
+        mstreg_tool = get_tool(MstFlintTools.MSTREG, plugin, ctx)
+        self._collect_ppcc_data(plugin, mstreg_tool, "mstreg", ctx)
diff --git a/sos/report/mellanox_firmware_suite/tools/MFT/mlxreg.py b/sos/report/mellanox_firmware_suite/tools/MFT/mlxreg.py
index 45da706f22..ad814a90c7 100644
--- a/sos/report/mellanox_firmware_suite/tools/MFT/mlxreg.py
+++ b/sos/report/mellanox_firmware_suite/tools/MFT/mlxreg.py
@@ -9,3 +9,13 @@ def mlxreg_roce_accl_query(self, filename=None):
             "--get",
             filename=filename
         )
+
+    @supports_fwctl
+    def ppcc_get(self, op, indexes, filename=None, subdir=None):
+        op_str = ",".join(f"{k}={v}" for k, v in op.items())
+        return self.execute_cmd(
+            f'mlxreg -d {self.ctx.effective_device} --reg_name PPCC --get '
+            f'--op "{op_str}" --indexes "{indexes}"',
+            filename=filename,
+            subdir=subdir,
+        )
diff --git a/sos/report/mellanox_firmware_suite/tools/MSTFlint/mstreg.py b/sos/report/mellanox_firmware_suite/tools/MSTFlint/mstreg.py
index 09e87b3caa..bfb78270df 100644
--- a/sos/report/mellanox_firmware_suite/tools/MSTFlint/mstreg.py
+++ b/sos/report/mellanox_firmware_suite/tools/MSTFlint/mstreg.py
@@ -9,3 +9,13 @@ def mstreg_roce_accl_query(self, filename=None):
             "--get",
             filename=filename
         )
+
+    @supports_fwctl
+    def ppcc_get(self, op, indexes, filename=None, subdir=None):
+        op_str = ",".join(f"{k}={v}" for k, v in op.items())
+        return self.execute_cmd(
+            f'mstreg -d {self.ctx.effective_device} --reg_name PPCC --get '
+            f'--op "{op_str}" --indexes "{indexes}"',
+            filename=filename,
+            subdir=subdir,
+        )
diff --git a/sos/report/mellanox_firmware_suite/tools/base_tool.py b/sos/report/mellanox_firmware_suite/tools/base_tool.py
index e6bf3adfc8..d2accd8cc6 100644
--- a/sos/report/mellanox_firmware_suite/tools/base_tool.py
+++ b/sos/report/mellanox_firmware_suite/tools/base_tool.py
@@ -41,14 +41,15 @@ def execute_cmd(
         cache=True,
         get_cached=True,
         key=None,
-        filename=None
+        filename=None,
+        subdir=None,
     ):
         cache_key = key or cmd
 
         if get_cached and cache_key in self.ctx.cache:
             return self.ctx.cache[cache_key]
 
-        rc, output = self._run_command(cmd, timeout, filename)
+        rc, output = self._run_command(cmd, timeout, filename, subdir=subdir)
 
         if rc != 0:
             self.plugin._log_info(
@@ -60,7 +61,7 @@ def execute_cmd(
 
         return (rc, output)
 
-    def _run_command(self, cmd, timeout, filename):
+    def _run_command(self, cmd, timeout, filename, subdir=None):
         if filename is None:
             res = self.plugin.exec_cmd(cmd=cmd, timeout=timeout)
 
@@ -70,6 +71,7 @@ def _run_command(self, cmd, timeout, filename):
                 suggest_filename=filename,
                 timeout=timeout,
                 stderr=True,
+                subdir=subdir,
             )
 
         return res.get("status", 1), res.get("output", "")
diff --git a/sos/report/plugins/mellanox_firmware.py b/sos/report/plugins/mellanox_firmware.py
index 96a2b3dbd9..0ad6d2c412 100644
--- a/sos/report/plugins/mellanox_firmware.py
+++ b/sos/report/plugins/mellanox_firmware.py
@@ -9,7 +9,7 @@
 import re
 import shutil
 
-from sos.report.plugins import Plugin, IndependentPlugin
+from sos.report.plugins import Plugin, IndependentPlugin, PluginOpt
 from sos.report.mellanox_firmware_suite.tools import FirmwareTools
 from sos.report.mellanox_firmware_suite.device_context import DeviceContext
 from sos.report.mellanox_firmware_suite.collectors.collector_manager import (
@@ -40,6 +40,16 @@ class MellanoxFirmware(Plugin, IndependentPlugin):
     packages = ("mst", "mstflint")
     profiles = ("hardware", "system")
 
+    option_list = [
+        PluginOpt(
+            "pcc",
+            default=False,
+            desc=(
+                "Collect PCC-related information"
+            ),
+        ),
+    ]
+
     def __init__(self, commons):
         super().__init__(commons=commons)
 