In case firmware has a bug and erroneously reports a status error
(e.g. device unusable) during boot, allow the user to tell the driver
to continue the boot regardless of the error status.
This will be done via kernel parameter which exposes a mask. The
user that loads the driver can decide exactly which status error to
ignore and which to take into account. The bitmask is according to
defines in hl_boot_if.h
Signed-off-by: Oded Gabbay <[email protected]>
+ if (err_exists && ((err_val & ~CPU_BOOT_ERR0_ENABLED) &
+ lower_32_bits(hdev->boot_error_status_mask)))
* @clock_gating_mask: is clock gating enabled. bitmask that represents the
* different engines. See debugfs-driver-habanalabs for
* details.
* @clock_gating_mask: is clock gating enabled. bitmask that represents the
* different engines. See debugfs-driver-habanalabs for
* details.
+ * @boot_error_status_mask: contains a mask of the device boot error status.
+ * Each bit represents a different error, according to
+ * the defines in hl_boot_if.h. If the bit is cleared,
+ * the error will be ignored by the driver during
+ * device initialization. Mainly used to debug and
+ * workaround firmware bugs
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @card_type: Various ASICs have several card types. This indicates the card
* @in_reset: is device in reset flow.
* @curr_pll_profile: current PLL profile.
* @card_type: Various ASICs have several card types. This indicates the card
u64 timeout_jiffies;
u64 max_power;
u64 clock_gating_mask;
u64 timeout_jiffies;
u64 max_power;
u64 clock_gating_mask;
+ u64 boot_error_status_mask;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum cpucp_card_types card_type;
atomic_t in_reset;
enum hl_pll_frequency curr_pll_profile;
enum cpucp_card_types card_type;
static int timeout_locked = 30;
static int reset_on_lockup = 1;
static int memory_scrub = 1;
static int timeout_locked = 30;
static int reset_on_lockup = 1;
static int memory_scrub = 1;
+static ulong boot_error_status_mask = ULONG_MAX;
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
module_param(timeout_locked, int, 0444);
MODULE_PARM_DESC(timeout_locked,
MODULE_PARM_DESC(memory_scrub,
"Scrub device memory in various states (0 = no, 1 = yes, default yes)");
MODULE_PARM_DESC(memory_scrub,
"Scrub device memory in various states (0 = no, 1 = yes, default yes)");
+module_param(boot_error_status_mask, ulong, 0444);
+MODULE_PARM_DESC(boot_error_status_mask,
+ "Mask of the error status during device CPU boot (If bitX is cleared then error X is masked. Default all 1's)");
+
#define PCI_VENDOR_ID_HABANALABS 0x1da3
#define PCI_IDS_GOYA 0x0001
#define PCI_VENDOR_ID_HABANALABS 0x1da3
#define PCI_IDS_GOYA 0x0001
hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup;
hdev->memory_scrub = memory_scrub;
hdev->major = hl_major;
hdev->reset_on_lockup = reset_on_lockup;
hdev->memory_scrub = memory_scrub;
+ hdev->boot_error_status_mask = boot_error_status_mask;
+
hdev->pldm = 0;
set_driver_behavior_per_device(hdev);
hdev->pldm = 0;
set_driver_behavior_per_device(hdev);