]> Git Repo - linux.git/blobdiff - drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
drm/nouveau/kms: Don't change EDID when it hasn't actually changed
[linux.git] / drivers / gpu / drm / amd / amdgpu / amdgpu_device.c
index affde2de2a0dbd3f68f934c693ff7a906f9223a7..4204cda680f5668d6d0c284e285e5852c3b17c22 100644 (file)
 #include "amdgpu_xgmi.h"
 #include "amdgpu_ras.h"
 #include "amdgpu_pmu.h"
+#include "amdgpu_fru_eeprom.h"
 
 #include <linux/suspend.h>
 #include <drm/task_barrier.h>
+#include <linux/pm_runtime.h>
 
 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
@@ -78,6 +80,8 @@ MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
+MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
 
 #define AMDGPU_RESUME_MS               2000
 
@@ -110,6 +114,8 @@ const char *amdgpu_asic_name[] = {
        "NAVI10",
        "NAVI14",
        "NAVI12",
+       "SIENNA_CICHLID",
+       "NAVY_FLOUNDER",
        "LAST",
 };
 
@@ -137,6 +143,72 @@ static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 
 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 
+/**
+ * DOC: product_name
+ *
+ * The amdgpu driver provides a sysfs API for reporting the product name
+ * for the device
+ * The file product_name is used for this and returns the product name
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_name(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct drm_device *ddev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = ddev->dev_private;
+
+       return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
+}
+
+static DEVICE_ATTR(product_name, S_IRUGO,
+               amdgpu_device_get_product_name, NULL);
+
+/**
+ * DOC: product_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the part number
+ * for the device
+ * The file product_number is used for this and returns the part number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_product_number(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct drm_device *ddev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = ddev->dev_private;
+
+       return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
+}
+
+static DEVICE_ATTR(product_number, S_IRUGO,
+               amdgpu_device_get_product_number, NULL);
+
+/**
+ * DOC: serial_number
+ *
+ * The amdgpu driver provides a sysfs API for reporting the serial number
+ * for the device
+ * The file serial_number is used for this and returns the serial number
+ * as returned from the FRU.
+ * NOTE: This is only available for certain server cards
+ */
+
+static ssize_t amdgpu_device_get_serial_number(struct device *dev,
+               struct device_attribute *attr, char *buf)
+{
+       struct drm_device *ddev = dev_get_drvdata(dev);
+       struct amdgpu_device *adev = ddev->dev_private;
+
+       return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
+}
+
+static DEVICE_ATTR(serial_number, S_IRUGO,
+               amdgpu_device_get_serial_number, NULL);
+
 /**
  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
  *
@@ -247,10 +319,10 @@ uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 {
        uint32_t ret;
 
-       if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
                return amdgpu_kiq_rreg(adev, reg);
 
-       if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+       if ((reg * 4) < adev->rmmio_size)
                ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
        else {
                unsigned long flags;
@@ -310,7 +382,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
 {
        trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 
-       if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
+       if ((reg * 4) < adev->rmmio_size)
                writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
        else {
                unsigned long flags;
@@ -320,10 +392,6 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
                writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
                spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
        }
-
-       if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
-               udelay(500);
-       }
 }
 
 /**
@@ -339,11 +407,7 @@ void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg,
 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
                    uint32_t acc_flags)
 {
-       if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
-               adev->last_mm_index = v;
-       }
-
-       if ((acc_flags & AMDGPU_REGS_KIQ) || (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev)))
+       if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
                return amdgpu_kiq_wreg(adev, reg, v);
 
        amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
@@ -397,20 +461,12 @@ u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
  */
 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 {
-       if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
-               adev->last_mm_index = v;
-       }
-
        if ((reg * 4) < adev->rio_mem_size)
                iowrite32(v, adev->rio_mem + (reg * 4));
        else {
                iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
                iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
        }
-
-       if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
-               udelay(500);
-       }
 }
 
 /**
@@ -866,6 +922,11 @@ int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
        if (amdgpu_sriov_vf(adev))
                return 0;
 
+       /* skip if the bios has already enabled large BAR */
+       if (adev->gmc.real_vram_size &&
+           (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
+               return 0;
+
        /* Check if the root BUS has 64bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
@@ -1118,6 +1179,16 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
                amdgpu_vm_fragment_size = -1;
        }
 
+       if (amdgpu_sched_hw_submission < 2) {
+               dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
+                        amdgpu_sched_hw_submission);
+               amdgpu_sched_hw_submission = 2;
+       } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
+               dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
+                        amdgpu_sched_hw_submission);
+               amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
+       }
+
        amdgpu_device_check_smu_prv_buffer_size(adev);
 
        amdgpu_device_check_vm_size(adev);
@@ -1126,6 +1197,8 @@ static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
 
        adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
 
+       amdgpu_gmc_tmz_set(adev);
+
        return 0;
 }
 
@@ -1147,7 +1220,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero
                return;
 
        if (state == VGA_SWITCHEROO_ON) {
-               pr_info("amdgpu: switched on\n");
+               pr_info("switched on\n");
                /* don't suspend or resume card normally */
                dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
 
@@ -1161,7 +1234,7 @@ static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switchero
                dev->switch_power_state = DRM_SWITCH_POWER_ON;
                drm_kms_helper_poll_enable(dev);
        } else {
-               pr_info("amdgpu: switched off\n");
+               pr_info("switched off\n");
                drm_kms_helper_poll_disable(dev);
                dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
                amdgpu_device_suspend(dev, true);
@@ -1484,22 +1557,25 @@ static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 {
        const char *chip_name;
-       char fw_name[30];
+       char fw_name[40];
        int err;
        const struct gpu_info_firmware_header_v1_0 *hdr;
 
        adev->firmware.gpu_info_fw = NULL;
 
+       if (adev->discovery_bin) {
+               amdgpu_discovery_get_gfx_info(adev);
+
+               /*
+                * FIXME: The bounding box is still needed by Navi12, so
+                * temporarily read it from gpu_info firmware. Should be dropped
+                * when DAL no longer needs it.
+                */
+               if (adev->asic_type != CHIP_NAVI12)
+                       return 0;
+       }
+
        switch (adev->asic_type) {
-       case CHIP_TOPAZ:
-       case CHIP_TONGA:
-       case CHIP_FIJI:
-       case CHIP_POLARIS10:
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-       case CHIP_VEGAM:
-       case CHIP_CARRIZO:
-       case CHIP_STONEY:
 #ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_VERDE:
        case CHIP_TAHITI:
@@ -1514,6 +1590,15 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
        case CHIP_KABINI:
        case CHIP_MULLINS:
 #endif
+       case CHIP_TOPAZ:
+       case CHIP_TONGA:
+       case CHIP_FIJI:
+       case CHIP_POLARIS10:
+       case CHIP_POLARIS11:
+       case CHIP_POLARIS12:
+       case CHIP_VEGAM:
+       case CHIP_CARRIZO:
+       case CHIP_STONEY:
        case CHIP_VEGA20:
        default:
                return 0;
@@ -1524,9 +1609,9 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
                chip_name = "vega12";
                break;
        case CHIP_RAVEN:
-               if (adev->rev_id >= 8)
+               if (adev->apu_flags & AMD_APU_IS_RAVEN2)
                        chip_name = "raven2";
-               else if (adev->pdev->device == 0x15d8)
+               else if (adev->apu_flags & AMD_APU_IS_PICASSO)
                        chip_name = "picasso";
                else
                        chip_name = "raven";
@@ -1546,6 +1631,12 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
        case CHIP_NAVI12:
                chip_name = "navi12";
                break;
+       case CHIP_SIENNA_CICHLID:
+               chip_name = "sienna_cichlid";
+               break;
+       case CHIP_NAVY_FLOUNDER:
+               chip_name = "navy_flounder";
+               break;
        }
 
        snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
@@ -1574,7 +1665,10 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
                        (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
                                                                le32_to_cpu(hdr->header.ucode_array_offset_bytes));
 
-               if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
+               /*
+                * Should be dropped when DAL no longer needs it.
+                */
+               if (adev->asic_type == CHIP_NAVI12)
                        goto parse_soc_bounding_box;
 
                adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
@@ -1608,7 +1702,7 @@ static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
 parse_soc_bounding_box:
                /*
                 * soc bounding box info is not integrated in discovery table,
-                * we always need to parse it from gpu info firmware.
+                * we always need to parse it from gpu info firmware if needed.
                 */
                if (hdr->version_minor == 2) {
                        const struct gpu_info_firmware_v1_2 *gpu_info_fw =
@@ -1644,25 +1738,13 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
 
        amdgpu_device_enable_virtual_display(adev);
 
-       switch (adev->asic_type) {
-       case CHIP_TOPAZ:
-       case CHIP_TONGA:
-       case CHIP_FIJI:
-       case CHIP_POLARIS10:
-       case CHIP_POLARIS11:
-       case CHIP_POLARIS12:
-       case CHIP_VEGAM:
-       case CHIP_CARRIZO:
-       case CHIP_STONEY:
-               if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
-                       adev->family = AMDGPU_FAMILY_CZ;
-               else
-                       adev->family = AMDGPU_FAMILY_VI;
-
-               r = vi_set_ip_blocks(adev);
+       if (amdgpu_sriov_vf(adev)) {
+               r = amdgpu_virt_request_full_gpu(adev, true);
                if (r)
                        return r;
-               break;
+       }
+
+       switch (adev->asic_type) {
 #ifdef CONFIG_DRM_AMDGPU_SI
        case CHIP_VERDE:
        case CHIP_TAHITI:
@@ -1681,24 +1763,41 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
-               if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
-                       adev->family = AMDGPU_FAMILY_CI;
-               else
+               if (adev->flags & AMD_IS_APU)
                        adev->family = AMDGPU_FAMILY_KV;
+               else
+                       adev->family = AMDGPU_FAMILY_CI;
 
                r = cik_set_ip_blocks(adev);
                if (r)
                        return r;
                break;
 #endif
+       case CHIP_TOPAZ:
+       case CHIP_TONGA:
+       case CHIP_FIJI:
+       case CHIP_POLARIS10:
+       case CHIP_POLARIS11:
+       case CHIP_POLARIS12:
+       case CHIP_VEGAM:
+       case CHIP_CARRIZO:
+       case CHIP_STONEY:
+               if (adev->flags & AMD_IS_APU)
+                       adev->family = AMDGPU_FAMILY_CZ;
+               else
+                       adev->family = AMDGPU_FAMILY_VI;
+
+               r = vi_set_ip_blocks(adev);
+               if (r)
+                       return r;
+               break;
        case CHIP_VEGA10:
        case CHIP_VEGA12:
        case CHIP_VEGA20:
        case CHIP_RAVEN:
        case CHIP_ARCTURUS:
        case CHIP_RENOIR:
-               if (adev->asic_type == CHIP_RAVEN ||
-                   adev->asic_type == CHIP_RENOIR)
+               if (adev->flags & AMD_IS_APU)
                        adev->family = AMDGPU_FAMILY_RV;
                else
                        adev->family = AMDGPU_FAMILY_AI;
@@ -1710,6 +1809,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
        case  CHIP_NAVI10:
        case  CHIP_NAVI14:
        case  CHIP_NAVI12:
+       case  CHIP_SIENNA_CICHLID:
+       case  CHIP_NAVY_FLOUNDER:
                adev->family = AMDGPU_FAMILY_NV;
 
                r = nv_set_ip_blocks(adev);
@@ -1721,21 +1822,8 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
                return -EINVAL;
        }
 
-       r = amdgpu_device_parse_gpu_info_fw(adev);
-       if (r)
-               return r;
-
-       if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
-               amdgpu_discovery_get_gfx_info(adev);
-
        amdgpu_amdkfd_device_probe(adev);
 
-       if (amdgpu_sriov_vf(adev)) {
-               r = amdgpu_virt_request_full_gpu(adev, true);
-               if (r)
-                       return -EAGAIN;
-       }
-
        adev->pm.pp_feature = amdgpu_pp_feature_mask;
        if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
                adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
@@ -1763,6 +1851,10 @@ static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
                }
                /* get the vbios after the asic_funcs are set up */
                if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
+                       r = amdgpu_device_parse_gpu_info_fw(adev);
+                       if (r)
+                               return r;
+
                        /* Read BIOS */
                        if (!amdgpu_get_bios(adev))
                                return -EINVAL;
@@ -1975,6 +2067,8 @@ static int amdgpu_device_ip_init(struct amdgpu_device *adev)
                amdgpu_xgmi_add_device(adev);
        amdgpu_amdkfd_device_init(adev);
 
+       amdgpu_fru_get_product_info(adev);
+
 init_failed:
        if (amdgpu_sriov_vf(adev))
                amdgpu_virt_release_full_gpu(adev, true);
@@ -2171,6 +2265,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
                adev->ip_blocks[i].status.late_initialized = true;
        }
 
+       amdgpu_ras_set_error_query_ready(adev, true);
+
        amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
        amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
 
@@ -2203,7 +2299,8 @@ static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
                                if (gpu_instance->adev->flags & AMD_IS_APU)
                                        continue;
 
-                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev, 0);
+                               r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
+                                               AMDGPU_XGMI_PSTATE_MIN);
                                if (r) {
                                        DRM_ERROR("pstate setting failed (%d).\n", r);
                                        break;
@@ -2232,6 +2329,9 @@ static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
 {
        int i, r;
 
+       if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
+               amdgpu_virt_release_ras_err_handler_data(adev);
+
        amdgpu_ras_pre_fini(adev);
 
        if (adev->gmc.xgmi.num_physical_nodes > 1)
@@ -2362,18 +2462,21 @@ static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
        for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
                if (!adev->ip_blocks[i].status.valid)
                        continue;
+
                /* displays are handled separately */
-               if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
-                       /* XXX handle errors */
-                       r = adev->ip_blocks[i].version->funcs->suspend(adev);
-                       /* XXX handle errors */
-                       if (r) {
-                               DRM_ERROR("suspend of IP block <%s> failed %d\n",
-                                         adev->ip_blocks[i].version->funcs->name, r);
-                               return r;
-                       }
-                       adev->ip_blocks[i].status.hw = false;
+               if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
+                       continue;
+
+               /* XXX handle errors */
+               r = adev->ip_blocks[i].version->funcs->suspend(adev);
+               /* XXX handle errors */
+               if (r) {
+                       DRM_ERROR("suspend of IP block <%s> failed %d\n",
+                                 adev->ip_blocks[i].version->funcs->name, r);
+                       return r;
                }
+
+               adev->ip_blocks[i].status.hw = false;
        }
 
        return 0;
@@ -2471,6 +2574,9 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
                AMD_IP_BLOCK_TYPE_IH,
        };
 
+       for (i = 0; i < adev->num_ip_blocks; i++)
+               adev->ip_blocks[i].status.hw = false;
+
        for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
                int j;
                struct amdgpu_ip_block *block;
@@ -2478,7 +2584,6 @@ static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
                for (j = 0; j < adev->num_ip_blocks; j++) {
                        block = &adev->ip_blocks[j];
 
-                       block->status.hw = false;
                        if (block->version->type != ip_order[i] ||
                                !block->status.valid)
                                continue;
@@ -2702,6 +2807,10 @@ bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
        case CHIP_NAVI14:
        case CHIP_NAVI12:
        case CHIP_RENOIR:
+#endif
+#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
+       case CHIP_SIENNA_CICHLID:
+       case CHIP_NAVY_FLOUNDER:
 #endif
                return amdgpu_dc != 0;
 #endif
@@ -2785,12 +2894,12 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
         * By default timeout for non compute jobs is 10000.
         * And there is no timeout enforced on compute jobs.
         * In SR-IOV or passthrough mode, timeout for compute
-        * jobs are 10000 by default.
+        * jobs are 60000 by default.
         */
        adev->gfx_timeout = msecs_to_jiffies(10000);
        adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
        if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
-               adev->compute_timeout = adev->gfx_timeout;
+               adev->compute_timeout =  msecs_to_jiffies(60000);
        else
                adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
 
@@ -2841,6 +2950,14 @@ static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
        return ret;
 }
 
+static const struct attribute *amdgpu_dev_attributes[] = {
+       &dev_attr_product_name.attr,
+       &dev_attr_product_number.attr,
+       &dev_attr_serial_number.attr,
+       &dev_attr_pcie_replay_count.attr,
+       NULL
+};
+
 /**
  * amdgpu_device_init - initialize the driver
  *
@@ -2942,9 +3059,6 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        INIT_LIST_HEAD(&adev->shadow_list);
        mutex_init(&adev->shadow_list_lock);
 
-       INIT_LIST_HEAD(&adev->ring_lru_list);
-       spin_lock_init(&adev->ring_lru_list_lock);
-
        INIT_DELAYED_WORK(&adev->delayed_init_work,
                          amdgpu_device_delayed_init_work_handler);
        INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
@@ -2953,7 +3067,18 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
 
        adev->gfx.gfx_off_req_count = 1;
-       adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
+       adev->pm.ac_power = power_supply_is_system_supplied() > 0;
+
+       atomic_set(&adev->throttling_logging_enabled, 1);
+       /*
+        * If throttling continues, logging will be performed every minute
+        * to avoid log flooding. "-1" is subtracted since the thermal
+        * throttling interrupt comes every second. Thus, the total logging
+        * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
+        * for throttling interrupt) = 60 seconds.
+        */
+       ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
+       ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
 
        /* Registers mapping */
        /* TODO: block userspace mapping of io register */
@@ -3002,18 +3127,8 @@ int amdgpu_device_init(struct amdgpu_device *adev,
        if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
                adev->enable_mes = true;
 
-       if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
-               r = amdgpu_discovery_init(adev);
-               if (r) {
-                       dev_err(adev->dev, "amdgpu_discovery_init failed\n");
-                       return r;
-               }
-       }
-
-       /* early init functions */
-       r = amdgpu_device_ip_early_init(adev);
-       if (r)
-               return r;
+       /* detect hw virtualization here */
+       amdgpu_detect_virtualization(adev);
 
        r = amdgpu_device_get_job_timeout_settings(adev);
        if (r) {
@@ -3021,6 +3136,11 @@ int amdgpu_device_init(struct amdgpu_device *adev,
                return r;
        }
 
+       /* early init functions */
+       r = amdgpu_device_ip_early_init(adev);
+       if (r)
+               return r;
+
        /* doorbell bar mapping and doorbell index init*/
        amdgpu_device_doorbell_init(adev);
 
@@ -3127,14 +3247,13 @@ fence_driver_init:
                goto failed;
        }
 
-       DRM_DEBUG("SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
+       dev_info(adev->dev,
+               "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
                        adev->gfx.config.max_shader_engines,
                        adev->gfx.config.max_sh_per_se,
                        adev->gfx.config.max_cu_per_sh,
                        adev->gfx.cu_info.number);
 
-       amdgpu_ctx_init_sched(adev);
-
        adev->accel_working = true;
 
        amdgpu_vm_check_compute_bug(adev);
@@ -3199,9 +3318,12 @@ fence_driver_init:
        queue_delayed_work(system_wq, &adev->delayed_init_work,
                           msecs_to_jiffies(AMDGPU_RESUME_MS));
 
-       r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
+       if (amdgpu_sriov_vf(adev))
+               flush_delayed_work(&adev->delayed_init_work);
+
+       r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
        if (r) {
-               dev_err(adev->dev, "Could not create pcie_replay_count");
+               dev_err(adev->dev, "Could not create amdgpu device attr\n");
                return r;
        }
 
@@ -3255,10 +3377,8 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
                amdgpu_pm_sysfs_fini(adev);
        amdgpu_fbdev_fini(adev);
        r = amdgpu_device_ip_fini(adev);
-       if (adev->firmware.gpu_info_fw) {
-               release_firmware(adev->firmware.gpu_info_fw);
-               adev->firmware.gpu_info_fw = NULL;
-       }
+       release_firmware(adev->firmware.gpu_info_fw);
+       adev->firmware.gpu_info_fw = NULL;
        adev->accel_working = false;
        /* free i2c buses */
        if (!amdgpu_device_has_dc_support(adev))
@@ -3284,12 +3404,13 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
        adev->rmmio = NULL;
        amdgpu_device_doorbell_fini(adev);
 
-       device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
        if (adev->ucode_sysfs_en)
                amdgpu_ucode_sysfs_fini(adev);
+
+       sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
        if (IS_ENABLED(CONFIG_PERF_EVENTS))
                amdgpu_pmu_fini(adev);
-       if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
+       if (adev->discovery_bin)
                amdgpu_discovery_fini(adev);
 }
 
@@ -3301,7 +3422,6 @@ void amdgpu_device_fini(struct amdgpu_device *adev)
  * amdgpu_device_suspend - initiate device suspend
  *
  * @dev: drm dev pointer
- * @suspend: suspend state
  * @fbcon : notify the fbdev of suspend
  *
  * Puts the hw in the suspend state (all asics).
@@ -3398,7 +3518,6 @@ int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
  * amdgpu_device_resume - initiate device resume
  *
  * @dev: drm dev pointer
- * @resume: resume state
  * @fbcon : notify the fbdev of resume
  *
  * Bring the hw back to operating state (all asics).
@@ -3754,6 +3873,8 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
        if (r)
                return r;
 
+       amdgpu_amdkfd_pre_reset(adev);
+
        /* Resume IP prior to SMC */
        r = amdgpu_device_ip_reinit_early_sriov(adev);
        if (r)
@@ -3761,7 +3882,7 @@ static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
 
        amdgpu_virt_init_data_exchange(adev);
        /* we need recover gart prior to run SMC/CP/SDMA resume */
-       amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
+       amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
 
        r = amdgpu_device_fw_loading(adev);
        if (r)
@@ -3827,6 +3948,7 @@ bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
                case CHIP_NAVI10:
                case CHIP_NAVI14:
                case CHIP_NAVI12:
+               case CHIP_SIENNA_CICHLID:
                        break;
                default:
                        goto disabled;
@@ -3848,6 +3970,8 @@ static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
        int i, r = 0;
        bool need_full_reset  = *need_full_reset_arg;
 
+       amdgpu_debugfs_wait_dump(adev);
+
        /* block all schedulers and reset given job's ring */
        for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
                struct amdgpu_ring *ring = adev->rings[i];
@@ -3957,8 +4081,7 @@ static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
                                        amdgpu_inc_vram_lost(tmp_adev);
                                }
 
-                               r = amdgpu_gtt_mgr_recover(
-                                       &tmp_adev->mman.bdev.man[TTM_PL_TT]);
+                               r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
                                if (r)
                                        goto out;
 
@@ -4052,6 +4175,64 @@ static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
        mutex_unlock(&adev->lock_reset);
 }
 
+static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
+{
+       struct pci_dev *p = NULL;
+
+       p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
+                       adev->pdev->bus->number, 1);
+       if (p) {
+               pm_runtime_enable(&(p->dev));
+               pm_runtime_resume(&(p->dev));
+       }
+}
+
+static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
+{
+       enum amd_reset_method reset_method;
+       struct pci_dev *p = NULL;
+       u64 expires;
+
+       /*
+        * For now, only BACO and mode1 reset are confirmed
+        * to suffer the audio issue without proper suspended.
+        */
+       reset_method = amdgpu_asic_reset_method(adev);
+       if ((reset_method != AMD_RESET_METHOD_BACO) &&
+            (reset_method != AMD_RESET_METHOD_MODE1))
+               return -EINVAL;
+
+       p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
+                       adev->pdev->bus->number, 1);
+       if (!p)
+               return -ENODEV;
+
+       expires = pm_runtime_autosuspend_expiration(&(p->dev));
+       if (!expires)
+               /*
+                * If we cannot get the audio device autosuspend delay,
+                * a fixed 4S interval will be used. Considering 3S is
+                * the audio controller default autosuspend delay setting.
+                * 4S used here is guaranteed to cover that.
+                */
+               expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
+
+       while (!pm_runtime_status_suspended(&(p->dev))) {
+               if (!pm_runtime_suspend(&(p->dev)))
+                       break;
+
+               if (expires < ktime_get_mono_fast_ns()) {
+                       dev_warn(adev->dev, "failed to suspend display audio\n");
+                       /* TODO: abort the succeeding gpu reset? */
+                       return -ETIMEDOUT;
+               }
+       }
+
+       pm_runtime_disable(&(p->dev));
+
+       return 0;
+}
+
 /**
  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
  *
@@ -4067,36 +4248,32 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
                              struct amdgpu_job *job)
 {
        struct list_head device_list, *device_list_handle =  NULL;
-       bool need_full_reset, job_signaled;
+       bool need_full_reset = false;
+       bool job_signaled = false;
        struct amdgpu_hive_info *hive = NULL;
        struct amdgpu_device *tmp_adev = NULL;
        int i, r = 0;
-       bool in_ras_intr = amdgpu_ras_intr_triggered();
-       bool use_baco =
-               (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
-               true : false;
+       bool need_emergency_restart = false;
+       bool audio_suspended = false;
+
+       /**
+        * Special case: RAS triggered and full reset isn't supported
+        */
+       need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
 
        /*
         * Flush RAM to disk so that after reboot
         * the user can read log and see why the system rebooted.
         */
-       if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
-
+       if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
                DRM_WARN("Emergency reboot.");
 
                ksys_sync_helper();
                emergency_restart();
        }
 
-       need_full_reset = job_signaled = false;
-       INIT_LIST_HEAD(&device_list);
-
        dev_info(adev->dev, "GPU %s begin!\n",
-               (in_ras_intr && !use_baco) ? "jobs stop":"reset");
-
-       cancel_delayed_work_sync(&adev->delayed_init_work);
-
-       hive = amdgpu_get_xgmi_hive(adev, false);
+               need_emergency_restart ? "jobs stop":"reset");
 
        /*
         * Here we trylock to avoid chain of resets executing from
@@ -4105,39 +4282,25 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         * We always reset all schedulers for device and all devices for XGMI
         * hive so that should take care of them too.
         */
-
+       hive = amdgpu_get_xgmi_hive(adev, true);
        if (hive && !mutex_trylock(&hive->reset_lock)) {
                DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
                          job ? job->base.id : -1, hive->hive_id);
+               mutex_unlock(&hive->hive_lock);
                return 0;
        }
 
-       /* Start with adev pre asic reset first for soft reset check.*/
-       if (!amdgpu_device_lock_adev(adev, !hive)) {
-               DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
-                         job ? job->base.id : -1);
-               return 0;
-       }
-
-       /* Block kfd: SRIOV would do it separately */
-       if (!amdgpu_sriov_vf(adev))
-                amdgpu_amdkfd_pre_reset(adev);
-
-       /* Build list of devices to reset */
-       if  (adev->gmc.xgmi.num_physical_nodes > 1) {
-               if (!hive) {
-                       /*unlock kfd: SRIOV would do it separately */
-                       if (!amdgpu_sriov_vf(adev))
-                               amdgpu_amdkfd_post_reset(adev);
-                       amdgpu_device_unlock_adev(adev);
+       /*
+        * Build list of devices to reset.
+        * In case we are in XGMI hive mode, resort the device list
+        * to put adev in the 1st position.
+        */
+       INIT_LIST_HEAD(&device_list);
+       if (adev->gmc.xgmi.num_physical_nodes > 1) {
+               if (!hive)
                        return -ENODEV;
-               }
-
-               /*
-                * In case we are in XGMI hive mode device reset is done for all the
-                * nodes in the hive to retrain all XGMI links and hence the reset
-                * sequence is executed in loop on all nodes.
-                */
+               if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
+                       list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
                device_list_handle = &hive->device_list;
        } else {
                list_add_tail(&adev->gmc.xgmi.head, &device_list);
@@ -4146,22 +4309,43 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
        /* block all schedulers and reset given job's ring */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-               if (tmp_adev != adev) {
-                       amdgpu_device_lock_adev(tmp_adev, false);
-                       if (!amdgpu_sriov_vf(tmp_adev))
-                                       amdgpu_amdkfd_pre_reset(tmp_adev);
+               if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
+                       DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
+                                 job ? job->base.id : -1);
+                       mutex_unlock(&hive->hive_lock);
+                       return 0;
                }
 
+               /*
+                * Try to put the audio codec into suspend state
+                * before gpu reset started.
+                *
+                * Due to the power domain of the graphics device
+                * is shared with AZ power domain. Without this,
+                * we may change the audio hardware from behind
+                * the audio driver's back. That will trigger
+                * some audio codec errors.
+                */
+               if (!amdgpu_device_suspend_display_audio(tmp_adev))
+                       audio_suspended = true;
+
+               amdgpu_ras_set_error_query_ready(tmp_adev, false);
+
+               cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
+
+               if (!amdgpu_sriov_vf(tmp_adev))
+                       amdgpu_amdkfd_pre_reset(tmp_adev);
+
                /*
                 * Mark these ASICs to be reseted as untracked first
                 * And add them back after reset completed
                 */
                amdgpu_unregister_gpu_instance(tmp_adev);
 
-               amdgpu_fbdev_set_suspend(adev, 1);
+               amdgpu_fbdev_set_suspend(tmp_adev, 1);
 
                /* disable ras on ALL IPs */
-               if (!(in_ras_intr && !use_baco) &&
+               if (!need_emergency_restart &&
                      amdgpu_device_ip_need_full_reset(tmp_adev))
                        amdgpu_ras_suspend(tmp_adev);
 
@@ -4173,13 +4357,12 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
 
                        drm_sched_stop(&ring->sched, job ? &job->base : NULL);
 
-                       if (in_ras_intr && !use_baco)
+                       if (need_emergency_restart)
                                amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
                }
        }
 
-
-       if (in_ras_intr && !use_baco)
+       if (need_emergency_restart)
                goto skip_sched_resume;
 
        /*
@@ -4189,30 +4372,14 @@ int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
         * job->base holds a reference to parent fence
         */
        if (job && job->base.s_fence->parent &&
-           dma_fence_is_signaled(job->base.s_fence->parent))
+           dma_fence_is_signaled(job->base.s_fence->parent)) {
                job_signaled = true;
-
-       if (job_signaled) {
                dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
                goto skip_hw_reset;
        }
 
-
-       /* Guilty job will be freed after this*/
-       r = amdgpu_device_pre_asic_reset(adev, job, &need_full_reset);
-       if (r) {
-               /*TODO Should we stop ?*/
-               DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
-                         r, adev->ddev->unique);
-               adev->asic_reset_res = r;
-       }
-
 retry: /* Rest of adevs pre asic reset from XGMI hive. */
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
-
-               if (tmp_adev == adev)
-                       continue;
-
                r = amdgpu_device_pre_asic_reset(tmp_adev,
                                                 NULL,
                                                 &need_full_reset);
@@ -4272,13 +4439,17 @@ skip_hw_reset:
 skip_sched_resume:
        list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
                /*unlock kfd: SRIOV would do it separately */
-               if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
+               if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
                        amdgpu_amdkfd_post_reset(tmp_adev);
+               if (audio_suspended)
+                       amdgpu_device_resume_display_audio(tmp_adev);
                amdgpu_device_unlock_adev(tmp_adev);
        }
 
-       if (hive)
+       if (hive) {
                mutex_unlock(&hive->reset_lock);
+               mutex_unlock(&hive->hive_lock);
+       }
 
        if (r)
                dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
This page took 0.074679 seconds and 4 git commands to generate.