linux.git: drivers/gpu/drm/amd/amdgpu/amdgpu_device.c
1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83
84 #define AMDGPU_RESUME_MS                2000
85
86 const char *amdgpu_asic_name[] = {
87         "TAHITI",
88         "PITCAIRN",
89         "VERDE",
90         "OLAND",
91         "HAINAN",
92         "BONAIRE",
93         "KAVERI",
94         "KABINI",
95         "HAWAII",
96         "MULLINS",
97         "TOPAZ",
98         "TONGA",
99         "FIJI",
100         "CARRIZO",
101         "STONEY",
102         "POLARIS10",
103         "POLARIS11",
104         "POLARIS12",
105         "VEGAM",
106         "VEGA10",
107         "VEGA12",
108         "VEGA20",
109         "RAVEN",
110         "ARCTURUS",
111         "RENOIR",
112         "NAVI10",
113         "NAVI14",
114         "NAVI12",
115         "LAST",
116 };
117
118 /**
119  * DOC: pcie_replay_count
120  *
121  * The amdgpu driver provides a sysfs API for reporting the total number
122  * of PCIe replays (NAKs)
123  * The file pcie_replay_count is used for this and returns the total
124  * number of replays as a sum of the NAKs generated and NAKs received
125  */
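/*
 * Illustrative userspace read of this attribute (the card index in the
 * sysfs path is only an example and varies per system):
 *
 *   cat /sys/class/drm/card0/device/pcie_replay_count
 */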
126
127 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
128                 struct device_attribute *attr, char *buf)
129 {
130         struct drm_device *ddev = dev_get_drvdata(dev);
131         struct amdgpu_device *adev = ddev->dev_private;
132         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
133
134         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
135 }
136
137 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
138                 amdgpu_device_get_pcie_replay_count, NULL);
139
140 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
141
142 /**
143  * DOC: product_name
144  *
145  * The amdgpu driver provides a sysfs API for reporting the product name
146  * for the device
147  * The file product_name is used for this and returns the product name
148  * as returned from the FRU.
149  * NOTE: This is only available for certain server cards
150  */
151
152 static ssize_t amdgpu_device_get_product_name(struct device *dev,
153                 struct device_attribute *attr, char *buf)
154 {
155         struct drm_device *ddev = dev_get_drvdata(dev);
156         struct amdgpu_device *adev = ddev->dev_private;
157
158         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
159 }
160
161 static DEVICE_ATTR(product_name, S_IRUGO,
162                 amdgpu_device_get_product_name, NULL);
163
164 /**
165  * DOC: product_number
166  *
167  * The amdgpu driver provides a sysfs API for reporting the part number
168  * for the device
169  * The file product_number is used for this and returns the part number
170  * as returned from the FRU.
171  * NOTE: This is only available for certain server cards
172  */
173
174 static ssize_t amdgpu_device_get_product_number(struct device *dev,
175                 struct device_attribute *attr, char *buf)
176 {
177         struct drm_device *ddev = dev_get_drvdata(dev);
178         struct amdgpu_device *adev = ddev->dev_private;
179
180         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
181 }
182
183 static DEVICE_ATTR(product_number, S_IRUGO,
184                 amdgpu_device_get_product_number, NULL);
185
186 /**
187  * DOC: serial_number
188  *
189  * The amdgpu driver provides a sysfs API for reporting the serial number
190  * for the device
191  * The file serial_number is used for this and returns the serial number
192  * as returned from the FRU.
193  * NOTE: This is only available for certain server cards
194  */
195
196 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
197                 struct device_attribute *attr, char *buf)
198 {
199         struct drm_device *ddev = dev_get_drvdata(dev);
200         struct amdgpu_device *adev = ddev->dev_private;
201
202         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
203 }
204
205 static DEVICE_ATTR(serial_number, S_IRUGO,
206                 amdgpu_device_get_serial_number, NULL);
207
208 /**
209  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
210  *
211  * @dev: drm_device pointer
212  *
213  * Returns true if the device is a dGPU with HG/PX power control,
214  * otherwise returns false.
215  */
216 bool amdgpu_device_supports_boco(struct drm_device *dev)
217 {
218         struct amdgpu_device *adev = dev->dev_private;
219
220         if (adev->flags & AMD_IS_PX)
221                 return true;
222         return false;
223 }
224
225 /**
226  * amdgpu_device_supports_baco - Does the device support BACO
227  *
228  * @dev: drm_device pointer
229  *
230  * Returns true if the device supports BACO,
231  * otherwise returns false.
232  */
233 bool amdgpu_device_supports_baco(struct drm_device *dev)
234 {
235         struct amdgpu_device *adev = dev->dev_private;
236
237         return amdgpu_asic_supports_baco(adev);
238 }
239
240 /**
241  * VRAM access helper functions.
242  *
243  * amdgpu_device_vram_access - read/write a buffer in vram
244  *
245  * @adev: amdgpu_device pointer
246  * @pos: offset of the buffer in vram
247  * @buf: virtual address of the buffer in system memory
248  * @size: read/write size, the buffer pointed to by @buf must be at least @size bytes
249  * @write: true - write to vram, otherwise - read from vram
250  */
251 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
252                                uint32_t *buf, size_t size, bool write)
253 {
254         unsigned long flags;
255         uint32_t hi = ~0;
256         uint64_t last;
257
258
259 #ifdef CONFIG_64BIT
260         last = min(pos + size, adev->gmc.visible_vram_size);
261         if (last > pos) {
262                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
263                 size_t count = last - pos;
264
265                 if (write) {
266                         memcpy_toio(addr, buf, count);
267                         mb();
268                         amdgpu_asic_flush_hdp(adev, NULL);
269                 } else {
270                         amdgpu_asic_invalidate_hdp(adev, NULL);
271                         mb();
272                         memcpy_fromio(buf, addr, count);
273                 }
274
275                 if (count == size)
276                         return;
277
278                 pos += count;
279                 buf += count / 4;
280                 size -= count;
281         }
282 #endif
283
284         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
285         for (last = pos + size; pos < last; pos += 4) {
286                 uint32_t tmp = pos >> 31;
287
288                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
289                 if (tmp != hi) {
290                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
291                         hi = tmp;
292                 }
293                 if (write)
294                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
295                 else
296                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
297         }
298         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
299 }
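/*
 * Usage sketch (illustrative only, @vram_offset is a placeholder for an
 * offset the caller already owns): read one dword back from VRAM.
 *
 *   uint32_t val;
 *
 *   amdgpu_device_vram_access(adev, vram_offset, &val, sizeof(val), false);
 */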
300
301 /*
302  * device register access helper functions.
303  */
304 /**
305  * amdgpu_device_rreg - read a register
306  *
307  * @adev: amdgpu_device pointer
308  * @reg: dword aligned register offset
309  * @acc_flags: access flags which require special behavior
310  *
311  * Returns the 32 bit value from the offset specified.
312  */
313 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, uint32_t reg,
314                             uint32_t acc_flags)
315 {
316         uint32_t ret;
317
318         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
319                 return amdgpu_kiq_rreg(adev, reg);
320
321         if ((reg * 4) < adev->rmmio_size)
322                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
323         else
324                 ret = adev->pcie_rreg(adev, (reg * 4));
325         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
326         return ret;
327 }
328
329 /*
330  * MMIO byte-wide register read helper
331  * @offset: byte offset from MMIO start
332  *
333 */
334
335 /**
336  * amdgpu_mm_rreg8 - read a memory mapped IO register
337  *
338  * @adev: amdgpu_device pointer
339  * @offset: byte aligned register offset
340  *
341  * Returns the 8 bit value from the offset specified.
342  */
343 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
344         if (offset < adev->rmmio_size)
345                 return (readb(adev->rmmio + offset));
346         BUG();
347 }
348
349 /*
350  * MMIO byte-wide register write helper
351  * @offset: byte offset from MMIO start
352  * @value: the value to be written to the register
353  *
354 */
355 /**
356  * amdgpu_mm_wreg8 - write a memory mapped IO register
357  *
358  * @adev: amdgpu_device pointer
359  * @offset: byte aligned register offset
360  * @value: 8 bit value to write
361  *
362  * Writes the value specified to the offset specified.
363  */
364 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
365         if (offset < adev->rmmio_size)
366                 writeb(value, adev->rmmio + offset);
367         else
368                 BUG();
369 }
370
371 static inline void amdgpu_device_wreg_no_kiq(struct amdgpu_device *adev, uint32_t reg,
372                                              uint32_t v, uint32_t acc_flags)
373 {
374         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
375
376         if ((reg * 4) < adev->rmmio_size)
377                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
378         else
379                 adev->pcie_wreg(adev, (reg * 4), v);
380 }
381
382 /**
383  * amdgpu_device_wreg - write to a register
384  *
385  * @adev: amdgpu_device pointer
386  * @reg: dword aligned register offset
387  * @v: 32 bit value to write to the register
388  * @acc_flags: access flags which require special behavior
389  *
390  * Writes the value specified to the offset specified.
391  */
392 void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
393                         uint32_t acc_flags)
394 {
395         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
396                 return amdgpu_kiq_wreg(adev, reg, v);
397
398         amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
399 }
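/*
 * Illustrative read-modify-write sequence built on these helpers; the
 * register offset and bit mask below are placeholders, not real hardware
 * values. An acc_flags of 0 selects the default (possibly KIQ) path.
 *
 *   uint32_t tmp = amdgpu_device_rreg(adev, some_reg, 0);
 *
 *   tmp |= some_bit;
 *   amdgpu_device_wreg(adev, some_reg, tmp, 0);
 */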
400
401 /*
402  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if in range
403  *
404  * This function is invoked only for debugfs register access.
405  */
406 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
407                     uint32_t acc_flags)
408 {
409         if (amdgpu_sriov_fullaccess(adev) &&
410                 adev->gfx.rlc.funcs &&
411                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
412
413                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
414                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
415         }
416
417         amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
418 }
419
420 /**
421  * amdgpu_io_rreg - read an IO register
422  *
423  * @adev: amdgpu_device pointer
424  * @reg: dword aligned register offset
425  *
426  * Returns the 32 bit value from the offset specified.
427  */
428 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
429 {
430         if ((reg * 4) < adev->rio_mem_size)
431                 return ioread32(adev->rio_mem + (reg * 4));
432         else {
433                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
434                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
435         }
436 }
437
438 /**
439  * amdgpu_io_wreg - write to an IO register
440  *
441  * @adev: amdgpu_device pointer
442  * @reg: dword aligned register offset
443  * @v: 32 bit value to write to the register
444  *
445  * Writes the value specified to the offset specified.
446  */
447 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
448 {
449         if ((reg * 4) < adev->rio_mem_size)
450                 iowrite32(v, adev->rio_mem + (reg * 4));
451         else {
452                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
453                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
454         }
455 }
456
457 /**
458  * amdgpu_mm_rdoorbell - read a doorbell dword
459  *
460  * @adev: amdgpu_device pointer
461  * @index: doorbell index
462  *
463  * Returns the value in the doorbell aperture at the
464  * requested doorbell index (CIK).
465  */
466 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
467 {
468         if (index < adev->doorbell.num_doorbells) {
469                 return readl(adev->doorbell.ptr + index);
470         } else {
471                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
472                 return 0;
473         }
474 }
475
476 /**
477  * amdgpu_mm_wdoorbell - write a doorbell dword
478  *
479  * @adev: amdgpu_device pointer
480  * @index: doorbell index
481  * @v: value to write
482  *
483  * Writes @v to the doorbell aperture at the
484  * requested doorbell index (CIK).
485  */
486 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
487 {
488         if (index < adev->doorbell.num_doorbells) {
489                 writel(v, adev->doorbell.ptr + index);
490         } else {
491                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
492         }
493 }
494
495 /**
496  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
497  *
498  * @adev: amdgpu_device pointer
499  * @index: doorbell index
500  *
501  * Returns the value in the doorbell aperture at the
502  * requested doorbell index (VEGA10+).
503  */
504 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
505 {
506         if (index < adev->doorbell.num_doorbells) {
507                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
508         } else {
509                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
510                 return 0;
511         }
512 }
513
514 /**
515  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
516  *
517  * @adev: amdgpu_device pointer
518  * @index: doorbell index
519  * @v: value to write
520  *
521  * Writes @v to the doorbell aperture at the
522  * requested doorbell index (VEGA10+).
523  */
524 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
525 {
526         if (index < adev->doorbell.num_doorbells) {
527                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
528         } else {
529                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
530         }
531 }
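/*
 * Simplified example of how a ring typically uses the doorbell helpers:
 * new work is signalled by writing the ring's write pointer to its
 * assigned doorbell slot. The exact value written (and whether the 32 or
 * 64 bit variant is used) depends on the ring type, so treat this only
 * as a sketch.
 *
 *   amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 */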
532
533 /**
534  * amdgpu_invalid_rreg - dummy reg read function
535  *
536  * @adev: amdgpu device pointer
537  * @reg: offset of register
538  *
539  * Dummy register read function.  Used for register blocks
540  * that certain asics don't have (all asics).
541  * Returns the value in the register.
542  */
543 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
544 {
545         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
546         BUG();
547         return 0;
548 }
549
550 /**
551  * amdgpu_invalid_wreg - dummy reg write function
552  *
553  * @adev: amdgpu device pointer
554  * @reg: offset of register
555  * @v: value to write to the register
556  *
557  * Dummy register write function.  Used for register blocks
558  * that certain asics don't have (all asics).
559  */
560 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
561 {
562         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
563                   reg, v);
564         BUG();
565 }
566
567 /**
568  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
569  *
570  * @adev: amdgpu device pointer
571  * @reg: offset of register
572  *
573  * Dummy register read function.  Used for register blocks
574  * that certain asics don't have (all asics).
575  * Returns the value in the register.
576  */
577 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
578 {
579         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
580         BUG();
581         return 0;
582 }
583
584 /**
585  * amdgpu_invalid_wreg64 - dummy 64 bit reg write function
586  *
587  * @adev: amdgpu device pointer
588  * @reg: offset of register
589  * @v: value to write to the register
590  *
591  * Dummy register write function.  Used for register blocks
592  * that certain asics don't have (all asics).
593  */
594 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
595 {
596         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
597                   reg, v);
598         BUG();
599 }
600
601 /**
602  * amdgpu_block_invalid_rreg - dummy reg read function
603  *
604  * @adev: amdgpu device pointer
605  * @block: offset of instance
606  * @reg: offset of register
607  *
608  * Dummy register read function.  Used for register blocks
609  * that certain asics don't have (all asics).
610  * Returns the value in the register.
611  */
612 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
613                                           uint32_t block, uint32_t reg)
614 {
615         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
616                   reg, block);
617         BUG();
618         return 0;
619 }
620
621 /**
622  * amdgpu_block_invalid_wreg - dummy reg write function
623  *
624  * @adev: amdgpu device pointer
625  * @block: offset of instance
626  * @reg: offset of register
627  * @v: value to write to the register
628  *
629  * Dummy register write function.  Used for register blocks
630  * that certain asics don't have (all asics).
631  */
632 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
633                                       uint32_t block,
634                                       uint32_t reg, uint32_t v)
635 {
636         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
637                   reg, block, v);
638         BUG();
639 }
640
641 /**
642  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
643  *
644  * @adev: amdgpu device pointer
645  *
646  * Allocates a scratch page of VRAM for use by various things in the
647  * driver.
648  */
649 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
650 {
651         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
652                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
653                                        &adev->vram_scratch.robj,
654                                        &adev->vram_scratch.gpu_addr,
655                                        (void **)&adev->vram_scratch.ptr);
656 }
657
658 /**
659  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
660  *
661  * @adev: amdgpu device pointer
662  *
663  * Frees the VRAM scratch page.
664  */
665 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
666 {
667         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
668 }
669
670 /**
671  * amdgpu_device_program_register_sequence - program an array of registers.
672  *
673  * @adev: amdgpu_device pointer
674  * @registers: pointer to the register array
675  * @array_size: size of the register array
676  *
677  * Programs an array of registers with AND and OR masks.
678  * This is a helper for setting golden registers.
679  */
680 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
681                                              const u32 *registers,
682                                              const u32 array_size)
683 {
684         u32 tmp, reg, and_mask, or_mask;
685         int i;
686
687         if (array_size % 3)
688                 return;
689
690         for (i = 0; i < array_size; i += 3) {
691                 reg = registers[i + 0];
692                 and_mask = registers[i + 1];
693                 or_mask = registers[i + 2];
694
695                 if (and_mask == 0xffffffff) {
696                         tmp = or_mask;
697                 } else {
698                         tmp = RREG32(reg);
699                         tmp &= ~and_mask;
700                         if (adev->family >= AMDGPU_FAMILY_AI)
701                                 tmp |= (or_mask & and_mask);
702                         else
703                                 tmp |= or_mask;
704                 }
705                 WREG32(reg, tmp);
706         }
707 }
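/*
 * The register array is consumed as {offset, and_mask, or_mask} triplets.
 * A hypothetical two-entry golden settings list (register offsets and
 * masks are made up for illustration) could be programmed like this:
 *
 *   static const u32 example_golden_settings[] = {
 *           0x1234, 0xffffffff, 0x00000001,
 *           0x5678, 0x0000000f, 0x00000002,
 *   };
 *
 *   amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                           ARRAY_SIZE(example_golden_settings));
 */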
708
709 /**
710  * amdgpu_device_pci_config_reset - reset the GPU
711  *
712  * @adev: amdgpu_device pointer
713  *
714  * Resets the GPU using the pci config reset sequence.
715  * Only applicable to asics prior to vega10.
716  */
717 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
718 {
719         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
720 }
721
722 /*
723  * GPU doorbell aperture helpers function.
724  */
725 /**
726  * amdgpu_device_doorbell_init - Init doorbell driver information.
727  *
728  * @adev: amdgpu_device pointer
729  *
730  * Init doorbell driver information (CIK)
731  * Returns 0 on success, error on failure.
732  */
733 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
734 {
735
736         /* No doorbell on SI hardware generation */
737         if (adev->asic_type < CHIP_BONAIRE) {
738                 adev->doorbell.base = 0;
739                 adev->doorbell.size = 0;
740                 adev->doorbell.num_doorbells = 0;
741                 adev->doorbell.ptr = NULL;
742                 return 0;
743         }
744
745         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
746                 return -EINVAL;
747
748         amdgpu_asic_init_doorbell_index(adev);
749
750         /* doorbell bar mapping */
751         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
752         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
753
754         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
755                                              adev->doorbell_index.max_assignment+1);
756         if (adev->doorbell.num_doorbells == 0)
757                 return -EINVAL;
758
759         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
760          * paging queue doorbells use the second page. The
761          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
762          * doorbells are in the first page. So with paging queue enabled,
763          * the max num_doorbells should be extended by one page (0x400 dwords).
764          */
765         if (adev->asic_type >= CHIP_VEGA10)
766                 adev->doorbell.num_doorbells += 0x400;
767
768         adev->doorbell.ptr = ioremap(adev->doorbell.base,
769                                      adev->doorbell.num_doorbells *
770                                      sizeof(u32));
771         if (adev->doorbell.ptr == NULL)
772                 return -ENOMEM;
773
774         return 0;
775 }
776
777 /**
778  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
779  *
780  * @adev: amdgpu_device pointer
781  *
782  * Tear down doorbell driver information (CIK)
783  */
784 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
785 {
786         iounmap(adev->doorbell.ptr);
787         adev->doorbell.ptr = NULL;
788 }
789
790
791
792 /*
793  * amdgpu_device_wb_*()
794  * Writeback is the method by which the GPU updates special pages in memory
795  * with the status of certain GPU events (fences, ring pointers,etc.).
796  */
797
798 /**
799  * amdgpu_device_wb_fini - Disable Writeback and free memory
800  *
801  * @adev: amdgpu_device pointer
802  *
803  * Disables Writeback and frees the Writeback memory (all asics).
804  * Used at driver shutdown.
805  */
806 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
807 {
808         if (adev->wb.wb_obj) {
809                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
810                                       &adev->wb.gpu_addr,
811                                       (void **)&adev->wb.wb);
812                 adev->wb.wb_obj = NULL;
813         }
814 }
815
816 /**
817  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
818  *
819  * @adev: amdgpu_device pointer
820  *
821  * Initializes writeback and allocates writeback memory (all asics).
822  * Used at driver startup.
823  * Returns 0 on success or a negative error code on failure.
824  */
825 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
826 {
827         int r;
828
829         if (adev->wb.wb_obj == NULL) {
830                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
831                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
832                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
833                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
834                                             (void **)&adev->wb.wb);
835                 if (r) {
836                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
837                         return r;
838                 }
839
840                 adev->wb.num_wb = AMDGPU_MAX_WB;
841                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
842
843                 /* clear wb memory */
844                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
845         }
846
847         return 0;
848 }
849
850 /**
851  * amdgpu_device_wb_get - Allocate a wb entry
852  *
853  * @adev: amdgpu_device pointer
854  * @wb: wb index
855  *
856  * Allocate a wb slot for use by the driver (all asics).
857  * Returns 0 on success or -EINVAL on failure.
858  */
859 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
860 {
861         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
862
863         if (offset < adev->wb.num_wb) {
864                 __set_bit(offset, adev->wb.used);
865                 *wb = offset << 3; /* convert to dw offset */
866                 return 0;
867         } else {
868                 return -EINVAL;
869         }
870 }
871
872 /**
873  * amdgpu_device_wb_free - Free a wb entry
874  *
875  * @adev: amdgpu_device pointer
876  * @wb: wb index
877  *
878  * Free a wb slot allocated for use by the driver (all asics)
879  */
880 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
881 {
882         wb >>= 3;
883         if (wb < adev->wb.num_wb)
884                 __clear_bit(wb, adev->wb.used);
885 }
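/*
 * Typical allocation pattern (sketch): a caller grabs a slot, derives the
 * CPU and GPU views of that dword from adev->wb.wb and adev->wb.gpu_addr,
 * and frees the slot again on teardown.
 *
 *   u32 wb_idx;
 *
 *   if (!amdgpu_device_wb_get(adev, &wb_idx)) {
 *           volatile u32 *cpu_ptr = &adev->wb.wb[wb_idx];
 *           u64 gpu_addr = adev->wb.gpu_addr + wb_idx * 4;
 *
 *           ... have the GPU write status to gpu_addr, poll *cpu_ptr ...
 *
 *           amdgpu_device_wb_free(adev, wb_idx);
 *   }
 */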
886
887 /**
888  * amdgpu_device_resize_fb_bar - try to resize FB BAR
889  *
890  * @adev: amdgpu_device pointer
891  *
892  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
893  * to fail, but if any of the BARs is not accessible after the resize we abort
894  * driver loading by returning -ENODEV.
895  */
896 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
897 {
898         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
899         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
900         struct pci_bus *root;
901         struct resource *res;
902         unsigned i;
903         u16 cmd;
904         int r;
905
906         /* Bypass for VF */
907         if (amdgpu_sriov_vf(adev))
908                 return 0;
909
910         /* Check if the root BUS has 64bit memory resources */
911         root = adev->pdev->bus;
912         while (root->parent)
913                 root = root->parent;
914
915         pci_bus_for_each_resource(root, res, i) {
916                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
917                     res->start > 0x100000000ull)
918                         break;
919         }
920
921         /* Trying to resize is pointless without a root hub window above 4GB */
922         if (!res)
923                 return 0;
924
925         /* Disable memory decoding while we change the BAR addresses and size */
926         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
927         pci_write_config_word(adev->pdev, PCI_COMMAND,
928                               cmd & ~PCI_COMMAND_MEMORY);
929
930         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
931         amdgpu_device_doorbell_fini(adev);
932         if (adev->asic_type >= CHIP_BONAIRE)
933                 pci_release_resource(adev->pdev, 2);
934
935         pci_release_resource(adev->pdev, 0);
936
937         r = pci_resize_resource(adev->pdev, 0, rbar_size);
938         if (r == -ENOSPC)
939                 DRM_INFO("Not enough PCI address space for a large BAR.");
940         else if (r && r != -ENOTSUPP)
941                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
942
943         pci_assign_unassigned_bus_resources(adev->pdev->bus);
944
945         /* When the doorbell or fb BAR isn't available we have no chance of
946          * using the device.
947          */
948         r = amdgpu_device_doorbell_init(adev);
949         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
950                 return -ENODEV;
951
952         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
953
954         return 0;
955 }
956
957 /*
958  * GPU helpers function.
959  */
960 /**
961  * amdgpu_device_need_post - check if the hw needs post or not
962  *
963  * @adev: amdgpu_device pointer
964  *
965  * Check if the asic has been initialized (all asics) at driver startup,
966  * or whether post is needed because a hw reset was performed.
967  * Returns true if post is needed, false if not.
968  */
969 bool amdgpu_device_need_post(struct amdgpu_device *adev)
970 {
971         uint32_t reg;
972
973         if (amdgpu_sriov_vf(adev))
974                 return false;
975
976         if (amdgpu_passthrough(adev)) {
977                 /* for FIJI: In the whole-GPU pass-through virtualization case, after a VM
978                  * reboot some old SMC firmware still needs the driver to do vPost, otherwise
979                  * the GPU hangs. SMC firmware versions above 22.15 don't have this flaw, so
980                  * force vPost for SMC versions below 22.15.
981                  */
982                 if (adev->asic_type == CHIP_FIJI) {
983                         int err;
984                         uint32_t fw_ver;
985                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
986                         /* force vPost if an error occurred */
987                         if (err)
988                                 return true;
989
990                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
991                         if (fw_ver < 0x00160e00)
992                                 return true;
993                 }
994         }
995
996         if (adev->has_hw_reset) {
997                 adev->has_hw_reset = false;
998                 return true;
999         }
1000
1001         /* bios scratch used on CIK+ */
1002         if (adev->asic_type >= CHIP_BONAIRE)
1003                 return amdgpu_atombios_scratch_need_asic_init(adev);
1004
1005         /* check MEM_SIZE for older asics */
1006         reg = amdgpu_asic_get_config_memsize(adev);
1007
1008         if ((reg != 0) && (reg != 0xffffffff))
1009                 return false;
1010
1011         return true;
1012 }
1013
1014 /* if we get transitioned to only one device, take VGA back */
1015 /**
1016  * amdgpu_device_vga_set_decode - enable/disable vga decode
1017  *
1018  * @cookie: amdgpu_device pointer
1019  * @state: enable/disable vga decode
1020  *
1021  * Enable/disable vga decode (all asics).
1022  * Returns VGA resource flags.
1023  */
1024 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1025 {
1026         struct amdgpu_device *adev = cookie;
1027         amdgpu_asic_set_vga_state(adev, state);
1028         if (state)
1029                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1030                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1031         else
1032                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1033 }
1034
1035 /**
1036  * amdgpu_device_check_block_size - validate the vm block size
1037  *
1038  * @adev: amdgpu_device pointer
1039  *
1040  * Validates the vm block size specified via module parameter.
1041  * The vm block size defines the number of bits in page table versus page directory,
1042  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1043  * page table and the remaining bits are in the page directory.
1044  */
1045 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1046 {
1047         /* defines number of bits in page table versus page directory,
1048          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1049          * page table and the remaining bits are in the page directory */
1050         if (amdgpu_vm_block_size == -1)
1051                 return;
1052
1053         if (amdgpu_vm_block_size < 9) {
1054                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1055                          amdgpu_vm_block_size);
1056                 amdgpu_vm_block_size = -1;
1057         }
1058 }
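/*
 * Worked example: with 4KB pages (12 offset bits) and a vm_block_size of 9,
 * one page-table block resolves 2^9 = 512 pages, i.e. 2MB of address space;
 * the remaining virtual address bits are handled by the page directory.
 */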
1059
1060 /**
1061  * amdgpu_device_check_vm_size - validate the vm size
1062  *
1063  * @adev: amdgpu_device pointer
1064  *
1065  * Validates the vm size in GB specified via module parameter.
1066  * The VM size is the size of the GPU virtual memory space in GB.
1067  */
1068 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1069 {
1070         /* no need to check the default value */
1071         if (amdgpu_vm_size == -1)
1072                 return;
1073
1074         if (amdgpu_vm_size < 1) {
1075                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1076                          amdgpu_vm_size);
1077                 amdgpu_vm_size = -1;
1078         }
1079 }
1080
1081 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1082 {
1083         struct sysinfo si;
1084         bool is_os_64 = (sizeof(void *) == 8);
1085         uint64_t total_memory;
1086         uint64_t dram_size_seven_GB = 0x1B8000000;
1087         uint64_t dram_size_three_GB = 0xB8000000;
1088
1089         if (amdgpu_smu_memory_pool_size == 0)
1090                 return;
1091
1092         if (!is_os_64) {
1093                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1094                 goto def_value;
1095         }
1096         si_meminfo(&si);
1097         total_memory = (uint64_t)si.totalram * si.mem_unit;
1098
1099         if ((amdgpu_smu_memory_pool_size == 1) ||
1100                 (amdgpu_smu_memory_pool_size == 2)) {
1101                 if (total_memory < dram_size_three_GB)
1102                         goto def_value1;
1103         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1104                 (amdgpu_smu_memory_pool_size == 8)) {
1105                 if (total_memory < dram_size_seven_GB)
1106                         goto def_value1;
1107         } else {
1108                 DRM_WARN("Smu memory pool size not supported\n");
1109                 goto def_value;
1110         }
1111         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1112
1113         return;
1114
1115 def_value1:
1116         DRM_WARN("No enough system memory\n");
1117 def_value:
1118         adev->pm.smu_prv_buffer_size = 0;
1119 }
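/*
 * Note on the shift above: the pool size is specified in units of 256MB
 * (1 << 28 bytes), so amdgpu_smu_memory_pool_size values of 1, 2, 4 and 8
 * request reserved pools of 256MB, 512MB, 1GB and 2GB respectively.
 */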
1120
1121 /**
1122  * amdgpu_device_check_arguments - validate module params
1123  *
1124  * @adev: amdgpu_device pointer
1125  *
1126  * Validates certain module parameters and updates
1127  * the associated values used by the driver (all asics).
1128  */
1129 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1130 {
1131         if (amdgpu_sched_jobs < 4) {
1132                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1133                          amdgpu_sched_jobs);
1134                 amdgpu_sched_jobs = 4;
1135         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1136                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1137                          amdgpu_sched_jobs);
1138                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1139         }
1140
1141         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1142                 /* gart size must be greater or equal to 32M */
1143                 dev_warn(adev->dev, "gart size (%d) too small\n",
1144                          amdgpu_gart_size);
1145                 amdgpu_gart_size = -1;
1146         }
1147
1148         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1149                 /* gtt size must be greater or equal to 32M */
1150                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1151                                  amdgpu_gtt_size);
1152                 amdgpu_gtt_size = -1;
1153         }
1154
1155         /* valid range is between 4 and 9 inclusive */
1156         if (amdgpu_vm_fragment_size != -1 &&
1157             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1158                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1159                 amdgpu_vm_fragment_size = -1;
1160         }
1161
1162         amdgpu_device_check_smu_prv_buffer_size(adev);
1163
1164         amdgpu_device_check_vm_size(adev);
1165
1166         amdgpu_device_check_block_size(adev);
1167
1168         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1169
1170         amdgpu_gmc_tmz_set(adev);
1171
1172         return 0;
1173 }
1174
1175 /**
1176  * amdgpu_switcheroo_set_state - set switcheroo state
1177  *
1178  * @pdev: pci dev pointer
1179  * @state: vga_switcheroo state
1180  *
1181  * Callback for the switcheroo driver.  Suspends or resumes
1182  * the asic before or after it is powered up using ACPI methods.
1183  */
1184 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1185 {
1186         struct drm_device *dev = pci_get_drvdata(pdev);
1187         int r;
1188
1189         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1190                 return;
1191
1192         if (state == VGA_SWITCHEROO_ON) {
1193                 pr_info("switched on\n");
1194                 /* don't suspend or resume card normally */
1195                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1196
1197                 pci_set_power_state(dev->pdev, PCI_D0);
1198                 pci_restore_state(dev->pdev);
1199                 r = pci_enable_device(dev->pdev);
1200                 if (r)
1201                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1202                 amdgpu_device_resume(dev, true);
1203
1204                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1205                 drm_kms_helper_poll_enable(dev);
1206         } else {
1207                 pr_info("switched off\n");
1208                 drm_kms_helper_poll_disable(dev);
1209                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1210                 amdgpu_device_suspend(dev, true);
1211                 pci_save_state(dev->pdev);
1212                 /* Shut down the device */
1213                 pci_disable_device(dev->pdev);
1214                 pci_set_power_state(dev->pdev, PCI_D3cold);
1215                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1216         }
1217 }
1218
1219 /**
1220  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1221  *
1222  * @pdev: pci dev pointer
1223  *
1224  * Callback for the switcheroo driver.  Checks if the switcheroo
1225  * state can be changed.
1226  * Returns true if the state can be changed, false if not.
1227  */
1228 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1229 {
1230         struct drm_device *dev = pci_get_drvdata(pdev);
1231
1232         /*
1233         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1234         * locking inversion with the driver load path. And the access here is
1235         * completely racy anyway. So don't bother with locking for now.
1236         */
1237         return atomic_read(&dev->open_count) == 0;
1238 }
1239
1240 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1241         .set_gpu_state = amdgpu_switcheroo_set_state,
1242         .reprobe = NULL,
1243         .can_switch = amdgpu_switcheroo_can_switch,
1244 };
1245
1246 /**
1247  * amdgpu_device_ip_set_clockgating_state - set the CG state
1248  *
1249  * @dev: amdgpu_device pointer
1250  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1251  * @state: clockgating state (gate or ungate)
1252  *
1253  * Sets the requested clockgating state for all instances of
1254  * the hardware IP specified.
1255  * Returns the error code from the last instance.
1256  */
1257 int amdgpu_device_ip_set_clockgating_state(void *dev,
1258                                            enum amd_ip_block_type block_type,
1259                                            enum amd_clockgating_state state)
1260 {
1261         struct amdgpu_device *adev = dev;
1262         int i, r = 0;
1263
1264         for (i = 0; i < adev->num_ip_blocks; i++) {
1265                 if (!adev->ip_blocks[i].status.valid)
1266                         continue;
1267                 if (adev->ip_blocks[i].version->type != block_type)
1268                         continue;
1269                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1270                         continue;
1271                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1272                         (void *)adev, state);
1273                 if (r)
1274                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1275                                   adev->ip_blocks[i].version->funcs->name, r);
1276         }
1277         return r;
1278 }
1279
1280 /**
1281  * amdgpu_device_ip_set_powergating_state - set the PG state
1282  *
1283  * @dev: amdgpu_device pointer
1284  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1285  * @state: powergating state (gate or ungate)
1286  *
1287  * Sets the requested powergating state for all instances of
1288  * the hardware IP specified.
1289  * Returns the error code from the last instance.
1290  */
1291 int amdgpu_device_ip_set_powergating_state(void *dev,
1292                                            enum amd_ip_block_type block_type,
1293                                            enum amd_powergating_state state)
1294 {
1295         struct amdgpu_device *adev = dev;
1296         int i, r = 0;
1297
1298         for (i = 0; i < adev->num_ip_blocks; i++) {
1299                 if (!adev->ip_blocks[i].status.valid)
1300                         continue;
1301                 if (adev->ip_blocks[i].version->type != block_type)
1302                         continue;
1303                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1304                         continue;
1305                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1306                         (void *)adev, state);
1307                 if (r)
1308                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1309                                   adev->ip_blocks[i].version->funcs->name, r);
1310         }
1311         return r;
1312 }
1313
1314 /**
1315  * amdgpu_device_ip_get_clockgating_state - get the CG state
1316  *
1317  * @adev: amdgpu_device pointer
1318  * @flags: clockgating feature flags
1319  *
1320  * Walks the list of IPs on the device and updates the clockgating
1321  * flags for each IP.
1322  * Updates @flags with the feature flags for each hardware IP where
1323  * clockgating is enabled.
1324  */
1325 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1326                                             u32 *flags)
1327 {
1328         int i;
1329
1330         for (i = 0; i < adev->num_ip_blocks; i++) {
1331                 if (!adev->ip_blocks[i].status.valid)
1332                         continue;
1333                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1334                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1335         }
1336 }
1337
1338 /**
1339  * amdgpu_device_ip_wait_for_idle - wait for idle
1340  *
1341  * @adev: amdgpu_device pointer
1342  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1343  *
1344  * Waits for the requested hardware IP to be idle.
1345  * Returns 0 for success or a negative error code on failure.
1346  */
1347 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1348                                    enum amd_ip_block_type block_type)
1349 {
1350         int i, r;
1351
1352         for (i = 0; i < adev->num_ip_blocks; i++) {
1353                 if (!adev->ip_blocks[i].status.valid)
1354                         continue;
1355                 if (adev->ip_blocks[i].version->type == block_type) {
1356                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1357                         if (r)
1358                                 return r;
1359                         break;
1360                 }
1361         }
1362         return 0;
1363
1364 }
1365
1366 /**
1367  * amdgpu_device_ip_is_idle - is the hardware IP idle
1368  *
1369  * @adev: amdgpu_device pointer
1370  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1371  *
1372  * Check if the hardware IP is idle or not.
1373  * Returns true if the IP is idle, false if not.
1374  */
1375 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1376                               enum amd_ip_block_type block_type)
1377 {
1378         int i;
1379
1380         for (i = 0; i < adev->num_ip_blocks; i++) {
1381                 if (!adev->ip_blocks[i].status.valid)
1382                         continue;
1383                 if (adev->ip_blocks[i].version->type == block_type)
1384                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1385         }
1386         return true;
1387
1388 }
1389
1390 /**
1391  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1392  *
1393  * @adev: amdgpu_device pointer
1394  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1395  *
1396  * Returns a pointer to the hardware IP block structure
1397  * if it exists for the asic, otherwise NULL.
1398  */
1399 struct amdgpu_ip_block *
1400 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1401                               enum amd_ip_block_type type)
1402 {
1403         int i;
1404
1405         for (i = 0; i < adev->num_ip_blocks; i++)
1406                 if (adev->ip_blocks[i].version->type == type)
1407                         return &adev->ip_blocks[i];
1408
1409         return NULL;
1410 }
1411
1412 /**
1413  * amdgpu_device_ip_block_version_cmp
1414  *
1415  * @adev: amdgpu_device pointer
1416  * @type: enum amd_ip_block_type
1417  * @major: major version
1418  * @minor: minor version
1419  *
1420  * Returns 0 if the installed IP block version is equal or greater,
1421  * 1 if it is smaller or the ip_block doesn't exist.
1422  */
1423 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1424                                        enum amd_ip_block_type type,
1425                                        u32 major, u32 minor)
1426 {
1427         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1428
1429         if (ip_block && ((ip_block->version->major > major) ||
1430                         ((ip_block->version->major == major) &&
1431                         (ip_block->version->minor >= minor))))
1432                 return 0;
1433
1434         return 1;
1435 }
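/*
 * Hypothetical usage: gate a code path on GFX IP version 8.1 or newer.
 *
 *   if (amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX,
 *                                          8, 1) == 0) {
 *           ... GFX 8.1+ specific setup ...
 *   }
 */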
1436
1437 /**
1438  * amdgpu_device_ip_block_add
1439  *
1440  * @adev: amdgpu_device pointer
1441  * @ip_block_version: pointer to the IP to add
1442  *
1443  * Adds the IP block driver information to the collection of IPs
1444  * on the asic.
1445  */
1446 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1447                                const struct amdgpu_ip_block_version *ip_block_version)
1448 {
1449         if (!ip_block_version)
1450                 return -EINVAL;
1451
1452         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1453                   ip_block_version->funcs->name);
1454
1455         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1456
1457         return 0;
1458 }
1459
1460 /**
1461  * amdgpu_device_enable_virtual_display - enable virtual display feature
1462  *
1463  * @adev: amdgpu_device pointer
1464  *
1465  * Enables the virtual display feature if the user has enabled it via
1466  * the module parameter virtual_display.  This feature provides a virtual
1467  * display hardware on headless boards or in virtualized environments.
1468  * This function parses and validates the configuration string specified by
1469  * the user and configures the virtual display configuration (number of
1470  * virtual connectors, crtcs, etc.) specified.
1471  */
1472 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1473 {
1474         adev->enable_virtual_display = false;
1475
1476         if (amdgpu_virtual_display) {
1477                 struct drm_device *ddev = adev->ddev;
1478                 const char *pci_address_name = pci_name(ddev->pdev);
1479                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1480
1481                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1482                 pciaddstr_tmp = pciaddstr;
1483                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1484                         pciaddname = strsep(&pciaddname_tmp, ",");
1485                         if (!strcmp("all", pciaddname)
1486                             || !strcmp(pci_address_name, pciaddname)) {
1487                                 long num_crtc;
1488                                 int res = -1;
1489
1490                                 adev->enable_virtual_display = true;
1491
1492                                 if (pciaddname_tmp)
1493                                         res = kstrtol(pciaddname_tmp, 10,
1494                                                       &num_crtc);
1495
1496                                 if (!res) {
1497                                         if (num_crtc < 1)
1498                                                 num_crtc = 1;
1499                                         if (num_crtc > 6)
1500                                                 num_crtc = 6;
1501                                         adev->mode_info.num_crtc = num_crtc;
1502                                 } else {
1503                                         adev->mode_info.num_crtc = 1;
1504                                 }
1505                                 break;
1506                         }
1507                 }
1508
1509                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1510                          amdgpu_virtual_display, pci_address_name,
1511                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1512
1513                 kfree(pciaddstr);
1514         }
1515 }
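/*
 * Example module parameter values (PCI addresses are placeholders); each
 * ';'-separated entry is "<pci address>[,<number of crtcs>]":
 *
 *   amdgpu.virtual_display=0000:01:00.0,2    two virtual crtcs on that GPU
 *   amdgpu.virtual_display=all,1             one virtual crtc on every amdgpu device
 */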
1516
1517 /**
1518  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1519  *
1520  * @adev: amdgpu_device pointer
1521  *
1522  * Parses the asic configuration parameters specified in the gpu info
1523  * firmware and makes them available to the driver for use in configuring
1524  * the asic.
1525  * Returns 0 on success, -EINVAL on failure.
1526  */
1527 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1528 {
1529         const char *chip_name;
1530         char fw_name[30];
1531         int err;
1532         const struct gpu_info_firmware_header_v1_0 *hdr;
1533
1534         adev->firmware.gpu_info_fw = NULL;
1535
1536         switch (adev->asic_type) {
1537         case CHIP_TOPAZ:
1538         case CHIP_TONGA:
1539         case CHIP_FIJI:
1540         case CHIP_POLARIS10:
1541         case CHIP_POLARIS11:
1542         case CHIP_POLARIS12:
1543         case CHIP_VEGAM:
1544         case CHIP_CARRIZO:
1545         case CHIP_STONEY:
1546 #ifdef CONFIG_DRM_AMDGPU_SI
1547         case CHIP_VERDE:
1548         case CHIP_TAHITI:
1549         case CHIP_PITCAIRN:
1550         case CHIP_OLAND:
1551         case CHIP_HAINAN:
1552 #endif
1553 #ifdef CONFIG_DRM_AMDGPU_CIK
1554         case CHIP_BONAIRE:
1555         case CHIP_HAWAII:
1556         case CHIP_KAVERI:
1557         case CHIP_KABINI:
1558         case CHIP_MULLINS:
1559 #endif
1560         case CHIP_VEGA20:
1561         default:
1562                 return 0;
1563         case CHIP_VEGA10:
1564                 chip_name = "vega10";
1565                 break;
1566         case CHIP_VEGA12:
1567                 chip_name = "vega12";
1568                 break;
1569         case CHIP_RAVEN:
1570                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1571                         chip_name = "raven2";
1572                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1573                         chip_name = "picasso";
1574                 else
1575                         chip_name = "raven";
1576                 break;
1577         case CHIP_ARCTURUS:
1578                 chip_name = "arcturus";
1579                 break;
1580         case CHIP_RENOIR:
1581                 chip_name = "renoir";
1582                 break;
1583         case CHIP_NAVI10:
1584                 chip_name = "navi10";
1585                 break;
1586         case CHIP_NAVI14:
1587                 chip_name = "navi14";
1588                 break;
1589         case CHIP_NAVI12:
1590                 chip_name = "navi12";
1591                 break;
1592         }
1593
1594         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1595         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1596         if (err) {
1597                 dev_err(adev->dev,
1598                         "Failed to load gpu_info firmware \"%s\"\n",
1599                         fw_name);
1600                 goto out;
1601         }
1602         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1603         if (err) {
1604                 dev_err(adev->dev,
1605                         "Failed to validate gpu_info firmware \"%s\"\n",
1606                         fw_name);
1607                 goto out;
1608         }
1609
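	/*
	 * The gpu_info image starts with a common firmware header; the
	 * version-specific payload lives at header.ucode_array_offset_bytes.
	 * All fields are stored little-endian, hence the le32_to_cpu()
	 * conversions below.
	 */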
1610         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1611         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1612
1613         switch (hdr->version_major) {
1614         case 1:
1615         {
1616                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1617                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1618                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1619
1620                 if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
1621                         amdgpu_discovery_get_gfx_info(adev);
1622                         goto parse_soc_bounding_box;
1623                 }
1624
1625                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1626                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1627                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1628                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1629                 adev->gfx.config.max_texture_channel_caches =
1630                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1631                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1632                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1633                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1634                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1635                 adev->gfx.config.double_offchip_lds_buf =
1636                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1637                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1638                 adev->gfx.cu_info.max_waves_per_simd =
1639                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1640                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1641                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1642                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1643                 if (hdr->version_minor >= 1) {
1644                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1645                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1646                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1647                         adev->gfx.config.num_sc_per_sh =
1648                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1649                         adev->gfx.config.num_packer_per_sc =
1650                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1651                 }
1652
1653 parse_soc_bounding_box:
1654                 /*
1655                  * soc bounding box info is not integrated in the discovery table,
1656                  * we always need to parse it from gpu info firmware.
1657                  */
1658                 if (hdr->version_minor == 2) {
1659                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1660                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1661                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1662                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1663                 }
1664                 break;
1665         }
1666         default:
1667                 dev_err(adev->dev,
1668                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1669                 err = -EINVAL;
1670                 goto out;
1671         }
1672 out:
1673         return err;
1674 }
1675
1676 /**
1677  * amdgpu_device_ip_early_init - run early init for hardware IPs
1678  *
1679  * @adev: amdgpu_device pointer
1680  *
1681  * Early initialization pass for hardware IPs.  The hardware IPs that make
1682  * up each asic are discovered and each IP's early_init callback is run.  This
1683  * is the first stage in initializing the asic.
1684  * Returns 0 on success, negative error code on failure.
1685  */
1686 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1687 {
1688         int i, r;
1689
1690         amdgpu_device_enable_virtual_display(adev);
1691
1692         switch (adev->asic_type) {
1693         case CHIP_TOPAZ:
1694         case CHIP_TONGA:
1695         case CHIP_FIJI:
1696         case CHIP_POLARIS10:
1697         case CHIP_POLARIS11:
1698         case CHIP_POLARIS12:
1699         case CHIP_VEGAM:
1700         case CHIP_CARRIZO:
1701         case CHIP_STONEY:
1702                 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
1703                         adev->family = AMDGPU_FAMILY_CZ;
1704                 else
1705                         adev->family = AMDGPU_FAMILY_VI;
1706
1707                 r = vi_set_ip_blocks(adev);
1708                 if (r)
1709                         return r;
1710                 break;
1711 #ifdef CONFIG_DRM_AMDGPU_SI
1712         case CHIP_VERDE:
1713         case CHIP_TAHITI:
1714         case CHIP_PITCAIRN:
1715         case CHIP_OLAND:
1716         case CHIP_HAINAN:
1717                 adev->family = AMDGPU_FAMILY_SI;
1718                 r = si_set_ip_blocks(adev);
1719                 if (r)
1720                         return r;
1721                 break;
1722 #endif
1723 #ifdef CONFIG_DRM_AMDGPU_CIK
1724         case CHIP_BONAIRE:
1725         case CHIP_HAWAII:
1726         case CHIP_KAVERI:
1727         case CHIP_KABINI:
1728         case CHIP_MULLINS:
1729                 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
1730                         adev->family = AMDGPU_FAMILY_CI;
1731                 else
1732                         adev->family = AMDGPU_FAMILY_KV;
1733
1734                 r = cik_set_ip_blocks(adev);
1735                 if (r)
1736                         return r;
1737                 break;
1738 #endif
1739         case CHIP_VEGA10:
1740         case CHIP_VEGA12:
1741         case CHIP_VEGA20:
1742         case CHIP_RAVEN:
1743         case CHIP_ARCTURUS:
1744         case CHIP_RENOIR:
1745                 if (adev->asic_type == CHIP_RAVEN ||
1746                     adev->asic_type == CHIP_RENOIR)
1747                         adev->family = AMDGPU_FAMILY_RV;
1748                 else
1749                         adev->family = AMDGPU_FAMILY_AI;
1750
1751                 r = soc15_set_ip_blocks(adev);
1752                 if (r)
1753                         return r;
1754                 break;
1755         case CHIP_NAVI10:
1756         case CHIP_NAVI14:
1757         case CHIP_NAVI12:
1758                 adev->family = AMDGPU_FAMILY_NV;
1759
1760                 r = nv_set_ip_blocks(adev);
1761                 if (r)
1762                         return r;
1763                 break;
1764         default:
1765                 /* FIXME: not supported yet */
1766                 return -EINVAL;
1767         }
1768
1769         amdgpu_amdkfd_device_probe(adev);
1770
1771         if (amdgpu_sriov_vf(adev)) {
1772                 /* handle vbios stuff prior to full access mode for the new handshake */
1773                 if (adev->virt.req_init_data_ver == 1) {
1774                         if (!amdgpu_get_bios(adev)) {
1775                                 DRM_ERROR("failed to get vbios\n");
1776                                 return -EINVAL;
1777                         }
1778
1779                         r = amdgpu_atombios_init(adev);
1780                         if (r) {
1781                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1782                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1783                                 return r;
1784                         }
1785                 }
1786         }
1787
1788         /* we need to send REQ_GPU here for the legacy handshake, otherwise the vbios
1789          * will not be prepared by the host for this VF */
1790         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver < 1) {
1791                 r = amdgpu_virt_request_full_gpu(adev, true);
1792                 if (r)
1793                         return r;
1794         }
1795
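	/* GFXOFF is masked out under SR-IOV or when the KFD scheduler runs without HWS */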
1796         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1797         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1798                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1799
1800         for (i = 0; i < adev->num_ip_blocks; i++) {
1801                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1802                         DRM_ERROR("disabled ip block: %d <%s>\n",
1803                                   i, adev->ip_blocks[i].version->funcs->name);
1804                         adev->ip_blocks[i].status.valid = false;
1805                 } else {
1806                         if (adev->ip_blocks[i].version->funcs->early_init) {
1807                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1808                                 if (r == -ENOENT) {
1809                                         adev->ip_blocks[i].status.valid = false;
1810                                 } else if (r) {
1811                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1812                                                   adev->ip_blocks[i].version->funcs->name, r);
1813                                         return r;
1814                                 } else {
1815                                         adev->ip_blocks[i].status.valid = true;
1816                                 }
1817                         } else {
1818                                 adev->ip_blocks[i].status.valid = true;
1819                         }
1820                 }
1821                 /* get the vbios after the asic_funcs are set up */
1822                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1823                         r = amdgpu_device_parse_gpu_info_fw(adev);
1824                         if (r)
1825                                 return r;
1826
1827                         /* skip vbios handling for new handshake */
1828                         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver == 1)
1829                                 continue;
1830
1831                         /* Read BIOS */
1832                         if (!amdgpu_get_bios(adev))
1833                                 return -EINVAL;
1834
1835                         r = amdgpu_atombios_init(adev);
1836                         if (r) {
1837                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1838                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1839                                 return r;
1840                         }
1841                 }
1842         }
1843
1844         adev->cg_flags &= amdgpu_cg_mask;
1845         adev->pg_flags &= amdgpu_pg_mask;
1846
1847         return 0;
1848 }
1849
1850 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1851 {
1852         int i, r;
1853
1854         for (i = 0; i < adev->num_ip_blocks; i++) {
1855                 if (!adev->ip_blocks[i].status.sw)
1856                         continue;
1857                 if (adev->ip_blocks[i].status.hw)
1858                         continue;
1859                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1860                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1861                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1862                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1863                         if (r) {
1864                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1865                                           adev->ip_blocks[i].version->funcs->name, r);
1866                                 return r;
1867                         }
1868                         adev->ip_blocks[i].status.hw = true;
1869                 }
1870         }
1871
1872         return 0;
1873 }
1874
1875 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1876 {
1877         int i, r;
1878
1879         for (i = 0; i < adev->num_ip_blocks; i++) {
1880                 if (!adev->ip_blocks[i].status.sw)
1881                         continue;
1882                 if (adev->ip_blocks[i].status.hw)
1883                         continue;
1884                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1885                 if (r) {
1886                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1887                                   adev->ip_blocks[i].version->funcs->name, r);
1888                         return r;
1889                 }
1890                 adev->ip_blocks[i].status.hw = true;
1891         }
1892
1893         return 0;
1894 }
1895
1896 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1897 {
1898         int r = 0;
1899         int i;
1900         uint32_t smu_version;
1901
1902         if (adev->asic_type >= CHIP_VEGA10) {
1903                 for (i = 0; i < adev->num_ip_blocks; i++) {
1904                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1905                                 continue;
1906
1907                         /* no need to do the fw loading again if already done */
1908                         if (adev->ip_blocks[i].status.hw)
1909                                 break;
1910
1911                         if (adev->in_gpu_reset || adev->in_suspend) {
1912                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
1913                                 if (r) {
1914                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
1915                                                           adev->ip_blocks[i].version->funcs->name, r);
1916                                         return r;
1917                                 }
1918                         } else {
1919                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1920                                 if (r) {
1921                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1922                                                           adev->ip_blocks[i].version->funcs->name, r);
1923                                         return r;
1924                                 }
1925                         }
1926
1927                         adev->ip_blocks[i].status.hw = true;
1928                         break;
1929                 }
1930         }
1931
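	/* on bare metal (and on Tonga even under SR-IOV) the SMU firmware is loaded directly */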
1932         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1933                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1934
1935         return r;
1936 }
1937
1938 /**
1939  * amdgpu_device_ip_init - run init for hardware IPs
1940  *
1941  * @adev: amdgpu_device pointer
1942  *
1943  * Main initialization pass for hardware IPs.  The list of all the hardware
1944  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1945  * are run.  sw_init initializes the software state associated with each IP
1946  * and hw_init initializes the hardware associated with each IP.
1947  * Returns 0 on success, negative error code on failure.
1948  */
1949 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1950 {
1951         int i, r;
1952
1953         r = amdgpu_ras_init(adev);
1954         if (r)
1955                 return r;
1956
1957         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver > 0) {
1958                 r = amdgpu_virt_request_full_gpu(adev, true);
1959                 if (r)
1960                         return -EAGAIN;
1961         }
1962
1963         for (i = 0; i < adev->num_ip_blocks; i++) {
1964                 if (!adev->ip_blocks[i].status.valid)
1965                         continue;
1966                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1967                 if (r) {
1968                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1969                                   adev->ip_blocks[i].version->funcs->name, r);
1970                         goto init_failed;
1971                 }
1972                 adev->ip_blocks[i].status.sw = true;
1973
1974                 /* need to do gmc hw init early so we can allocate gpu mem */
1975                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1976                         r = amdgpu_device_vram_scratch_init(adev);
1977                         if (r) {
1978                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
1979                                 goto init_failed;
1980                         }
1981                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
1982                         if (r) {
1983                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
1984                                 goto init_failed;
1985                         }
1986                         r = amdgpu_device_wb_init(adev);
1987                         if (r) {
1988                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
1989                                 goto init_failed;
1990                         }
1991                         adev->ip_blocks[i].status.hw = true;
1992
1993                         /* right after GMC hw init, we create CSA */
1994                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1995                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
1996                                                                 AMDGPU_GEM_DOMAIN_VRAM,
1997                                                                 AMDGPU_CSA_SIZE);
1998                                 if (r) {
1999                                         DRM_ERROR("allocate CSA failed %d\n", r);
2000                                         goto init_failed;
2001                                 }
2002                         }
2003                 }
2004         }
2005
2006         if (amdgpu_sriov_vf(adev))
2007                 amdgpu_virt_init_data_exchange(adev);
2008
2009         r = amdgpu_ib_pool_init(adev);
2010         if (r) {
2011                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2012                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2013                 goto init_failed;
2014         }
2015
2016         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init is complete */
2017         if (r)
2018                 goto init_failed;
2019
2020         r = amdgpu_device_ip_hw_init_phase1(adev);
2021         if (r)
2022                 goto init_failed;
2023
2024         r = amdgpu_device_fw_loading(adev);
2025         if (r)
2026                 goto init_failed;
2027
2028         r = amdgpu_device_ip_hw_init_phase2(adev);
2029         if (r)
2030                 goto init_failed;
2031
2032         /*
2033          * retired pages will be loaded from eeprom and reserved here,
2034          * it should be called after amdgpu_device_ip_hw_init_phase2 since
2035          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2036          * functional for I2C communication, which is only true at this point.
2037          * recovery_init may fail, but it can free all resources allocated by
2038          * itself and its failure should not stop the amdgpu init process.
2039          *
2040          * Note: theoretically, this should be called before all vram allocations
2041          * to protect retired pages from being reused.
2042          */
2043         amdgpu_ras_recovery_init(adev);
2044
2045         if (adev->gmc.xgmi.num_physical_nodes > 1)
2046                 amdgpu_xgmi_add_device(adev);
2047         amdgpu_amdkfd_device_init(adev);
2048
2049         amdgpu_fru_get_product_info(adev);
2050
2051 init_failed:
2052         if (amdgpu_sriov_vf(adev))
2053                 amdgpu_virt_release_full_gpu(adev, true);
2054
2055         return r;
2056 }
2057
2058 /**
2059  * amdgpu_device_fill_reset_magic - cache reset magic from the gart pointer
2060  *
2061  * @adev: amdgpu_device pointer
2062  *
2063  * Caches a reset magic value from the gart pointer in VRAM.  The driver calls
2064  * this function before a GPU reset.  If the value is retained after the
2065  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2066  */
2067 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2068 {
2069         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2070 }
2071
2072 /**
2073  * amdgpu_device_check_vram_lost - check if vram is valid
2074  *
2075  * @adev: amdgpu_device pointer
2076  *
2077  * Checks the reset magic value cached from the gart pointer in VRAM.
2078  * The driver calls this after a GPU reset to see if the contents of
2079  * VRAM were lost or not.
2080  * Returns true if vram is lost, false if not.
2081  */
2082 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2083 {
2084         if (memcmp(adev->gart.ptr, adev->reset_magic,
2085                         AMDGPU_RESET_MAGIC_NUM))
2086                 return true;
2087
2088         if (!adev->in_gpu_reset)
2089                 return false;
2090
2091         /*
2092          * For all ASICs with baco/mode1 reset, the VRAM is
2093          * always assumed to be lost.
2094          */
2095         switch (amdgpu_asic_reset_method(adev)) {
2096         case AMD_RESET_METHOD_BACO:
2097         case AMD_RESET_METHOD_MODE1:
2098                 return true;
2099         default:
2100                 return false;
2101         }
2102 }
2103
2104 /**
2105  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2106  *
2107  * @adev: amdgpu_device pointer
2108  * @state: clockgating state (gate or ungate)
2109  *
2110  * The list of all the hardware IPs that make up the asic is walked and the
2111  * set_clockgating_state callbacks are run.
2112  * During the late init pass this enables clockgating; during fini or
2113  * suspend it disables clockgating.
2114  * Returns 0 on success, negative error code on failure.
2115  */
2116
2117 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2118                                                 enum amd_clockgating_state state)
2119 {
2120         int i, j, r;
2121
2122         if (amdgpu_emu_mode == 1)
2123                 return 0;
2124
2125         for (j = 0; j < adev->num_ip_blocks; j++) {
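		/* walk the blocks in order when gating, in reverse order when ungating */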
2126                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2127                 if (!adev->ip_blocks[i].status.late_initialized)
2128                         continue;
2129                 /* skip CG for UVD/VCE/VCN/JPEG, it's handled specially */
2130                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2131                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2132                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2133                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2134                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2135                         /* enable clockgating to save power */
2136                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2137                                                                                      state);
2138                         if (r) {
2139                                 DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
2140                                           adev->ip_blocks[i].version->funcs->name, r);
2141                                 return r;
2142                         }
2143                 }
2144         }
2145
2146         return 0;
2147 }
2148
2149 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2150 {
2151         int i, j, r;
2152
2153         if (amdgpu_emu_mode == 1)
2154                 return 0;
2155
2156         for (j = 0; j < adev->num_ip_blocks; j++) {
2157                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2158                 if (!adev->ip_blocks[i].status.late_initialized)
2159                         continue;
2160                 /* skip PG for UVD/VCE/VCN/JPEG, it's handled specially */
2161                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2162                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2163                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2164                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2165                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2166                         /* enable powergating to save power */
2167                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2168                                                                                         state);
2169                         if (r) {
2170                                 DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
2171                                           adev->ip_blocks[i].version->funcs->name, r);
2172                                 return r;
2173                         }
2174                 }
2175         }
2176         return 0;
2177 }
2178
2179 static int amdgpu_device_enable_mgpu_fan_boost(void)
2180 {
2181         struct amdgpu_gpu_instance *gpu_ins;
2182         struct amdgpu_device *adev;
2183         int i, ret = 0;
2184
2185         mutex_lock(&mgpu_info.mutex);
2186
2187         /*
2188          * MGPU fan boost feature should be enabled
2189          * only when there are two or more dGPUs in
2190          * the system
2191          */
2192         if (mgpu_info.num_dgpu < 2)
2193                 goto out;
2194
2195         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2196                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2197                 adev = gpu_ins->adev;
2198                 if (!(adev->flags & AMD_IS_APU) &&
2199                     !gpu_ins->mgpu_fan_enabled &&
2200                     adev->powerplay.pp_funcs &&
2201                     adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2202                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2203                         if (ret)
2204                                 break;
2205
2206                         gpu_ins->mgpu_fan_enabled = 1;
2207                 }
2208         }
2209
2210 out:
2211         mutex_unlock(&mgpu_info.mutex);
2212
2213         return ret;
2214 }
2215
2216 /**
2217  * amdgpu_device_ip_late_init - run late init for hardware IPs
2218  *
2219  * @adev: amdgpu_device pointer
2220  *
2221  * Late initialization pass for hardware IPs.  The list of all the hardware
2222  * IPs that make up the asic is walked and the late_init callbacks are run.
2223  * late_init covers any special initialization that an IP requires
2224  * after all of the IPs have been initialized or something that needs to happen
2225  * late in the init process.
2226  * Returns 0 on success, negative error code on failure.
2227  */
2228 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2229 {
2230         struct amdgpu_gpu_instance *gpu_instance;
2231         int i = 0, r;
2232
2233         for (i = 0; i < adev->num_ip_blocks; i++) {
2234                 if (!adev->ip_blocks[i].status.hw)
2235                         continue;
2236                 if (adev->ip_blocks[i].version->funcs->late_init) {
2237                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2238                         if (r) {
2239                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2240                                           adev->ip_blocks[i].version->funcs->name, r);
2241                                 return r;
2242                         }
2243                 }
2244                 adev->ip_blocks[i].status.late_initialized = true;
2245         }
2246
2247         amdgpu_ras_set_error_query_ready(adev, true);
2248
2249         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2250         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2251
2252         amdgpu_device_fill_reset_magic(adev);
2253
2254         r = amdgpu_device_enable_mgpu_fan_boost();
2255         if (r)
2256                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2257
2258
2259         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2260                 mutex_lock(&mgpu_info.mutex);
2261
2262                 /*
2263                  * Reset the device p-state to low, as it was booted with high.
2264                  *
2265                  * This should be performed only after all devices from the same
2266                  * hive have been initialized.
2267                  *
2268                  * However, the number of devices in the hive is not known in
2269                  * advance; it is counted one by one as devices initialize.
2270                  *
2271                  * So we wait for all XGMI-linked devices to be initialized.
2272                  * This may add some delay, as those devices may come from
2273                  * different hives, but that should be OK.
2274                  */
2275                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2276                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2277                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2278                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2279                                         continue;
2280
2281                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2282                                                 AMDGPU_XGMI_PSTATE_MIN);
2283                                 if (r) {
2284                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2285                                         break;
2286                                 }
2287                         }
2288                 }
2289
2290                 mutex_unlock(&mgpu_info.mutex);
2291         }
2292
2293         return 0;
2294 }
2295
2296 /**
2297  * amdgpu_device_ip_fini - run fini for hardware IPs
2298  *
2299  * @adev: amdgpu_device pointer
2300  *
2301  * Main teardown pass for hardware IPs.  The list of all the hardware
2302  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2303  * are run.  hw_fini tears down the hardware associated with each IP
2304  * and sw_fini tears down any software state associated with each IP.
2305  * Returns 0 on success, negative error code on failure.
2306  */
2307 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2308 {
2309         int i, r;
2310
2311         amdgpu_ras_pre_fini(adev);
2312
2313         if (adev->gmc.xgmi.num_physical_nodes > 1)
2314                 amdgpu_xgmi_remove_device(adev);
2315
2316         amdgpu_amdkfd_device_fini(adev);
2317
2318         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2319         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2320
2321         /* need to disable SMC first */
2322         for (i = 0; i < adev->num_ip_blocks; i++) {
2323                 if (!adev->ip_blocks[i].status.hw)
2324                         continue;
2325                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2326                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2327                         /* XXX handle errors */
2328                         if (r) {
2329                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2330                                           adev->ip_blocks[i].version->funcs->name, r);
2331                         }
2332                         adev->ip_blocks[i].status.hw = false;
2333                         break;
2334                 }
2335         }
2336
2337         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2338                 if (!adev->ip_blocks[i].status.hw)
2339                         continue;
2340
2341                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2342                 /* XXX handle errors */
2343                 if (r) {
2344                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2345                                   adev->ip_blocks[i].version->funcs->name, r);
2346                 }
2347
2348                 adev->ip_blocks[i].status.hw = false;
2349         }
2350
2351
2352         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2353                 if (!adev->ip_blocks[i].status.sw)
2354                         continue;
2355
2356                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2357                         amdgpu_ucode_free_bo(adev);
2358                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2359                         amdgpu_device_wb_fini(adev);
2360                         amdgpu_device_vram_scratch_fini(adev);
2361                         amdgpu_ib_pool_fini(adev);
2362                 }
2363
2364                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2365                 /* XXX handle errors */
2366                 if (r) {
2367                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2368                                   adev->ip_blocks[i].version->funcs->name, r);
2369                 }
2370                 adev->ip_blocks[i].status.sw = false;
2371                 adev->ip_blocks[i].status.valid = false;
2372         }
2373
2374         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2375                 if (!adev->ip_blocks[i].status.late_initialized)
2376                         continue;
2377                 if (adev->ip_blocks[i].version->funcs->late_fini)
2378                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2379                 adev->ip_blocks[i].status.late_initialized = false;
2380         }
2381
2382         amdgpu_ras_fini(adev);
2383
2384         if (amdgpu_sriov_vf(adev))
2385                 if (amdgpu_virt_release_full_gpu(adev, false))
2386                         DRM_ERROR("failed to release exclusive mode on fini\n");
2387
2388         return 0;
2389 }
2390
2391 /**
2392  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2393  *
2394  * @work: work_struct.
2395  */
2396 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2397 {
2398         struct amdgpu_device *adev =
2399                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2400         int r;
2401
2402         r = amdgpu_ib_ring_tests(adev);
2403         if (r)
2404                 DRM_ERROR("ib ring test failed (%d).\n", r);
2405 }
2406
2407 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2408 {
2409         struct amdgpu_device *adev =
2410                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2411
2412         mutex_lock(&adev->gfx.gfx_off_mutex);
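	/* only enable GFXOFF when it is not already on and nothing is holding it disabled */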
2413         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2414                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2415                         adev->gfx.gfx_off_state = true;
2416         }
2417         mutex_unlock(&adev->gfx.gfx_off_mutex);
2418 }
2419
2420 /**
2421  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2422  *
2423  * @adev: amdgpu_device pointer
2424  *
2425  * First suspend pass for hardware IPs.  Clockgating and powergating are
2426  * disabled and the suspend callbacks are run for the display (DCE) blocks
2427  * only; the remaining blocks are handled in phase 2.  suspend puts the
2428  * hardware and software state of each IP into a state suitable for suspend.
2429  * Returns 0 on success, negative error code on failure.
2430  */
2431 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2432 {
2433         int i, r;
2434
2435         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2436         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2437
2438         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2439                 if (!adev->ip_blocks[i].status.valid)
2440                         continue;
2441                 /* displays are handled separately */
2442                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
2443                         /* XXX handle errors */
2444                         r = adev->ip_blocks[i].version->funcs->suspend(adev);
2445                         /* XXX handle errors */
2446                         if (r) {
2447                                 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2448                                           adev->ip_blocks[i].version->funcs->name, r);
2449                                 return r;
2450                         }
2451                         adev->ip_blocks[i].status.hw = false;
2452                 }
2453         }
2454
2455         return 0;
2456 }
2457
2458 /**
2459  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2460  *
2461  * @adev: amdgpu_device pointer
2462  *
2463  * Second suspend pass for hardware IPs.  The list of all the hardware
2464  * IPs that make up the asic is walked in reverse order and the suspend
2465  * callbacks are run for every block except the displays, which were
2466  * handled in phase 1.  suspend puts each IP into a state suitable for suspend.
2467  * Returns 0 on success, negative error code on failure.
2468  */
2469 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2470 {
2471         int i, r;
2472
2473         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2474                 if (!adev->ip_blocks[i].status.valid)
2475                         continue;
2476                 /* displays are handled in phase1 */
2477                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2478                         continue;
2479                 /* PSP lost connection when err_event_athub occurs */
2480                 if (amdgpu_ras_intr_triggered() &&
2481                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2482                         adev->ip_blocks[i].status.hw = false;
2483                         continue;
2484                 }
2485                 /* XXX handle errors */
2486                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2487                 /* XXX handle errors */
2488                 if (r) {
2489                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2490                                   adev->ip_blocks[i].version->funcs->name, r);
2491                 }
2492                 adev->ip_blocks[i].status.hw = false;
2493                 /* handle putting the SMC in the appropriate state */
2494                 if (!amdgpu_sriov_vf(adev)) {
2495                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2496                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2497                                 if (r) {
2498                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2499                                                         adev->mp1_state, r);
2500                                         return r;
2501                                 }
2502                         }
2503                 }
2504                 adev->ip_blocks[i].status.hw = false;
2505         }
2506
2507         return 0;
2508 }
2509
2510 /**
2511  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2512  *
2513  * @adev: amdgpu_device pointer
2514  *
2515  * Main suspend function for hardware IPs.  The list of all the hardware
2516  * IPs that make up the asic is walked, clockgating is disabled and the
2517  * suspend callbacks are run.  suspend puts the hardware and software state
2518  * in each IP into a state suitable for suspend.
2519  * Returns 0 on success, negative error code on failure.
2520  */
2521 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2522 {
2523         int r;
2524
2525         if (amdgpu_sriov_vf(adev))
2526                 amdgpu_virt_request_full_gpu(adev, false);
2527
2528         r = amdgpu_device_ip_suspend_phase1(adev);
2529         if (r)
2530                 return r;
2531         r = amdgpu_device_ip_suspend_phase2(adev);
2532
2533         if (amdgpu_sriov_vf(adev))
2534                 amdgpu_virt_release_full_gpu(adev, false);
2535
2536         return r;
2537 }
2538
2539 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2540 {
2541         int i, r;
2542
2543         static enum amd_ip_block_type ip_order[] = {
2544                 AMD_IP_BLOCK_TYPE_GMC,
2545                 AMD_IP_BLOCK_TYPE_COMMON,
2546                 AMD_IP_BLOCK_TYPE_PSP,
2547                 AMD_IP_BLOCK_TYPE_IH,
2548         };
2549
2550         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2551                 int j;
2552                 struct amdgpu_ip_block *block;
2553
2554                 for (j = 0; j < adev->num_ip_blocks; j++) {
2555                         block = &adev->ip_blocks[j];
2556
2557                         block->status.hw = false;
2558                         if (block->version->type != ip_order[i] ||
2559                                 !block->status.valid)
2560                                 continue;
2561
2562                         r = block->version->funcs->hw_init(adev);
2563                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2564                         if (r)
2565                                 return r;
2566                         block->status.hw = true;
2567                 }
2568         }
2569
2570         return 0;
2571 }
2572
2573 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2574 {
2575         int i, r;
2576
2577         static enum amd_ip_block_type ip_order[] = {
2578                 AMD_IP_BLOCK_TYPE_SMC,
2579                 AMD_IP_BLOCK_TYPE_DCE,
2580                 AMD_IP_BLOCK_TYPE_GFX,
2581                 AMD_IP_BLOCK_TYPE_SDMA,
2582                 AMD_IP_BLOCK_TYPE_UVD,
2583                 AMD_IP_BLOCK_TYPE_VCE,
2584                 AMD_IP_BLOCK_TYPE_VCN
2585         };
2586
2587         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2588                 int j;
2589                 struct amdgpu_ip_block *block;
2590
2591                 for (j = 0; j < adev->num_ip_blocks; j++) {
2592                         block = &adev->ip_blocks[j];
2593
2594                         if (block->version->type != ip_order[i] ||
2595                                 !block->status.valid ||
2596                                 block->status.hw)
2597                                 continue;
2598
2599                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2600                                 r = block->version->funcs->resume(adev);
2601                         else
2602                                 r = block->version->funcs->hw_init(adev);
2603
2604                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2605                         if (r)
2606                                 return r;
2607                         block->status.hw = true;
2608                 }
2609         }
2610
2611         return 0;
2612 }
2613
2614 /**
2615  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2616  *
2617  * @adev: amdgpu_device pointer
2618  *
2619  * First resume function for hardware IPs.  The list of all the hardware
2620  * IPs that make up the asic is walked and the resume callbacks are run for
2621  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2622  * after a suspend and updates the software state as necessary.  This
2623  * function is also used for restoring the GPU after a GPU reset.
2624  * Returns 0 on success, negative error code on failure.
2625  */
2626 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2627 {
2628         int i, r;
2629
2630         for (i = 0; i < adev->num_ip_blocks; i++) {
2631                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2632                         continue;
2633                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2634                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2635                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2636
2637                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2638                         if (r) {
2639                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2640                                           adev->ip_blocks[i].version->funcs->name, r);
2641                                 return r;
2642                         }
2643                         adev->ip_blocks[i].status.hw = true;
2644                 }
2645         }
2646
2647         return 0;
2648 }
2649
2650 /**
2651  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2652  *
2653  * @adev: amdgpu_device pointer
2654  *
2655  * Second resume function for hardware IPs.  The list of all the hardware
2656  * IPs that make up the asic is walked and the resume callbacks are run for
2657  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2658  * functional state after a suspend and updates the software state as
2659  * necessary.  This function is also used for restoring the GPU after a GPU
2660  * reset.
2661  * Returns 0 on success, negative error code on failure.
2662  */
2663 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2664 {
2665         int i, r;
2666
2667         for (i = 0; i < adev->num_ip_blocks; i++) {
2668                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2669                         continue;
2670                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2671                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2672                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2673                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2674                         continue;
2675                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2676                 if (r) {
2677                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2678                                   adev->ip_blocks[i].version->funcs->name, r);
2679                         return r;
2680                 }
2681                 adev->ip_blocks[i].status.hw = true;
2682         }
2683
2684         return 0;
2685 }
2686
2687 /**
2688  * amdgpu_device_ip_resume - run resume for hardware IPs
2689  *
2690  * @adev: amdgpu_device pointer
2691  *
2692  * Main resume function for hardware IPs.  The hardware IPs
2693  * are split into two resume functions because they are
2694  * also used in recovering from a GPU reset and some additional
2695  * steps need to be taken between them.  In this case (S3/S4) they are
2696  * run sequentially.
2697  * Returns 0 on success, negative error code on failure.
2698  */
2699 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2700 {
2701         int r;
2702
2703         r = amdgpu_device_ip_resume_phase1(adev);
2704         if (r)
2705                 return r;
2706
2707         r = amdgpu_device_fw_loading(adev);
2708         if (r)
2709                 return r;
2710
2711         r = amdgpu_device_ip_resume_phase2(adev);
2712
2713         return r;
2714 }
2715
2716 /**
2717  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2718  *
2719  * @adev: amdgpu_device pointer
2720  *
2721  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2722  */
2723 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2724 {
2725         if (amdgpu_sriov_vf(adev)) {
2726                 if (adev->is_atom_fw) {
2727                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2728                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2729                 } else {
2730                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2731                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2732                 }
2733
2734                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2735                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2736         }
2737 }
2738
2739 /**
2740  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2741  *
2742  * @asic_type: AMD asic type
2743  *
2744  * Check if there is DC (new modesetting infrastructure) support for an asic.
2745  * Returns true if DC has support, false if not.
2746  */
2747 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2748 {
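	/*
	 * amdgpu_dc is the DC module parameter: a positive value forces DC on
	 * (including the older CIK boards below), zero forces it off, and any
	 * other value keeps the per-ASIC default encoded in this switch.
	 */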
2749         switch (asic_type) {
2750 #if defined(CONFIG_DRM_AMD_DC)
2751         case CHIP_BONAIRE:
2752         case CHIP_KAVERI:
2753         case CHIP_KABINI:
2754         case CHIP_MULLINS:
2755                 /*
2756                  * We have systems in the wild with these ASICs that require
2757                  * LVDS and VGA support which is not supported with DC.
2758                  *
2759                  * Fall back to the non-DC driver here by default so as not to
2760                  * cause regressions.
2761                  */
2762                 return amdgpu_dc > 0;
2763         case CHIP_HAWAII:
2764         case CHIP_CARRIZO:
2765         case CHIP_STONEY:
2766         case CHIP_POLARIS10:
2767         case CHIP_POLARIS11:
2768         case CHIP_POLARIS12:
2769         case CHIP_VEGAM:
2770         case CHIP_TONGA:
2771         case CHIP_FIJI:
2772         case CHIP_VEGA10:
2773         case CHIP_VEGA12:
2774         case CHIP_VEGA20:
2775 #if defined(CONFIG_DRM_AMD_DC_DCN)
2776         case CHIP_RAVEN:
2777         case CHIP_NAVI10:
2778         case CHIP_NAVI14:
2779         case CHIP_NAVI12:
2780         case CHIP_RENOIR:
2781 #endif
2782                 return amdgpu_dc != 0;
2783 #endif
2784         default:
2785                 if (amdgpu_dc > 0)
2786                         DRM_INFO("Display Core has been requested via kernel parameter "
2787                                          "but isn't supported by ASIC, ignoring\n");
2788                 return false;
2789         }
2790 }
2791
2792 /**
2793  * amdgpu_device_has_dc_support - check if dc is supported
2794  *
2795  * @adev: amdgpu_device pointer
2796  *
2797  * Returns true for supported, false for not supported
2798  */
2799 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2800 {
2801         if (amdgpu_sriov_vf(adev))
2802                 return false;
2803
2804         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2805 }
2806
2807
2808 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2809 {
2810         struct amdgpu_device *adev =
2811                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2812         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2813
2814         /* It's a bug to not have a hive within this function */
2815         if (WARN_ON(!hive))
2816                 return;
2817
2818         /*
2819          * Use task barrier to synchronize all xgmi reset works across the
2820          * hive. task_barrier_enter and task_barrier_exit will block
2821          * until all the threads running the xgmi reset works reach
2822          * those points. task_barrier_full will do both blocks.
2823          */
2824         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2825
2826                 task_barrier_enter(&hive->tb);
2827                 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2828
2829                 if (adev->asic_reset_res)
2830                         goto fail;
2831
2832                 task_barrier_exit(&hive->tb);
2833                 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2834
2835                 if (adev->asic_reset_res)
2836                         goto fail;
2837
2838                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2839                         adev->mmhub.funcs->reset_ras_error_count(adev);
2840         } else {
2841
2842                 task_barrier_full(&hive->tb);
2843                 adev->asic_reset_res = amdgpu_asic_reset(adev);
2844         }
2845
2846 fail:
2847         if (adev->asic_reset_res)
2848                 DRM_WARN("ASIC reset failed with error %d for drm dev %s",
2849                          adev->asic_reset_res, adev->ddev->unique);
2850 }
2851
2852 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2853 {
2854         char *input = amdgpu_lockup_timeout;
2855         char *timeout_setting = NULL;
2856         int index = 0;
2857         long timeout;
2858         int ret = 0;
2859
2860         /*
2861          * By default the timeout for non-compute jobs is 10000 ms,
2862          * and there is no timeout enforced on compute jobs.
2863          * In SR-IOV or passthrough mode, the timeout for compute
2864          * jobs is 60000 ms by default.
2865          */
2866         adev->gfx_timeout = msecs_to_jiffies(10000);
2867         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2868         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2869                 adev->compute_timeout = msecs_to_jiffies(60000);
2870         else
2871                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2872
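	/*
	 * For illustration: the lockup timeout option is a comma-separated list
	 * applied to the gfx, compute, sdma and video queues in that order (see
	 * the index switch below), e.g. "10000,60000,10000,10000".  A value of 0
	 * keeps the default and a negative value disables the timeout.
	 */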
2873         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2874                 while ((timeout_setting = strsep(&input, ",")) &&
2875                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2876                         ret = kstrtol(timeout_setting, 0, &timeout);
2877                         if (ret)
2878                                 return ret;
2879
2880                         if (timeout == 0) {
2881                                 index++;
2882                                 continue;
2883                         } else if (timeout < 0) {
2884                                 timeout = MAX_SCHEDULE_TIMEOUT;
2885                         } else {
2886                                 timeout = msecs_to_jiffies(timeout);
2887                         }
2888
2889                         switch (index++) {
2890                         case 0:
2891                                 adev->gfx_timeout = timeout;
2892                                 break;
2893                         case 1:
2894                                 adev->compute_timeout = timeout;
2895                                 break;
2896                         case 2:
2897                                 adev->sdma_timeout = timeout;
2898                                 break;
2899                         case 3:
2900                                 adev->video_timeout = timeout;
2901                                 break;
2902                         default:
2903                                 break;
2904                         }
2905                 }
2906                 /*
2907                  * There is only one value specified and
2908                  * it should apply to all non-compute jobs.
2909                  */
2910                 if (index == 1) {
2911                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2912                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2913                                 adev->compute_timeout = adev->gfx_timeout;
2914                 }
2915         }
2916
2917         return ret;
2918 }
2919
2920 static const struct attribute *amdgpu_dev_attributes[] = {
2921         &dev_attr_product_name.attr,
2922         &dev_attr_product_number.attr,
2923         &dev_attr_serial_number.attr,
2924         &dev_attr_pcie_replay_count.attr,
2925         NULL
2926 };
2927
2928 /**
2929  * amdgpu_device_init - initialize the driver
2930  *
2931  * @adev: amdgpu_device pointer
2932  * @ddev: drm dev pointer
2933  * @pdev: pci dev pointer
2934  * @flags: driver flags
2935  *
2936  * Initializes the driver info and hw (all asics).
2937  * Returns 0 for success or an error on failure.
2938  * Called at driver startup.
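 *
 * Minimal call sketch, assuming the usual KMS load path (the actual caller
 * is amdgpu_driver_load_kms() in amdgpu_kms.c):
 *
 *   adev = kzalloc(sizeof(*adev), GFP_KERNEL);
 *   if (!adev)
 *       return -ENOMEM;
 *   ddev->dev_private = (void *)adev;
 *   r = amdgpu_device_init(adev, ddev, ddev->pdev, flags);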
2939  */
2940 int amdgpu_device_init(struct amdgpu_device *adev,
2941                        struct drm_device *ddev,
2942                        struct pci_dev *pdev,
2943                        uint32_t flags)
2944 {
2945         int r, i;
2946         bool boco = false;
2947         u32 max_MBps;
2948
2949         adev->shutdown = false;
2950         adev->dev = &pdev->dev;
2951         adev->ddev = ddev;
2952         adev->pdev = pdev;
2953         adev->flags = flags;
2954
2955         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2956                 adev->asic_type = amdgpu_force_asic_type;
2957         else
2958                 adev->asic_type = flags & AMD_ASIC_MASK;
2959
2960         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2961         if (amdgpu_emu_mode == 1)
2962                 adev->usec_timeout *= 10;
2963         adev->gmc.gart_size = 512 * 1024 * 1024;
2964         adev->accel_working = false;
2965         adev->num_rings = 0;
2966         adev->mman.buffer_funcs = NULL;
2967         adev->mman.buffer_funcs_ring = NULL;
2968         adev->vm_manager.vm_pte_funcs = NULL;
2969         adev->vm_manager.vm_pte_num_scheds = 0;
2970         adev->gmc.gmc_funcs = NULL;
2971         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2972         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
2973
2974         adev->smc_rreg = &amdgpu_invalid_rreg;
2975         adev->smc_wreg = &amdgpu_invalid_wreg;
2976         adev->pcie_rreg = &amdgpu_invalid_rreg;
2977         adev->pcie_wreg = &amdgpu_invalid_wreg;
2978         adev->pciep_rreg = &amdgpu_invalid_rreg;
2979         adev->pciep_wreg = &amdgpu_invalid_wreg;
2980         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
2981         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
2982         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
2983         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
2984         adev->didt_rreg = &amdgpu_invalid_rreg;
2985         adev->didt_wreg = &amdgpu_invalid_wreg;
2986         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
2987         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
2988         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
2989         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
2990
2991         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
2992                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
2993                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
2994
2995         /* mutex initializations are all done here so we
2996          * can recall functions without having locking issues */
2997         atomic_set(&adev->irq.ih.lock, 0);
2998         mutex_init(&adev->firmware.mutex);
2999         mutex_init(&adev->pm.mutex);
3000         mutex_init(&adev->gfx.gpu_clock_mutex);
3001         mutex_init(&adev->srbm_mutex);
3002         mutex_init(&adev->gfx.pipe_reserve_mutex);
3003         mutex_init(&adev->gfx.gfx_off_mutex);
3004         mutex_init(&adev->grbm_idx_mutex);
3005         mutex_init(&adev->mn_lock);
3006         mutex_init(&adev->virt.vf_errors.lock);
3007         hash_init(adev->mn_hash);
3008         mutex_init(&adev->lock_reset);
3009         mutex_init(&adev->psp.mutex);
3010         mutex_init(&adev->notifier_lock);
3011
3012         r = amdgpu_device_check_arguments(adev);
3013         if (r)
3014                 return r;
3015
3016         spin_lock_init(&adev->mmio_idx_lock);
3017         spin_lock_init(&adev->smc_idx_lock);
3018         spin_lock_init(&adev->pcie_idx_lock);
3019         spin_lock_init(&adev->uvd_ctx_idx_lock);
3020         spin_lock_init(&adev->didt_idx_lock);
3021         spin_lock_init(&adev->gc_cac_idx_lock);
3022         spin_lock_init(&adev->se_cac_idx_lock);
3023         spin_lock_init(&adev->audio_endpt_idx_lock);
3024         spin_lock_init(&adev->mm_stats.lock);
3025
3026         INIT_LIST_HEAD(&adev->shadow_list);
3027         mutex_init(&adev->shadow_list_lock);
3028
3029         INIT_DELAYED_WORK(&adev->delayed_init_work,
3030                           amdgpu_device_delayed_init_work_handler);
3031         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3032                           amdgpu_device_delay_enable_gfx_off);
3033
3034         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3035
3036         adev->gfx.gfx_off_req_count = 1;
3037         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3038
3039         /* Registers mapping */
3040         /* TODO: block userspace mapping of io register */
3041         if (adev->asic_type >= CHIP_BONAIRE) {
3042                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3043                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3044         } else {
3045                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3046                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3047         }
3048
3049         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3050         if (adev->rmmio == NULL) {
3051                 return -ENOMEM;
3052         }
3053         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3054         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3055
3056         /* io port mapping */
3057         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3058                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3059                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3060                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3061                         break;
3062                 }
3063         }
3064         if (adev->rio_mem == NULL)
3065                 DRM_INFO("PCI I/O BAR is not found.\n");
3066
3067         /* enable PCIE atomic ops */
3068         r = pci_enable_atomic_ops_to_root(adev->pdev,
3069                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3070                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3071         if (r) {
3072                 adev->have_atomics_support = false;
3073                 DRM_INFO("PCIE atomic ops are not supported\n");
3074         } else {
3075                 adev->have_atomics_support = true;
3076         }
3077
3078         amdgpu_device_get_pcie_info(adev);
3079
3080         if (amdgpu_mcbp)
3081                 DRM_INFO("MCBP is enabled\n");
3082
3083         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3084                 adev->enable_mes = true;
3085
3086         /* detect hw virtualization here */
3087         amdgpu_detect_virtualization(adev);
3088
3089         r = amdgpu_device_get_job_timeout_settings(adev);
3090         if (r) {
3091                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3092                 return r;
3093         }
3094
3095         /* early init functions */
3096         r = amdgpu_device_ip_early_init(adev);
3097         if (r)
3098                 return r;
3099
3100         /* doorbell bar mapping and doorbell index init */
3101         amdgpu_device_doorbell_init(adev);
3102
3103         /* if we have more than one VGA card, then disable the amdgpu VGA resources */
3104         /* this will fail for cards that aren't VGA class devices, just
3105          * ignore it */
3106         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3107
3108         if (amdgpu_device_supports_boco(ddev))
3109                 boco = true;
3110         if (amdgpu_has_atpx() &&
3111             (amdgpu_is_atpx_hybrid() ||
3112              amdgpu_has_atpx_dgpu_power_cntl()) &&
3113             !pci_is_thunderbolt_attached(adev->pdev))
3114                 vga_switcheroo_register_client(adev->pdev,
3115                                                &amdgpu_switcheroo_ops, boco);
3116         if (boco)
3117                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3118
3119         if (amdgpu_emu_mode == 1) {
3120                 /* post the asic on emulation mode */
3121                 emu_soc_asic_init(adev);
3122                 goto fence_driver_init;
3123         }
3124
3125         /* detect whether we are running with an SR-IOV vBIOS */
3126         amdgpu_device_detect_sriov_bios(adev);
3127
3128         /* check if we need to reset the asic
3129          *  E.g., driver was not cleanly unloaded previously, etc.
3130          */
3131         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3132                 r = amdgpu_asic_reset(adev);
3133                 if (r) {
3134                         dev_err(adev->dev, "asic reset on init failed\n");
3135                         goto failed;
3136                 }
3137         }
3138
3139         /* Post card if necessary */
3140         if (amdgpu_device_need_post(adev)) {
3141                 if (!adev->bios) {
3142                         dev_err(adev->dev, "no vBIOS found\n");
3143                         r = -EINVAL;
3144                         goto failed;
3145                 }
3146                 DRM_INFO("GPU posting now...\n");
3147                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3148                 if (r) {
3149                         dev_err(adev->dev, "gpu post error!\n");
3150                         goto failed;
3151                 }
3152         }
3153
3154         if (adev->is_atom_fw) {
3155                 /* Initialize clocks */
3156                 r = amdgpu_atomfirmware_get_clock_info(adev);
3157                 if (r) {
3158                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3159                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3160                         goto failed;
3161                 }
3162         } else {
3163                 /* Initialize clocks */
3164                 r = amdgpu_atombios_get_clock_info(adev);
3165                 if (r) {
3166                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3167                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3168                         goto failed;
3169                 }
3170                 /* init i2c buses */
3171                 if (!amdgpu_device_has_dc_support(adev))
3172                         amdgpu_atombios_i2c_init(adev);
3173         }
3174
3175 fence_driver_init:
3176         /* Fence driver */
3177         r = amdgpu_fence_driver_init(adev);
3178         if (r) {
3179                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3180                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3181                 goto failed;
3182         }
3183
3184         /* init the mode config */
3185         drm_mode_config_init(adev->ddev);
3186
3187         r = amdgpu_device_ip_init(adev);
3188         if (r) {
3189                 /* failed in exclusive mode due to timeout */
3190                 if (amdgpu_sriov_vf(adev) &&
3191                     !amdgpu_sriov_runtime(adev) &&
3192                     amdgpu_virt_mmio_blocked(adev) &&
3193                     !amdgpu_virt_wait_reset(adev)) {
3194                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3195                         /* Don't send request since VF is inactive. */
3196                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3197                         adev->virt.ops = NULL;
3198                         r = -EAGAIN;
3199                         goto failed;
3200                 }
3201                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3202                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3203                 goto failed;
3204         }
3205
3206         dev_info(adev->dev,
3207                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3208                         adev->gfx.config.max_shader_engines,
3209                         adev->gfx.config.max_sh_per_se,
3210                         adev->gfx.config.max_cu_per_sh,
3211                         adev->gfx.cu_info.number);
3212
3213         adev->accel_working = true;
3214
3215         amdgpu_vm_check_compute_bug(adev);
3216
3217         /* Initialize the buffer migration limit. */
3218         if (amdgpu_moverate >= 0)
3219                 max_MBps = amdgpu_moverate;
3220         else
3221                 max_MBps = 8; /* Allow 8 MB/s. */
3222         /* Get a log2 for easy divisions. */
3223         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
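        /*
         * Worked example: amdgpu_moverate=32 gives log2_max_MBps = ilog2(32)
         * = 5, so consumers can replace a division by max_MBps with a right
         * shift by log2_max_MBps when throttling buffer moves.
         */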
3224
3225         amdgpu_fbdev_init(adev);
3226
3227         r = amdgpu_pm_sysfs_init(adev);
3228         if (r) {
3229                 adev->pm_sysfs_en = false;
3230                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3231         } else
3232                 adev->pm_sysfs_en = true;
3233
3234         r = amdgpu_ucode_sysfs_init(adev);
3235         if (r) {
3236                 adev->ucode_sysfs_en = false;
3237                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3238         } else
3239                 adev->ucode_sysfs_en = true;
3240
3241         if ((amdgpu_testing & 1)) {
3242                 if (adev->accel_working)
3243                         amdgpu_test_moves(adev);
3244                 else
3245                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3246         }
3247         if (amdgpu_benchmarking) {
3248                 if (adev->accel_working)
3249                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3250                 else
3251                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3252         }
3253
3254         /*
3255          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3256          * Otherwise the mgpu fan boost feature will be skipped because
3257          * the gpu instance count would be too low.
3258          */
3259         amdgpu_register_gpu_instance(adev);
3260
3261         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3262          * explicit gating rather than handling it automatically.
3263          */
3264         r = amdgpu_device_ip_late_init(adev);
3265         if (r) {
3266                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3267                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3268                 goto failed;
3269         }
3270
3271         /* must succeed. */
3272         amdgpu_ras_resume(adev);
3273
3274         queue_delayed_work(system_wq, &adev->delayed_init_work,
3275                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3276
3277         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3278         if (r) {
3279                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3280                 return r;
3281         }
3282
3283         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3284                 r = amdgpu_pmu_init(adev);
3285         if (r)
3286                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3287
3288         return 0;
3289
3290 failed:
3291         amdgpu_vf_error_trans_all(adev);
3292         if (boco)
3293                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3294
3295         return r;
3296 }
3297
3298 /**
3299  * amdgpu_device_fini - tear down the driver
3300  *
3301  * @adev: amdgpu_device pointer
3302  *
3303  * Tear down the driver info (all asics).
3304  * Called at driver shutdown.
3305  */
3306 void amdgpu_device_fini(struct amdgpu_device *adev)
3307 {
3308         int r;
3309
3310         DRM_INFO("amdgpu: finishing device.\n");
3311         flush_delayed_work(&adev->delayed_init_work);
3312         adev->shutdown = true;
3313
3314         /* make sure IB tests have finished before entering exclusive mode
3315          * to avoid preemption during the IB tests
3316          */
3317         if (amdgpu_sriov_vf(adev))
3318                 amdgpu_virt_request_full_gpu(adev, false);
3319
3320         /* disable all interrupts */
3321         amdgpu_irq_disable_all(adev);
3322         if (adev->mode_info.mode_config_initialized) {
3323                 if (!amdgpu_device_has_dc_support(adev))
3324                         drm_helper_force_disable_all(adev->ddev);
3325                 else
3326                         drm_atomic_helper_shutdown(adev->ddev);
3327         }
3328         amdgpu_fence_driver_fini(adev);
3329         if (adev->pm_sysfs_en)
3330                 amdgpu_pm_sysfs_fini(adev);
3331         amdgpu_fbdev_fini(adev);
3332         r = amdgpu_device_ip_fini(adev);
3333         if (adev->firmware.gpu_info_fw) {
3334                 release_firmware(adev->firmware.gpu_info_fw);
3335                 adev->firmware.gpu_info_fw = NULL;
3336         }
3337         adev->accel_working = false;
3338         /* free i2c buses */
3339         if (!amdgpu_device_has_dc_support(adev))
3340                 amdgpu_i2c_fini(adev);
3341
3342         if (amdgpu_emu_mode != 1)
3343                 amdgpu_atombios_fini(adev);
3344
3345         kfree(adev->bios);
3346         adev->bios = NULL;
3347         if (amdgpu_has_atpx() &&
3348             (amdgpu_is_atpx_hybrid() ||
3349              amdgpu_has_atpx_dgpu_power_cntl()) &&
3350             !pci_is_thunderbolt_attached(adev->pdev))
3351                 vga_switcheroo_unregister_client(adev->pdev);
3352         if (amdgpu_device_supports_boco(adev->ddev))
3353                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3354         vga_client_register(adev->pdev, NULL, NULL, NULL);
3355         if (adev->rio_mem)
3356                 pci_iounmap(adev->pdev, adev->rio_mem);
3357         adev->rio_mem = NULL;
3358         iounmap(adev->rmmio);
3359         adev->rmmio = NULL;
3360         amdgpu_device_doorbell_fini(adev);
3361
3362         if (adev->ucode_sysfs_en)
3363                 amdgpu_ucode_sysfs_fini(adev);
3364
3365         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3366         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3367                 amdgpu_pmu_fini(adev);
3368         if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
3369                 amdgpu_discovery_fini(adev);
3370 }
3371
3372
3373 /*
3374  * Suspend & resume.
3375  */
3376 /**
3377  * amdgpu_device_suspend - initiate device suspend
3378  *
3379  * @dev: drm dev pointer
3380  * @fbcon: notify the fbdev of suspend
3382  *
3383  * Puts the hw in the suspend state (all asics).
3384  * Returns 0 for success or an error on failure.
3385  * Called at driver suspend.
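 *
 * Rough ordering of the teardown below (a summary of this function, not a
 * contract): display off and framebuffer/cursor unpin (non-DC only), RAS
 * suspend, IP suspend phase 1, KFD suspend, VRAM eviction, fence driver
 * suspend, IP suspend phase 2, then a second VRAM eviction for the GART
 * page table.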
3386  */
3387 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3388 {
3389         struct amdgpu_device *adev;
3390         struct drm_crtc *crtc;
3391         struct drm_connector *connector;
3392         struct drm_connector_list_iter iter;
3393         int r;
3394
3395         if (dev == NULL || dev->dev_private == NULL) {
3396                 return -ENODEV;
3397         }
3398
3399         adev = dev->dev_private;
3400
3401         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3402                 return 0;
3403
3404         adev->in_suspend = true;
3405         drm_kms_helper_poll_disable(dev);
3406
3407         if (fbcon)
3408                 amdgpu_fbdev_set_suspend(adev, 1);
3409
3410         cancel_delayed_work_sync(&adev->delayed_init_work);
3411
3412         if (!amdgpu_device_has_dc_support(adev)) {
3413                 /* turn off display hw */
3414                 drm_modeset_lock_all(dev);
3415                 drm_connector_list_iter_begin(dev, &iter);
3416                 drm_for_each_connector_iter(connector, &iter)
3417                         drm_helper_connector_dpms(connector,
3418                                                   DRM_MODE_DPMS_OFF);
3419                 drm_connector_list_iter_end(&iter);
3420                 drm_modeset_unlock_all(dev);
3421                 /* unpin the front buffers and cursors */
3422                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3423                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3424                         struct drm_framebuffer *fb = crtc->primary->fb;
3425                         struct amdgpu_bo *robj;
3426
3427                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3428                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3429                                 r = amdgpu_bo_reserve(aobj, true);
3430                                 if (r == 0) {
3431                                         amdgpu_bo_unpin(aobj);
3432                                         amdgpu_bo_unreserve(aobj);
3433                                 }
3434                         }
3435
3436                         if (fb == NULL || fb->obj[0] == NULL) {
3437                                 continue;
3438                         }
3439                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3440                         /* don't unpin kernel fb objects */
3441                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3442                                 r = amdgpu_bo_reserve(robj, true);
3443                                 if (r == 0) {
3444                                         amdgpu_bo_unpin(robj);
3445                                         amdgpu_bo_unreserve(robj);
3446                                 }
3447                         }
3448                 }
3449         }
3450
3451         amdgpu_ras_suspend(adev);
3452
3453         r = amdgpu_device_ip_suspend_phase1(adev);
3454
3455         amdgpu_amdkfd_suspend(adev, !fbcon);
3456
3457         /* evict vram memory */
3458         amdgpu_bo_evict_vram(adev);
3459
3460         amdgpu_fence_driver_suspend(adev);
3461
3462         r = amdgpu_device_ip_suspend_phase2(adev);
3463
3464         /* evict remaining vram memory
3465          * This second call to evict vram is to evict the gart page table
3466          * using the CPU.
3467          */
3468         amdgpu_bo_evict_vram(adev);
3469
3470         return 0;
3471 }
3472
3473 /**
3474  * amdgpu_device_resume - initiate device resume
3475  *
3476  * @dev: drm dev pointer
3477  * @fbcon: notify the fbdev of resume
3479  *
3480  * Bring the hw back to operating state (all asics).
3481  * Returns 0 for success or an error on failure.
3482  * Called at driver resume.
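 *
 * Rough ordering of the bring-up below (a summary of this function, not a
 * contract): re-post the card if needed, IP resume, fence driver resume,
 * IP late init, cursor re-pin (non-DC only), KFD resume, then modeset
 * restore, hotplug handling and RAS resume.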
3483  */
3484 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3485 {
3486         struct drm_connector *connector;
3487         struct drm_connector_list_iter iter;
3488         struct amdgpu_device *adev = dev->dev_private;
3489         struct drm_crtc *crtc;
3490         int r = 0;
3491
3492         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3493                 return 0;
3494
3495         /* post card */
3496         if (amdgpu_device_need_post(adev)) {
3497                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3498                 if (r)
3499                         DRM_ERROR("amdgpu asic init failed\n");
3500         }
3501
3502         r = amdgpu_device_ip_resume(adev);
3503         if (r) {
3504                 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3505                 return r;
3506         }
3507         amdgpu_fence_driver_resume(adev);
3508
3509
3510         r = amdgpu_device_ip_late_init(adev);
3511         if (r)
3512                 return r;
3513
3514         queue_delayed_work(system_wq, &adev->delayed_init_work,
3515                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3516
3517         if (!amdgpu_device_has_dc_support(adev)) {
3518                 /* pin cursors */
3519                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3520                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3521
3522                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3523                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3524                                 r = amdgpu_bo_reserve(aobj, true);
3525                                 if (r == 0) {
3526                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3527                                         if (r != 0)
3528                                                 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3529                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3530                                         amdgpu_bo_unreserve(aobj);
3531                                 }
3532                         }
3533                 }
3534         }
3535         r = amdgpu_amdkfd_resume(adev, !fbcon);
3536         if (r)
3537                 return r;
3538
3539         /* Make sure IB tests flushed */
3540         flush_delayed_work(&adev->delayed_init_work);
3541
3542         /* blat the mode back in */
3543         if (fbcon) {
3544                 if (!amdgpu_device_has_dc_support(adev)) {
3545                         /* pre DCE11 */
3546                         drm_helper_resume_force_mode(dev);
3547
3548                         /* turn on display hw */
3549                         drm_modeset_lock_all(dev);
3550
3551                         drm_connector_list_iter_begin(dev, &iter);
3552                         drm_for_each_connector_iter(connector, &iter)
3553                                 drm_helper_connector_dpms(connector,
3554                                                           DRM_MODE_DPMS_ON);
3555                         drm_connector_list_iter_end(&iter);
3556
3557                         drm_modeset_unlock_all(dev);
3558                 }
3559                 amdgpu_fbdev_set_suspend(adev, 0);
3560         }
3561
3562         drm_kms_helper_poll_enable(dev);
3563
3564         amdgpu_ras_resume(adev);
3565
3566         /*
3567          * Most of the connector probing functions try to acquire runtime pm
3568          * refs to ensure that the GPU is powered on when connector polling is
3569          * performed. Since we're calling this from a runtime PM callback,
3570          * trying to acquire rpm refs will cause us to deadlock.
3571          *
3572          * Since we're guaranteed to be holding the rpm lock, it's safe to
3573          * temporarily disable the rpm helpers so this doesn't deadlock us.
3574          */
3575 #ifdef CONFIG_PM
3576         dev->dev->power.disable_depth++;
3577 #endif
3578         if (!amdgpu_device_has_dc_support(adev))
3579                 drm_helper_hpd_irq_event(dev);
3580         else
3581                 drm_kms_helper_hotplug_event(dev);
3582 #ifdef CONFIG_PM
3583         dev->dev->power.disable_depth--;
3584 #endif
3585         adev->in_suspend = false;
3586
3587         return 0;
3588 }
3589
3590 /**
3591  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3592  *
3593  * @adev: amdgpu_device pointer
3594  *
3595  * The list of all the hardware IPs that make up the asic is walked and
3596  * the check_soft_reset callbacks are run.  check_soft_reset determines
3597  * if the asic is still hung or not.
3598  * Returns true if any of the IPs are still in a hung state, false if not.
3599  */
3600 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3601 {
3602         int i;
3603         bool asic_hang = false;
3604
3605         if (amdgpu_sriov_vf(adev))
3606                 return true;
3607
3608         if (amdgpu_asic_need_full_reset(adev))
3609                 return true;
3610
3611         for (i = 0; i < adev->num_ip_blocks; i++) {
3612                 if (!adev->ip_blocks[i].status.valid)
3613                         continue;
3614                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3615                         adev->ip_blocks[i].status.hang =
3616                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3617                 if (adev->ip_blocks[i].status.hang) {
3618                         DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3619                         asic_hang = true;
3620                 }
3621         }
3622         return asic_hang;
3623 }
3624
3625 /**
3626  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3627  *
3628  * @adev: amdgpu_device pointer
3629  *
3630  * The list of all the hardware IPs that make up the asic is walked and the
3631  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3632  * handles any IP specific hardware or software state changes that are
3633  * necessary for a soft reset to succeed.
3634  * Returns 0 on success, negative error code on failure.
3635  */
3636 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3637 {
3638         int i, r = 0;
3639
3640         for (i = 0; i < adev->num_ip_blocks; i++) {
3641                 if (!adev->ip_blocks[i].status.valid)
3642                         continue;
3643                 if (adev->ip_blocks[i].status.hang &&
3644                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3645                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3646                         if (r)
3647                                 return r;
3648                 }
3649         }
3650
3651         return 0;
3652 }
3653
3654 /**
3655  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3656  *
3657  * @adev: amdgpu_device pointer
3658  *
3659  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3660  * reset is necessary to recover.
3661  * Returns true if a full asic reset is required, false if not.
3662  */
3663 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3664 {
3665         int i;
3666
3667         if (amdgpu_asic_need_full_reset(adev))
3668                 return true;
3669
3670         for (i = 0; i < adev->num_ip_blocks; i++) {
3671                 if (!adev->ip_blocks[i].status.valid)
3672                         continue;
3673                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3674                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3675                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3676                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3677                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3678                         if (adev->ip_blocks[i].status.hang) {
3679                                 DRM_INFO("Some blocks need full reset!\n");
3680                                 return true;
3681                         }
3682                 }
3683         }
3684         return false;
3685 }
3686
3687 /**
3688  * amdgpu_device_ip_soft_reset - do a soft reset
3689  *
3690  * @adev: amdgpu_device pointer
3691  *
3692  * The list of all the hardware IPs that make up the asic is walked and the
3693  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3694  * IP specific hardware or software state changes that are necessary to soft
3695  * reset the IP.
3696  * Returns 0 on success, negative error code on failure.
3697  */
3698 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3699 {
3700         int i, r = 0;
3701
3702         for (i = 0; i < adev->num_ip_blocks; i++) {
3703                 if (!adev->ip_blocks[i].status.valid)
3704                         continue;
3705                 if (adev->ip_blocks[i].status.hang &&
3706                     adev->ip_blocks[i].version->funcs->soft_reset) {
3707                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3708                         if (r)
3709                                 return r;
3710                 }
3711         }
3712
3713         return 0;
3714 }
3715
3716 /**
3717  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3718  *
3719  * @adev: amdgpu_device pointer
3720  *
3721  * The list of all the hardware IPs that make up the asic is walked and the
3722  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3723  * handles any IP specific hardware or software state changes that are
3724  * necessary after the IP has been soft reset.
3725  * Returns 0 on success, negative error code on failure.
3726  */
3727 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3728 {
3729         int i, r = 0;
3730
3731         for (i = 0; i < adev->num_ip_blocks; i++) {
3732                 if (!adev->ip_blocks[i].status.valid)
3733                         continue;
3734                 if (adev->ip_blocks[i].status.hang &&
3735                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3736                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3737                 if (r)
3738                         return r;
3739         }
3740
3741         return 0;
3742 }
3743
3744 /**
3745  * amdgpu_device_recover_vram - Recover some VRAM contents
3746  *
3747  * @adev: amdgpu_device pointer
3748  *
3749  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3750  * restore things like GPUVM page tables after a GPU reset where
3751  * the contents of VRAM might be lost.
3752  *
3753  * Returns:
3754  * 0 on success, negative error code on failure.
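 *
 * The restore loop below is pipelined: each shadow copy is issued first
 * and the fence of the previous copy is waited on afterwards, so at most
 * one copy is in flight while the next one is being queued.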
3755  */
3756 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3757 {
3758         struct dma_fence *fence = NULL, *next = NULL;
3759         struct amdgpu_bo *shadow;
3760         long r = 1, tmo;
3761
3762         if (amdgpu_sriov_runtime(adev))
3763                 tmo = msecs_to_jiffies(8000);
3764         else
3765                 tmo = msecs_to_jiffies(100);
3766
3767         DRM_INFO("recover vram bo from shadow start\n");
3768         mutex_lock(&adev->shadow_list_lock);
3769         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3770
3771                 /* No need to recover an evicted BO */
3772                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3773                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3774                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3775                         continue;
3776
3777                 r = amdgpu_bo_restore_shadow(shadow, &next);
3778                 if (r)
3779                         break;
3780
3781                 if (fence) {
3782                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3783                         dma_fence_put(fence);
3784                         fence = next;
3785                         if (tmo == 0) {
3786                                 r = -ETIMEDOUT;
3787                                 break;
3788                         } else if (tmo < 0) {
3789                                 r = tmo;
3790                                 break;
3791                         }
3792                 } else {
3793                         fence = next;
3794                 }
3795         }
3796         mutex_unlock(&adev->shadow_list_lock);
3797
3798         if (fence)
3799                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3800         dma_fence_put(fence);
3801
3802         if (r < 0 || tmo <= 0) {
3803                 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3804                 return -EIO;
3805         }
3806
3807         DRM_INFO("recover vram bo from shadow done\n");
3808         return 0;
3809 }
3810
3811
3812 /**
3813  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3814  *
3815  * @adev: amdgpu device pointer
3816  * @from_hypervisor: request from hypervisor
3817  *
3818  * Do a VF FLR and reinitialize the ASIC.
3819  * Returns 0 on success, negative error code on failure.
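 *
 * Rough sequence (a summary of the function body, not a contract): request
 * full GPU access (or trigger the reset through the host), KFD pre-reset,
 * early IP re-init (blocks ahead of SMC), data exchange and GART recovery,
 * firmware loading, late IP re-init (SMC/CP/SDMA), IRQ resume and IB ring
 * tests, KFD post-reset, then release full GPU access and recover VRAM
 * from shadows if the host reports VRAM loss.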
3820  */
3821 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3822                                      bool from_hypervisor)
3823 {
3824         int r;
3825
3826         if (from_hypervisor)
3827                 r = amdgpu_virt_request_full_gpu(adev, true);
3828         else
3829                 r = amdgpu_virt_reset_gpu(adev);
3830         if (r)
3831                 return r;
3832
3833         amdgpu_amdkfd_pre_reset(adev);
3834
3835         /* Resume IP prior to SMC */
3836         r = amdgpu_device_ip_reinit_early_sriov(adev);
3837         if (r)
3838                 goto error;
3839
3840         amdgpu_virt_init_data_exchange(adev);
3841         /* we need to recover the gart prior to resuming SMC/CP/SDMA */
3842         amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3843
3844         r = amdgpu_device_fw_loading(adev);
3845         if (r)
3846                 return r;
3847
3848         /* now we are okay to resume SMC/CP/SDMA */
3849         r = amdgpu_device_ip_reinit_late_sriov(adev);
3850         if (r)
3851                 goto error;
3852
3853         amdgpu_irq_gpu_reset_resume_helper(adev);
3854         r = amdgpu_ib_ring_tests(adev);
3855         amdgpu_amdkfd_post_reset(adev);
3856
3857 error:
3858         amdgpu_virt_release_full_gpu(adev, true);
3859         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3860                 amdgpu_inc_vram_lost(adev);
3861                 r = amdgpu_device_recover_vram(adev);
3862         }
3863
3864         return r;
3865 }
3866
3867 /**
3868  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3869  *
3870  * @adev: amdgpu device pointer
3871  *
3872  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3873  * a hung GPU.
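 *
 * As implemented below: amdgpu.gpu_recovery=0 disables recovery, any other
 * non-zero value enables it, and the default of -1 enables it only for the
 * ASICs listed in the switch statement.  Recovery is always attempted for
 * SR-IOV VFs.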
3874  */
3875 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3876 {
3877         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3878                 DRM_INFO("Timeout, but no hardware hang detected.\n");
3879                 return false;
3880         }
3881
3882         if (amdgpu_gpu_recovery == 0)
3883                 goto disabled;
3884
3885         if (amdgpu_sriov_vf(adev))
3886                 return true;
3887
3888         if (amdgpu_gpu_recovery == -1) {
3889                 switch (adev->asic_type) {
3890                 case CHIP_BONAIRE:
3891                 case CHIP_HAWAII:
3892                 case CHIP_TOPAZ:
3893                 case CHIP_TONGA:
3894                 case CHIP_FIJI:
3895                 case CHIP_POLARIS10:
3896                 case CHIP_POLARIS11:
3897                 case CHIP_POLARIS12:
3898                 case CHIP_VEGAM:
3899                 case CHIP_VEGA20:
3900                 case CHIP_VEGA10:
3901                 case CHIP_VEGA12:
3902                 case CHIP_RAVEN:
3903                 case CHIP_ARCTURUS:
3904                 case CHIP_RENOIR:
3905                 case CHIP_NAVI10:
3906                 case CHIP_NAVI14:
3907                 case CHIP_NAVI12:
3908                         break;
3909                 default:
3910                         goto disabled;
3911                 }
3912         }
3913
3914         return true;
3915
3916 disabled:
3917         DRM_INFO("GPU recovery disabled.\n");
3918         return false;
3919 }
3920
3921
3922 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3923                                         struct amdgpu_job *job,
3924                                         bool *need_full_reset_arg)
3925 {
3926         int i, r = 0;
3927         bool need_full_reset  = *need_full_reset_arg;
3928
3929         amdgpu_debugfs_wait_dump(adev);
3930
3931         /* block all schedulers and reset given job's ring */
3932         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3933                 struct amdgpu_ring *ring = adev->rings[i];
3934
3935                 if (!ring || !ring->sched.thread)
3936                         continue;
3937
3938                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3939                 amdgpu_fence_driver_force_completion(ring);
3940         }
3941
3942         if (job)
3943                 drm_sched_increase_karma(&job->base);
3944
3945         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3946         if (!amdgpu_sriov_vf(adev)) {
3947
3948                 if (!need_full_reset)
3949                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3950
3951                 if (!need_full_reset) {
3952                         amdgpu_device_ip_pre_soft_reset(adev);
3953                         r = amdgpu_device_ip_soft_reset(adev);
3954                         amdgpu_device_ip_post_soft_reset(adev);
3955                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
3956                                 DRM_INFO("soft reset failed, will fallback to full reset!\n");
3957                                 need_full_reset = true;
3958                         }
3959                 }
3960
3961                 if (need_full_reset)
3962                         r = amdgpu_device_ip_suspend(adev);
3963
3964                 *need_full_reset_arg = need_full_reset;
3965         }
3966
3967         return r;
3968 }
3969
3970 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
3971                                struct list_head *device_list_handle,
3972                                bool *need_full_reset_arg)
3973 {
3974         struct amdgpu_device *tmp_adev = NULL;
3975         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
3976         int r = 0;
3977
3978         /*
3979          * ASIC reset has to be done on all XGMI hive nodes ASAP
3980          * to allow proper link negotiation in FW (within 1 sec)
3981          */
3982         if (need_full_reset) {
3983                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3984                         /* For XGMI run all resets in parallel to speed up the process */
3985                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3986                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
3987                                         r = -EALREADY;
3988                         } else
3989                                 r = amdgpu_asic_reset(tmp_adev);
3990
3991                         if (r) {
3992                                 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
3993                                          r, tmp_adev->ddev->unique);
3994                                 break;
3995                         }
3996                 }
3997
3998                 /* For XGMI wait for all resets to complete before proceeding */
3999                 if (!r) {
4000                         list_for_each_entry(tmp_adev, device_list_handle,
4001                                             gmc.xgmi.head) {
4002                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4003                                         flush_work(&tmp_adev->xgmi_reset_work);
4004                                         r = tmp_adev->asic_reset_res;
4005                                         if (r)
4006                                                 break;
4007                                 }
4008                         }
4009                 }
4010         }
4011
4012         if (!r && amdgpu_ras_intr_triggered()) {
4013                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4014                         if (tmp_adev->mmhub.funcs &&
4015                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4016                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4017                 }
4018
4019                 amdgpu_ras_intr_cleared();
4020         }
4021
4022         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4023                 if (need_full_reset) {
4024                         /* post card */
4025                         if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
4026                                 DRM_WARN("asic atom init failed!");
4027
4028                         if (!r) {
4029                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4030                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4031                                 if (r)
4032                                         goto out;
4033
4034                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4035                                 if (vram_lost) {
4036                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4037                                         amdgpu_inc_vram_lost(tmp_adev);
4038                                 }
4039
4040                                 r = amdgpu_gtt_mgr_recover(
4041                                         &tmp_adev->mman.bdev.man[TTM_PL_TT]);
4042                                 if (r)
4043                                         goto out;
4044
4045                                 r = amdgpu_device_fw_loading(tmp_adev);
4046                                 if (r)
4047                                         return r;
4048
4049                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4050                                 if (r)
4051                                         goto out;
4052
4053                                 if (vram_lost)
4054                                         amdgpu_device_fill_reset_magic(tmp_adev);
4055
4056                                 /*
4057                                  * Add this ASIC back as tracked since the reset
4058                                  * already completed successfully.
4059                                  */
4060                                 amdgpu_register_gpu_instance(tmp_adev);
4061
4062                                 r = amdgpu_device_ip_late_init(tmp_adev);
4063                                 if (r)
4064                                         goto out;
4065
4066                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4067
4068                                 /* must succeed. */
4069                                 amdgpu_ras_resume(tmp_adev);
4070
4071                                 /* Update PSP FW topology after reset */
4072                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4073                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4074                         }
4075                 }
4076
4077
4078 out:
4079                 if (!r) {
4080                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4081                         r = amdgpu_ib_ring_tests(tmp_adev);
4082                         if (r) {
4083                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4084                                 r = amdgpu_device_ip_suspend(tmp_adev);
4085                                 need_full_reset = true;
4086                                 r = -EAGAIN;
4087                                 goto end;
4088                         }
4089                 }
4090
4091                 if (!r)
4092                         r = amdgpu_device_recover_vram(tmp_adev);
4093                 else
4094                         tmp_adev->asic_reset_res = r;
4095         }
4096
4097 end:
4098         *need_full_reset_arg = need_full_reset;
4099         return r;
4100 }
4101
4102 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
4103 {
4104         if (trylock) {
4105                 if (!mutex_trylock(&adev->lock_reset))
4106                         return false;
4107         } else
4108                 mutex_lock(&adev->lock_reset);
4109
4110         atomic_inc(&adev->gpu_reset_counter);
4111         adev->in_gpu_reset = true;
4112         switch (amdgpu_asic_reset_method(adev)) {
4113         case AMD_RESET_METHOD_MODE1:
4114                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4115                 break;
4116         case AMD_RESET_METHOD_MODE2:
4117                 adev->mp1_state = PP_MP1_STATE_RESET;
4118                 break;
4119         default:
4120                 adev->mp1_state = PP_MP1_STATE_NONE;
4121                 break;
4122         }
4123
4124         return true;
4125 }
4126
4127 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4128 {
4129         amdgpu_vf_error_trans_all(adev);
4130         adev->mp1_state = PP_MP1_STATE_NONE;
4131         adev->in_gpu_reset = false;
4132         mutex_unlock(&adev->lock_reset);
4133 }
4134
4135 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4136 {
4137         struct pci_dev *p = NULL;
4138
4139         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4140                         adev->pdev->bus->number, 1);
4141         if (p) {
4142                 pm_runtime_enable(&(p->dev));
4143                 pm_runtime_resume(&(p->dev));
4144         }
4145 }
4146
4147 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4148 {
4149         enum amd_reset_method reset_method;
4150         struct pci_dev *p = NULL;
4151         u64 expires;
4152
4153         /*
4154          * For now, only BACO and mode1 reset are confirmed to suffer
4155          * from the audio issue when audio is not properly suspended.
4156          */
4157         reset_method = amdgpu_asic_reset_method(adev);
4158         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4159              (reset_method != AMD_RESET_METHOD_MODE1))
4160                 return -EINVAL;
4161
4162         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4163                         adev->pdev->bus->number, 1);
4164         if (!p)
4165                 return -ENODEV;
4166
4167         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4168         if (!expires)
4169                 /*
4170                  * If we cannot get the audio device autosuspend delay,
4171                  * a fixed 4s interval will be used. Since 3s is the audio
4172                  * controller's default autosuspend delay, the 4s used here
4173                  * is guaranteed to cover it.
4174                  */
4175                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4176
4177         while (!pm_runtime_status_suspended(&(p->dev))) {
4178                 if (!pm_runtime_suspend(&(p->dev)))
4179                         break;
4180
4181                 if (expires < ktime_get_mono_fast_ns()) {
4182                         dev_warn(adev->dev, "failed to suspend display audio\n");
4183                         /* TODO: abort the succeeding gpu reset? */
4184                         return -ETIMEDOUT;
4185                 }
4186         }
4187
4188         pm_runtime_disable(&(p->dev));
4189
4190         return 0;
4191 }
4192
4193 /**
4194  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4195  *
4196  * @adev: amdgpu device pointer
4197  * @job: which job triggered the hang
4198  *
4199  * Attempt to reset the GPU if it has hung (all asics).
4200  * Attempt a soft reset or a full reset and reinitialize the ASIC.
4201  * Returns 0 for success or an error on failure.
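 *
 * High-level flow (a summary of the function body, not a contract): lock
 * every device in the (optional) XGMI hive, suspend display audio and KFD,
 * stop the schedulers, run the per-device pre-reset/soft-reset path, fall
 * back to a full ASIC reset where needed, then resume IPs, restart the
 * schedulers and release the locks.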
4202  */
4203
4204 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4205                               struct amdgpu_job *job)
4206 {
4207         struct list_head device_list, *device_list_handle =  NULL;
4208         bool need_full_reset = false;
4209         bool job_signaled = false;
4210         struct amdgpu_hive_info *hive = NULL;
4211         struct amdgpu_device *tmp_adev = NULL;
4212         int i, r = 0;
4213         bool in_ras_intr = amdgpu_ras_intr_triggered();
4214         bool use_baco =
4215                 (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) ?
4216                 true : false;
4217         bool audio_suspended = false;
4218
4219         /*
4220          * Flush RAM to disk so that after reboot
4221          * the user can read the log and see why the system rebooted.
4222          */
4223         if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
4224
4225                 DRM_WARN("Emergency reboot.");
4226
4227                 ksys_sync_helper();
4228                 emergency_restart();
4229         }
4230
4231         dev_info(adev->dev, "GPU %s begin!\n",
4232                 (in_ras_intr && !use_baco) ? "jobs stop":"reset");
4233
4234         /*
4235          * Here we trylock to avoid a chain of resets executing, triggered
4236          * either by jobs on different adevs in an XGMI hive or by jobs on
4237          * different schedulers for the same device, while this TO handler
4238          * is running. We always reset all schedulers for a device and all
4239          * devices in an XGMI hive, so that should take care of them too.
4240          */
4241         hive = amdgpu_get_xgmi_hive(adev, true);
4242         if (hive && !mutex_trylock(&hive->reset_lock)) {
4243                 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4244                           job ? job->base.id : -1, hive->hive_id);
4245                 mutex_unlock(&hive->hive_lock);
4246                 return 0;
4247         }
4248
4249         /*
4250          * Build list of devices to reset.
4251          * In case we are in XGMI hive mode, resort the device list
4252          * to put adev in the 1st position.
4253          */
4254         INIT_LIST_HEAD(&device_list);
4255         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4256                 if (!hive)
4257                         return -ENODEV;
4258                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4259                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4260                 device_list_handle = &hive->device_list;
4261         } else {
4262                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4263                 device_list_handle = &device_list;
4264         }
4265
4266         /* block all schedulers and reset given job's ring */
4267         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4268                 if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
4269                         DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4270                                   job ? job->base.id : -1);
4271                         mutex_unlock(&hive->hive_lock);
4272                         return 0;
4273                 }
4274
4275                 /*
4276                  * Try to put the audio codec into suspend state
4277                  * before the gpu reset starts.
4278                  *
4279                  * Because the power domain of the graphics device
4280                  * is shared with the AZ power domain, without this
4281                  * we may change the audio hardware from behind
4282                  * the audio driver's back and trigger
4283                  * some audio codec errors.
4284                  */
4285                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4286                         audio_suspended = true;
4287
4288                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4289
4290                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4291
4292                 if (!amdgpu_sriov_vf(tmp_adev))
4293                         amdgpu_amdkfd_pre_reset(tmp_adev);
4294
4295                 /*
4296                  * Mark these ASICs to be reset as untracked first,
4297                  * and add them back after the reset completes.
4298                  */
4299                 amdgpu_unregister_gpu_instance(tmp_adev);
4300
4301                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4302
4303                 /* disable ras on ALL IPs */
4304                 if (!(in_ras_intr && !use_baco) &&
4305                       amdgpu_device_ip_need_full_reset(tmp_adev))
4306                         amdgpu_ras_suspend(tmp_adev);
4307
4308                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4309                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4310
4311                         if (!ring || !ring->sched.thread)
4312                                 continue;
4313
4314                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4315
4316                         if (in_ras_intr && !use_baco)
4317                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4318                 }
4319         }
4320
4321         if (in_ras_intr && !use_baco)
4322                 goto skip_sched_resume;
4323
4324         /*
4325          * Must check the guilty signal here, since after this point all old
4326          * HW fences are force-signaled.
4327          *
4328          * job->base holds a reference to the parent fence.
4329          */
4330         if (job && job->base.s_fence->parent &&
4331             dma_fence_is_signaled(job->base.s_fence->parent)) {
4332                 job_signaled = true;
4333                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4334                 goto skip_hw_reset;
4335         }
4336
4337 retry:  /* Pre asic reset for the rest of the adevs in the XGMI hive. */
4338         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4339                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4340                                                  NULL,
4341                                                  &need_full_reset);
4342                 /* TODO: Should we stop? */
4343                 if (r) {
4344                         DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4345                                   r, tmp_adev->ddev->unique);
4346                         tmp_adev->asic_reset_res = r;
4347                 }
4348         }
4349
4350         /* Actual ASIC resets if needed. */
4351         /* TODO Implement XGMI hive reset logic for SRIOV */
4352         if (amdgpu_sriov_vf(adev)) {
4353                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4354                 if (r)
4355                         adev->asic_reset_res = r;
4356         } else {
4357                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
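                /* a -EAGAIN from the reset helper asks for another pre-reset/reset pass */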
4358                 if (r && r == -EAGAIN)
4359                         goto retry;
4360         }
4361
4362 skip_hw_reset:
4363
4364         /* Post ASIC reset for all devs. */
4365         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4366
4367                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4368                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4369
4370                         if (!ring || !ring->sched.thread)
4371                                 continue;
4372
4373                         /* No point in resubmitting jobs if we didn't do a HW reset */
4374                         if (!tmp_adev->asic_reset_res && !job_signaled)
4375                                 drm_sched_resubmit_jobs(&ring->sched);
4376
4377                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4378                 }
4379
4380                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4381                         drm_helper_resume_force_mode(tmp_adev->ddev);
4382                 }
4383
4384                 tmp_adev->asic_reset_res = 0;
4385
4386                 if (r) {
4387                         /* bad news, how do we tell userspace? */
4388                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4389                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4390                 } else {
4391                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4392                 }
4393         }
4394
4395 skip_sched_resume:
4396         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4397                 /* unlock kfd: SRIOV would do it separately */
4398                 if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
4399                         amdgpu_amdkfd_post_reset(tmp_adev);
4400                 if (audio_suspended)
4401                         amdgpu_device_resume_display_audio(tmp_adev);
4402                 amdgpu_device_unlock_adev(tmp_adev);
4403         }
4404
4405         if (hive) {
4406                 mutex_unlock(&hive->reset_lock);
4407                 mutex_unlock(&hive->hive_lock);
4408         }
4409
4410         if (r)
4411                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4412         return r;
4413 }
4414
4415 /**
4416  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4417  *
4418  * @adev: amdgpu_device pointer
4419  *
4420  * Fetches and stores in the driver the PCIE capabilities (gen speed
4421  * and lanes) of the slot the device is in. Handles APUs and
4422  * virtualized environments where PCIE config space may not be available.
4423  */
4424 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4425 {
4426         struct pci_dev *pdev;
4427         enum pci_bus_speed speed_cap, platform_speed_cap;
4428         enum pcie_link_width platform_link_width;
4429
4430         if (amdgpu_pcie_gen_cap)
4431                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4432
4433         if (amdgpu_pcie_lane_cap)
4434                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4435
4436         /* covers APUs as well */
4437         if (pci_is_root_bus(adev->pdev->bus)) {
4438                 if (adev->pm.pcie_gen_mask == 0)
4439                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4440                 if (adev->pm.pcie_mlw_mask == 0)
4441                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4442                 return;
4443         }
4444
4445         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4446                 return;
4447
4448         pcie_bandwidth_available(adev->pdev, NULL,
4449                                  &platform_speed_cap, &platform_link_width);
4450
4451         if (adev->pm.pcie_gen_mask == 0) {
4452                 /* asic caps */
4453                 pdev = adev->pdev;
4454                 speed_cap = pcie_get_speed_cap(pdev);
4455                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4456                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4457                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4458                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4459                 } else {
4460                         if (speed_cap == PCIE_SPEED_16_0GT)
4461                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4462                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4463                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4464                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4465                         else if (speed_cap == PCIE_SPEED_8_0GT)
4466                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4467                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4468                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4469                         else if (speed_cap == PCIE_SPEED_5_0GT)
4470                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4471                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4472                         else
4473                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4474                 }
4475                 /* platform caps */
4476                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4477                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4478                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4479                 } else {
4480                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4481                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4482                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4483                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4484                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4485                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4486                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4487                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4488                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4489                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4490                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4491                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4492                         else
4493                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4494
4495                 }
4496         }
4497         if (adev->pm.pcie_mlw_mask == 0) {
4498                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4499                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4500                 } else {
4501                         switch (platform_link_width) {
4502                         case PCIE_LNK_X32:
4503                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4504                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4505                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4506                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4507                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4508                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4509                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4510                                 break;
4511                         case PCIE_LNK_X16:
4512                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4513                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4514                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4515                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4516                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4517                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4518                                 break;
4519                         case PCIE_LNK_X12:
4520                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4521                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4522                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4523                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4524                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4525                                 break;
4526                         case PCIE_LNK_X8:
4527                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4528                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4529                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4530                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4531                                 break;
4532                         case PCIE_LNK_X4:
4533                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4534                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4535                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4536                                 break;
4537                         case PCIE_LNK_X2:
4538                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4539                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4540                                 break;
4541                         case PCIE_LNK_X1:
4542                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4543                                 break;
4544                         default:
4545                                 break;
4546                         }
4547                 }
4548         }
4549 }
4550
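/**
 * amdgpu_device_baco_enter - put the device into the BACO state
 *
 * @dev: drm_device pointer
 *
 * Disables doorbell interrupts while RAS is active, then asks the DPM code
 * to enter BACO (Bus Active, Chip Off).
 * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code on failure.
 */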
4551 int amdgpu_device_baco_enter(struct drm_device *dev)
4552 {
4553         struct amdgpu_device *adev = dev->dev_private;
4554         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4555
4556         if (!amdgpu_device_supports_baco(adev->ddev))
4557                 return -ENOTSUPP;
4558
4559         if (ras && ras->supported)
4560                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4561
4562         return amdgpu_dpm_baco_enter(adev);
4563 }
4564
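/**
 * amdgpu_device_baco_exit - bring the device out of the BACO state
 *
 * @dev: drm_device pointer
 *
 * Asks the DPM code to leave BACO and re-enables doorbell interrupts
 * if RAS is active.
 * Returns 0 on success, -ENOTSUPP if the device does not support BACO,
 * or a negative error code on failure.
 */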
4565 int amdgpu_device_baco_exit(struct drm_device *dev)
4566 {
4567         struct amdgpu_device *adev = dev->dev_private;
4568         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4569         int ret = 0;
4570
4571         if (!amdgpu_device_supports_baco(adev->ddev))
4572                 return -ENOTSUPP;
4573
4574         ret = amdgpu_dpm_baco_exit(adev);
4575         if (ret)
4576                 return ret;
4577
4578         if (ras && ras->supported)
4579                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4580
4581         return 0;
4582 }