drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
   /*
    * gpu_info firmware images this module may request; MODULE_FIRMWARE()
    * records each file name in the module metadata.
    */
   MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
   MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");

   /*
    * NOTE(review): presumably a resume-related time value in milliseconds;
    * its users are outside this excerpt - confirm.
    */
   #define AMDGPU_RESUME_MS                2000
  86
  87 const char *amdgpu_asic_name[] = {
  88         "TAHITI",
  89         "PITCAIRN",
  90         "VERDE",
  91         "OLAND",
  92         "HAINAN",
  93         "BONAIRE",
  94         "KAVERI",
  95         "KABINI",
  96         "HAWAII",
  97         "MULLINS",
  98         "TOPAZ",
  99         "TONGA",
 100         "FIJI",
 101         "CARRIZO",
 102         "STONEY",
 103         "POLARIS10",
 104         "POLARIS11",
 105         "POLARIS12",
 106         "VEGAM",
 107         "VEGA10",
 108         "VEGA12",
 109         "VEGA20",
 110         "RAVEN",
 111         "ARCTURUS",
 112         "RENOIR",
 113         "NAVI10",
 114         "NAVI14",
 115         "NAVI12",
 116         "SIENNA_CICHLID",
 117         "NAVY_FLOUNDER",
 118         "LAST",
 119 };
 120
 121 /**
 122  * DOC: pcie_replay_count
 123  *
 124  * The amdgpu driver provides a sysfs API for reporting the total number
 125  * of PCIe replays (NAKs)
 126  * The file pcie_replay_count is used for this and returns the total
 127  * number of replays as a sum of the NAKs generated and NAKs received
 128  */
 129
 130 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 131                 struct device_attribute *attr, char *buf)
 132 {
 133         struct drm_device *ddev = dev_get_drvdata(dev);
 134         struct amdgpu_device *adev = drm_to_adev(ddev);
 135         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 136
 137         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 138 }
 139
 140 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 141                 amdgpu_device_get_pcie_replay_count, NULL);
 142
 143 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 144
 145 /**
 146  * DOC: product_name
 147  *
 148  * The amdgpu driver provides a sysfs API for reporting the product name
 149  * for the device
 150  * The file serial_number is used for this and returns the product name
 151  * as returned from the FRU.
 152  * NOTE: This is only available for certain server cards
 153  */
 154
 155 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 156                 struct device_attribute *attr, char *buf)
 157 {
 158         struct drm_device *ddev = dev_get_drvdata(dev);
 159         struct amdgpu_device *adev = drm_to_adev(ddev);
 160
 161         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 162 }
 163
 164 static DEVICE_ATTR(product_name, S_IRUGO,
 165                 amdgpu_device_get_product_name, NULL);
 166
 167 /**
 168  * DOC: product_number
 169  *
 170  * The amdgpu driver provides a sysfs API for reporting the part number
 171  * for the device
 172  * The file serial_number is used for this and returns the part number
 173  * as returned from the FRU.
 174  * NOTE: This is only available for certain server cards
 175  */
 176
 177 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 178                 struct device_attribute *attr, char *buf)
 179 {
 180         struct drm_device *ddev = dev_get_drvdata(dev);
 181         struct amdgpu_device *adev = drm_to_adev(ddev);
 182
 183         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 184 }
 185
 186 static DEVICE_ATTR(product_number, S_IRUGO,
 187                 amdgpu_device_get_product_number, NULL);
 188
 189 /**
 190  * DOC: serial_number
 191  *
 192  * The amdgpu driver provides a sysfs API for reporting the serial number
 193  * for the device
 194  * The file serial_number is used for this and returns the serial number
 195  * as returned from the FRU.
 196  * NOTE: This is only available for certain server cards
 197  */
 198
 199 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 200                 struct device_attribute *attr, char *buf)
 201 {
 202         struct drm_device *ddev = dev_get_drvdata(dev);
 203         struct amdgpu_device *adev = drm_to_adev(ddev);
 204
 205         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 206 }
 207
 208 static DEVICE_ATTR(serial_number, S_IRUGO,
 209                 amdgpu_device_get_serial_number, NULL);
 210
 211 /**
 212  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 213  *
 214  * @dev: drm_device pointer
 215  *
 216  * Returns true if the device is a dGPU with HG/PX power control,
 217  * otherwise return false.
 218  */
 219 bool amdgpu_device_supports_boco(struct drm_device *dev)
 220 {
 221         struct amdgpu_device *adev = drm_to_adev(dev);
 222
 223         if (adev->flags & AMD_IS_PX)
 224                 return true;
 225         return false;
 226 }
 227
 228 /**
 229  * amdgpu_device_supports_baco - Does the device support BACO
 230  *
 231  * @dev: drm_device pointer
 232  *
 233  * Returns true if the device supporte BACO,
 234  * otherwise return false.
 235  */
 236 bool amdgpu_device_supports_baco(struct drm_device *dev)
 237 {
 238         struct amdgpu_device *adev = drm_to_adev(dev);
 239
 240         return amdgpu_asic_supports_baco(adev);
 241 }
 242
 243 /**
 244  * VRAM access helper functions.
 245  *
 246  * amdgpu_device_vram_access - read/write a buffer in vram
 247  *
 248  * @adev: amdgpu_device pointer
 249  * @pos: offset of the buffer in vram
 250  * @buf: virtual address of the buffer in system memory
 251  * @size: read/write size, sizeof(@buf) must > @size
 252  * @write: true - write to vram, otherwise - read from vram
 253  */
 254 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 255                                uint32_t *buf, size_t size, bool write)
 256 {
 257         unsigned long flags;
 258         uint32_t hi = ~0;
 259         uint64_t last;
 260
 261
 262 #ifdef CONFIG_64BIT
 263         last = min(pos + size, adev->gmc.visible_vram_size);
 264         if (last > pos) {
 265                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 266                 size_t count = last - pos;
 267
 268                 if (write) {
 269                         memcpy_toio(addr, buf, count);
 270                         mb();
 271                         amdgpu_asic_flush_hdp(adev, NULL);
 272                 } else {
 273                         amdgpu_asic_invalidate_hdp(adev, NULL);
 274                         mb();
 275                         memcpy_fromio(buf, addr, count);
 276                 }
 277
 278                 if (count == size)
 279                         return;
 280
 281                 pos += count;
 282                 buf += count / 4;
 283                 size -= count;
 284         }
 285 #endif
 286
 287         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 288         for (last = pos + size; pos < last; pos += 4) {
 289                 uint32_t tmp = pos >> 31;
 290
 291                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 292                 if (tmp != hi) {
 293                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 294                         hi = tmp;
 295                 }
 296                 if (write)
 297                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 298                 else
 299                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 300         }
 301         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 302 }
 303
 304 /*
 305  * register access helper functions.
 306  */
 307 /**
 308  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 309  *
 310  * @adev: amdgpu_device pointer
 311  * @reg: dword aligned register offset
 312  * @acc_flags: access flags which require special behavior
 313  *
 314  * Returns the 32 bit value from the offset specified.
 315  */
 316 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 317                             uint32_t reg, uint32_t acc_flags)
 318 {
 319         uint32_t ret;
 320
 321         if (adev->in_pci_err_recovery)
 322                 return 0;
 323
 324         if ((reg * 4) < adev->rmmio_size) {
 325                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 326                     amdgpu_sriov_runtime(adev) &&
 327                     down_read_trylock(&adev->reset_sem)) {
 328                         ret = amdgpu_kiq_rreg(adev, reg);
 329                         up_read(&adev->reset_sem);
 330                 } else {
 331                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 332                 }
 333         } else {
 334                 ret = adev->pcie_rreg(adev, reg * 4);
 335         }
 336
 337         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 338
 339         return ret;
 340 }
 341
 342 /*
 343  * MMIO register read with bytes helper functions
 344  * @offset:bytes offset from MMIO start
 345  *
 346 */
 347
 348 /**
 349  * amdgpu_mm_rreg8 - read a memory mapped IO register
 350  *
 351  * @adev: amdgpu_device pointer
 352  * @offset: byte aligned register offset
 353  *
 354  * Returns the 8 bit value from the offset specified.
 355  */
 356 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 357 {
 358         if (adev->in_pci_err_recovery)
 359                 return 0;
 360
 361         if (offset < adev->rmmio_size)
 362                 return (readb(adev->rmmio + offset));
 363         BUG();
 364 }
 365
 366 /*
 367  * MMIO register write with bytes helper functions
 368  * @offset:bytes offset from MMIO start
 369  * @value: the value want to be written to the register
 370  *
 371 */
 372 /**
 373  * amdgpu_mm_wreg8 - read a memory mapped IO register
 374  *
 375  * @adev: amdgpu_device pointer
 376  * @offset: byte aligned register offset
 377  * @value: 8 bit value to write
 378  *
 379  * Writes the value specified to the offset specified.
 380  */
 381 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 382 {
 383         if (adev->in_pci_err_recovery)
 384                 return;
 385
 386         if (offset < adev->rmmio_size)
 387                 writeb(value, adev->rmmio + offset);
 388         else
 389                 BUG();
 390 }
 391
 392 /**
 393  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 394  *
 395  * @adev: amdgpu_device pointer
 396  * @reg: dword aligned register offset
 397  * @v: 32 bit value to write to the register
 398  * @acc_flags: access flags which require special behavior
 399  *
 400  * Writes the value specified to the offset specified.
 401  */
 402 void amdgpu_device_wreg(struct amdgpu_device *adev,
 403                         uint32_t reg, uint32_t v,
 404                         uint32_t acc_flags)
 405 {
 406         if (adev->in_pci_err_recovery)
 407                 return;
 408
 409         if ((reg * 4) < adev->rmmio_size) {
 410                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 411                     amdgpu_sriov_runtime(adev) &&
 412                     down_read_trylock(&adev->reset_sem)) {
 413                         amdgpu_kiq_wreg(adev, reg, v);
 414                         up_read(&adev->reset_sem);
 415                 } else {
 416                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 417                 }
 418         } else {
 419                 adev->pcie_wreg(adev, reg * 4, v);
 420         }
 421
 422         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 423 }
 424
 425 /*
 426  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 427  *
 428  * this function is invoked only the debugfs register access
 429  * */
 430 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 431                              uint32_t reg, uint32_t v)
 432 {
 433         if (adev->in_pci_err_recovery)
 434                 return;
 435
 436         if (amdgpu_sriov_fullaccess(adev) &&
 437             adev->gfx.rlc.funcs &&
 438             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 439                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 440                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 441         } else {
 442                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 443         }
 444 }
 445
 446 /**
 447  * amdgpu_io_rreg - read an IO register
 448  *
 449  * @adev: amdgpu_device pointer
 450  * @reg: dword aligned register offset
 451  *
 452  * Returns the 32 bit value from the offset specified.
 453  */
 454 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 455 {
 456         if (adev->in_pci_err_recovery)
 457                 return 0;
 458
 459         if ((reg * 4) < adev->rio_mem_size)
 460                 return ioread32(adev->rio_mem + (reg * 4));
 461         else {
 462                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 463                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 464         }
 465 }
 466
 467 /**
 468  * amdgpu_io_wreg - write to an IO register
 469  *
 470  * @adev: amdgpu_device pointer
 471  * @reg: dword aligned register offset
 472  * @v: 32 bit value to write to the register
 473  *
 474  * Writes the value specified to the offset specified.
 475  */
 476 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 477 {
 478         if (adev->in_pci_err_recovery)
 479                 return;
 480
 481         if ((reg * 4) < adev->rio_mem_size)
 482                 iowrite32(v, adev->rio_mem + (reg * 4));
 483         else {
 484                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 485                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 486         }
 487 }
 488
 489 /**
 490  * amdgpu_mm_rdoorbell - read a doorbell dword
 491  *
 492  * @adev: amdgpu_device pointer
 493  * @index: doorbell index
 494  *
 495  * Returns the value in the doorbell aperture at the
 496  * requested doorbell index (CIK).
 497  */
 498 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 499 {
 500         if (adev->in_pci_err_recovery)
 501                 return 0;
 502
 503         if (index < adev->doorbell.num_doorbells) {
 504                 return readl(adev->doorbell.ptr + index);
 505         } else {
 506                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 507                 return 0;
 508         }
 509 }
 510
 511 /**
 512  * amdgpu_mm_wdoorbell - write a doorbell dword
 513  *
 514  * @adev: amdgpu_device pointer
 515  * @index: doorbell index
 516  * @v: value to write
 517  *
 518  * Writes @v to the doorbell aperture at the
 519  * requested doorbell index (CIK).
 520  */
 521 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 522 {
 523         if (adev->in_pci_err_recovery)
 524                 return;
 525
 526         if (index < adev->doorbell.num_doorbells) {
 527                 writel(v, adev->doorbell.ptr + index);
 528         } else {
 529                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 530         }
 531 }
 532
 533 /**
 534  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 535  *
 536  * @adev: amdgpu_device pointer
 537  * @index: doorbell index
 538  *
 539  * Returns the value in the doorbell aperture at the
 540  * requested doorbell index (VEGA10+).
 541  */
 542 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 543 {
 544         if (adev->in_pci_err_recovery)
 545                 return 0;
 546
 547         if (index < adev->doorbell.num_doorbells) {
 548                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 549         } else {
 550                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 551                 return 0;
 552         }
 553 }
 554
 555 /**
 556  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 557  *
 558  * @adev: amdgpu_device pointer
 559  * @index: doorbell index
 560  * @v: value to write
 561  *
 562  * Writes @v to the doorbell aperture at the
 563  * requested doorbell index (VEGA10+).
 564  */
 565 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 566 {
 567         if (adev->in_pci_err_recovery)
 568                 return;
 569
 570         if (index < adev->doorbell.num_doorbells) {
 571                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 572         } else {
 573                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 574         }
 575 }
 576
 577 /**
 578  * amdgpu_device_indirect_rreg - read an indirect register
 579  *
 580  * @adev: amdgpu_device pointer
 581  * @pcie_index: mmio register offset
 582  * @pcie_data: mmio register offset
 583  *
 584  * Returns the value of indirect register @reg_addr
 585  */
 586 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 587                                 u32 pcie_index, u32 pcie_data,
 588                                 u32 reg_addr)
 589 {
 590         unsigned long flags;
 591         u32 r;
 592         void __iomem *pcie_index_offset;
 593         void __iomem *pcie_data_offset;
 594
 595         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 596         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 597         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 598
 599         writel(reg_addr, pcie_index_offset);
 600         readl(pcie_index_offset);
 601         r = readl(pcie_data_offset);
 602         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 603
 604         return r;
 605 }
 606
 607 /**
 608  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 609  *
 610  * @adev: amdgpu_device pointer
 611  * @pcie_index: mmio register offset
 612  * @pcie_data: mmio register offset
 613  *
 614  * Returns the value of indirect register @reg_addr
 615  */
 616 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 617                                   u32 pcie_index, u32 pcie_data,
 618                                   u32 reg_addr)
 619 {
 620         unsigned long flags;
 621         u64 r;
 622         void __iomem *pcie_index_offset;
 623         void __iomem *pcie_data_offset;
 624
 625         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 626         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 627         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 628
 629         /* read low 32 bits */
 630         writel(reg_addr, pcie_index_offset);
 631         readl(pcie_index_offset);
 632         r = readl(pcie_data_offset);
 633         /* read high 32 bits */
 634         writel(reg_addr + 4, pcie_index_offset);
 635         readl(pcie_index_offset);
 636         r |= ((u64)readl(pcie_data_offset) << 32);
 637         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 638
 639         return r;
 640 }
 641
 642 /**
 643  * amdgpu_device_indirect_wreg - write an indirect register address
 644  *
 645  * @adev: amdgpu_device pointer
 646  * @pcie_index: mmio register offset
 647  * @pcie_data: mmio register offset
 648  * @reg_addr: indirect register offset
 649  * @reg_data: indirect register data
 650  *
 651  */
 652 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 653                                  u32 pcie_index, u32 pcie_data,
 654                                  u32 reg_addr, u32 reg_data)
 655 {
 656         unsigned long flags;
 657         void __iomem *pcie_index_offset;
 658         void __iomem *pcie_data_offset;
 659
 660         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 661         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 662         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 663
 664         writel(reg_addr, pcie_index_offset);
 665         readl(pcie_index_offset);
 666         writel(reg_data, pcie_data_offset);
 667         readl(pcie_data_offset);
 668         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 669 }
 670
 671 /**
 672  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 673  *
 674  * @adev: amdgpu_device pointer
 675  * @pcie_index: mmio register offset
 676  * @pcie_data: mmio register offset
 677  * @reg_addr: indirect register offset
 678  * @reg_data: indirect register data
 679  *
 680  */
 681 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 682                                    u32 pcie_index, u32 pcie_data,
 683                                    u32 reg_addr, u64 reg_data)
 684 {
 685         unsigned long flags;
 686         void __iomem *pcie_index_offset;
 687         void __iomem *pcie_data_offset;
 688
 689         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 690         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 691         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 692
 693         /* write low 32 bits */
 694         writel(reg_addr, pcie_index_offset);
 695         readl(pcie_index_offset);
 696         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 697         readl(pcie_data_offset);
 698         /* write high 32 bits */
 699         writel(reg_addr + 4, pcie_index_offset);
 700         readl(pcie_index_offset);
 701         writel((u32)(reg_data >> 32), pcie_data_offset);
 702         readl(pcie_data_offset);
 703         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 704 }
 705
 706 /**
 707  * amdgpu_invalid_rreg - dummy reg read function
 708  *
 709  * @adev: amdgpu device pointer
 710  * @reg: offset of register
 711  *
 712  * Dummy register read function.  Used for register blocks
 713  * that certain asics don't have (all asics).
 714  * Returns the value in the register.
 715  */
 716 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 717 {
 718         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 719         BUG();
 720         return 0;
 721 }
 722
 723 /**
 724  * amdgpu_invalid_wreg - dummy reg write function
 725  *
 726  * @adev: amdgpu device pointer
 727  * @reg: offset of register
 728  * @v: value to write to the register
 729  *
 730  * Dummy register read function.  Used for register blocks
 731  * that certain asics don't have (all asics).
 732  */
 733 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 734 {
 735         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 736                   reg, v);
 737         BUG();
 738 }
 739
 740 /**
 741  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 742  *
 743  * @adev: amdgpu device pointer
 744  * @reg: offset of register
 745  *
 746  * Dummy register read function.  Used for register blocks
 747  * that certain asics don't have (all asics).
 748  * Returns the value in the register.
 749  */
 750 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 751 {
 752         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 753         BUG();
 754         return 0;
 755 }
 756
 757 /**
 758  * amdgpu_invalid_wreg64 - dummy reg write function
 759  *
 760  * @adev: amdgpu device pointer
 761  * @reg: offset of register
 762  * @v: value to write to the register
 763  *
 764  * Dummy register read function.  Used for register blocks
 765  * that certain asics don't have (all asics).
 766  */
 767 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 768 {
 769         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 770                   reg, v);
 771         BUG();
 772 }
 773
 774 /**
 775  * amdgpu_block_invalid_rreg - dummy reg read function
 776  *
 777  * @adev: amdgpu device pointer
 778  * @block: offset of instance
 779  * @reg: offset of register
 780  *
 781  * Dummy register read function.  Used for register blocks
 782  * that certain asics don't have (all asics).
 783  * Returns the value in the register.
 784  */
 785 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 786                                           uint32_t block, uint32_t reg)
 787 {
 788         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 789                   reg, block);
 790         BUG();
 791         return 0;
 792 }
 793
 794 /**
 795  * amdgpu_block_invalid_wreg - dummy reg write function
 796  *
 797  * @adev: amdgpu device pointer
 798  * @block: offset of instance
 799  * @reg: offset of register
 800  * @v: value to write to the register
 801  *
 802  * Dummy register read function.  Used for register blocks
 803  * that certain asics don't have (all asics).
 804  */
 805 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 806                                       uint32_t block,
 807                                       uint32_t reg, uint32_t v)
 808 {
 809         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 810                   reg, block, v);
 811         BUG();
 812 }
 813
 814 /**
 815  * amdgpu_device_asic_init - Wrapper for atom asic_init
 816  *
 817  * @dev: drm_device pointer
 818  *
 819  * Does any asic specific work and then calls atom asic init.
 820  */
 821 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 822 {
 823         amdgpu_asic_pre_asic_init(adev);
 824
 825         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 826 }
 827
 828 /**
 829  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 830  *
 831  * @adev: amdgpu device pointer
 832  *
 833  * Allocates a scratch page of VRAM for use by various things in the
 834  * driver.
 835  */
 836 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 837 {
 838         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 839                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 840                                        &adev->vram_scratch.robj,
 841                                        &adev->vram_scratch.gpu_addr,
 842                                        (void **)&adev->vram_scratch.ptr);
 843 }
 844
 845 /**
 846  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 847  *
 848  * @adev: amdgpu device pointer
 849  *
 850  * Frees the VRAM scratch page.
 851  */
 852 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 853 {
 854         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 855 }
 856
 857 /**
 858  * amdgpu_device_program_register_sequence - program an array of registers.
 859  *
 860  * @adev: amdgpu_device pointer
 861  * @registers: pointer to the register array
 862  * @array_size: size of the register array
 863  *
 864  * Programs an array or registers with and and or masks.
 865  * This is a helper for setting golden registers.
 866  */
 867 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 868                                              const u32 *registers,
 869                                              const u32 array_size)
 870 {
 871         u32 tmp, reg, and_mask, or_mask;
 872         int i;
 873
 874         if (array_size % 3)
 875                 return;
 876
 877         for (i = 0; i < array_size; i +=3) {
 878                 reg = registers[i + 0];
 879                 and_mask = registers[i + 1];
 880                 or_mask = registers[i + 2];
 881
 882                 if (and_mask == 0xffffffff) {
 883                         tmp = or_mask;
 884                 } else {
 885                         tmp = RREG32(reg);
 886                         tmp &= ~and_mask;
 887                         if (adev->family >= AMDGPU_FAMILY_AI)
 888                                 tmp |= (or_mask & and_mask);
 889                         else
 890                                 tmp |= or_mask;
 891                 }
 892                 WREG32(reg, tmp);
 893         }
 894 }
 895
 896 /**
 897  * amdgpu_device_pci_config_reset - reset the GPU
 898  *
 899  * @adev: amdgpu_device pointer
 900  *
 901  * Resets the GPU using the pci config reset sequence.
 902  * Only applicable to asics prior to vega10.
 903  */
 904 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 905 {
 906         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 907 }
 908
  /*
   * GPU doorbell aperture helper functions.
   */
 912 /**
 913  * amdgpu_device_doorbell_init - Init doorbell driver information.
 914  *
 915  * @adev: amdgpu_device pointer
 916  *
 917  * Init doorbell driver information (CIK)
 918  * Returns 0 on success, error on failure.
 919  */
 920 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 921 {
 922
 923         /* No doorbell on SI hardware generation */
 924         if (adev->asic_type < CHIP_BONAIRE) {
 925                 adev->doorbell.base = 0;
 926                 adev->doorbell.size = 0;
 927                 adev->doorbell.num_doorbells = 0;
 928                 adev->doorbell.ptr = NULL;
 929                 return 0;
 930         }
 931
 932         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 933                 return -EINVAL;
 934
 935         amdgpu_asic_init_doorbell_index(adev);
 936
 937         /* doorbell bar mapping */
 938         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 939         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 940
 941         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 942                                              adev->doorbell_index.max_assignment+1);
 943         if (adev->doorbell.num_doorbells == 0)
 944                 return -EINVAL;
 945
 946         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 947          * paging queue doorbell use the second page. The
 948          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 949          * doorbells are in the first page. So with paging queue enabled,
 950          * the max num_doorbells should + 1 page (0x400 in dword)
 951          */
 952         if (adev->asic_type >= CHIP_VEGA10)
 953                 adev->doorbell.num_doorbells += 0x400;
 954
 955         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 956                                      adev->doorbell.num_doorbells *
 957                                      sizeof(u32));
 958         if (adev->doorbell.ptr == NULL)
 959                 return -ENOMEM;
 960
 961         return 0;
 962 }
 963
 964 /**
 965  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 966  *
 967  * @adev: amdgpu_device pointer
 968  *
 969  * Tear down doorbell driver information (CIK)
 970  */
 971 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 972 {
 973         iounmap(adev->doorbell.ptr);
 974         adev->doorbell.ptr = NULL;
 975 }
 976
 977
 978
 979 /*
 980  * amdgpu_device_wb_*()
 981  * Writeback is the method by which the GPU updates special pages in memory
 982  * with the status of certain GPU events (fences, ring pointers,etc.).
 983  */
 984
 985 /**
 986  * amdgpu_device_wb_fini - Disable Writeback and free memory
 987  *
 988  * @adev: amdgpu_device pointer
 989  *
 990  * Disables Writeback and frees the Writeback memory (all asics).
 991  * Used at driver shutdown.
 992  */
 993 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 994 {
 995         if (adev->wb.wb_obj) {
 996                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
 997                                       &adev->wb.gpu_addr,
 998                                       (void **)&adev->wb.wb);
 999                 adev->wb.wb_obj = NULL;
1000         }
1001 }
1002
1003 /**
1004  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1005  *
1006  * @adev: amdgpu_device pointer
1007  *
1008  * Initializes writeback and allocates writeback memory (all asics).
1009  * Used at driver startup.
1010  * Returns 0 on success or an -error on failure.
1011  */
1012 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1013 {
1014         int r;
1015
1016         if (adev->wb.wb_obj == NULL) {
1017                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1018                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1019                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1020                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1021                                             (void **)&adev->wb.wb);
1022                 if (r) {
1023                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1024                         return r;
1025                 }
1026
1027                 adev->wb.num_wb = AMDGPU_MAX_WB;
1028                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1029
1030                 /* clear wb memory */
1031                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1032         }
1033
1034         return 0;
1035 }
1036
1037 /**
1038  * amdgpu_device_wb_get - Allocate a wb entry
1039  *
1040  * @adev: amdgpu_device pointer
1041  * @wb: wb index
1042  *
1043  * Allocate a wb slot for use by the driver (all asics).
1044  * Returns 0 on success or -EINVAL on failure.
1045  */
1046 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1047 {
1048         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1049
1050         if (offset < adev->wb.num_wb) {
1051                 __set_bit(offset, adev->wb.used);
1052                 *wb = offset << 3; /* convert to dw offset */
1053                 return 0;
1054         } else {
1055                 return -EINVAL;
1056         }
1057 }
1058
1059 /**
1060  * amdgpu_device_wb_free - Free a wb entry
1061  *
1062  * @adev: amdgpu_device pointer
1063  * @wb: wb index
1064  *
1065  * Free a wb slot allocated for use by the driver (all asics)
1066  */
1067 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1068 {
1069         wb >>= 3;
1070         if (wb < adev->wb.num_wb)
1071                 __clear_bit(wb, adev->wb.used);
1072 }
1073
1074 /**
1075  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1076  *
1077  * @adev: amdgpu_device pointer
1078  *
1079  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1080  * to fail, but if any of the BARs is not accessible after the size we abort
1081  * driver loading by returning -ENODEV.
1082  */
1083 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1084 {
1085         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1086         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1087         struct pci_bus *root;
1088         struct resource *res;
1089         unsigned i;
1090         u16 cmd;
1091         int r;
1092
1093         /* Bypass for VF */
1094         if (amdgpu_sriov_vf(adev))
1095                 return 0;
1096
1097         /* skip if the bios has already enabled large BAR */
1098         if (adev->gmc.real_vram_size &&
1099             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1100                 return 0;
1101
1102         /* Check if the root BUS has 64bit memory resources */
1103         root = adev->pdev->bus;
1104         while (root->parent)
1105                 root = root->parent;
1106
1107         pci_bus_for_each_resource(root, res, i) {
1108                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1109                     res->start > 0x100000000ull)
1110                         break;
1111         }
1112
1113         /* Trying to resize is pointless without a root hub window above 4GB */
1114         if (!res)
1115                 return 0;
1116
1117         /* Disable memory decoding while we change the BAR addresses and size */
1118         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1119         pci_write_config_word(adev->pdev, PCI_COMMAND,
1120                               cmd & ~PCI_COMMAND_MEMORY);
1121
1122         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1123         amdgpu_device_doorbell_fini(adev);
1124         if (adev->asic_type >= CHIP_BONAIRE)
1125                 pci_release_resource(adev->pdev, 2);
1126
1127         pci_release_resource(adev->pdev, 0);
1128
1129         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1130         if (r == -ENOSPC)
1131                 DRM_INFO("Not enough PCI address space for a large BAR.");
1132         else if (r && r != -ENOTSUPP)
1133                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1134
1135         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1136
1137         /* When the doorbell or fb BAR isn't available we have no chance of
1138          * using the device.
1139          */
1140         r = amdgpu_device_doorbell_init(adev);
1141         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1142                 return -ENODEV;
1143
1144         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1145
1146         return 0;
1147 }
1148
1149 /*
1150  * GPU helper functions.
1151  */
1152 /**
1153  * amdgpu_device_need_post - check if the hw need post or not
1154  *
1155  * @adev: amdgpu_device pointer
1156  *
1157  * Check if the asic has been initialized (all asics) at driver startup
1158  * or post is needed if  hw reset is performed.
1159  * Returns true if need or false if not.
1160  */
1161 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1162 {
1163         uint32_t reg;
1164
1165         if (amdgpu_sriov_vf(adev))
1166                 return false;
1167
1168         if (amdgpu_passthrough(adev)) {
1169                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1170                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1171                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1172                  * vpost executed for smc version below 22.15
1173                  */
1174                 if (adev->asic_type == CHIP_FIJI) {
1175                         int err;
1176                         uint32_t fw_ver;
1177                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1178                         /* force vPost if error occured */
1179                         if (err)
1180                                 return true;
1181
1182                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1183                         if (fw_ver < 0x00160e00)
1184                                 return true;
1185                 }
1186         }
1187
1188         if (adev->has_hw_reset) {
1189                 adev->has_hw_reset = false;
1190                 return true;
1191         }
1192
1193         /* bios scratch used on CIK+ */
1194         if (adev->asic_type >= CHIP_BONAIRE)
1195                 return amdgpu_atombios_scratch_need_asic_init(adev);
1196
1197         /* check MEM_SIZE for older asics */
1198         reg = amdgpu_asic_get_config_memsize(adev);
1199
1200         if ((reg != 0) && (reg != 0xffffffff))
1201                 return false;
1202
1203         return true;
1204 }
1205
1206 /* if we get transitioned to only one device, take VGA back */
1207 /**
1208  * amdgpu_device_vga_set_decode - enable/disable vga decode
1209  *
1210  * @cookie: amdgpu_device pointer
1211  * @state: enable/disable vga decode
1212  *
1213  * Enable/disable vga decode (all asics).
1214  * Returns VGA resource flags.
1215  */
1216 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1217 {
1218         struct amdgpu_device *adev = cookie;
1219         amdgpu_asic_set_vga_state(adev, state);
1220         if (state)
1221                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1222                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1223         else
1224                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1225 }
1226
1227 /**
1228  * amdgpu_device_check_block_size - validate the vm block size
1229  *
1230  * @adev: amdgpu_device pointer
1231  *
1232  * Validates the vm block size specified via module parameter.
1233  * The vm block size defines number of bits in page table versus page directory,
1234  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1235  * page table and the remaining bits are in the page directory.
1236  */
1237 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1238 {
1239         /* defines number of bits in page table versus page directory,
1240          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1241          * page table and the remaining bits are in the page directory */
1242         if (amdgpu_vm_block_size == -1)
1243                 return;
1244
1245         if (amdgpu_vm_block_size < 9) {
1246                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1247                          amdgpu_vm_block_size);
1248                 amdgpu_vm_block_size = -1;
1249         }
1250 }
1251
1252 /**
1253  * amdgpu_device_check_vm_size - validate the vm size
1254  *
1255  * @adev: amdgpu_device pointer
1256  *
1257  * Validates the vm size in GB specified via module parameter.
1258  * The VM size is the size of the GPU virtual memory space in GB.
1259  */
1260 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1261 {
1262         /* no need to check the default value */
1263         if (amdgpu_vm_size == -1)
1264                 return;
1265
1266         if (amdgpu_vm_size < 1) {
1267                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1268                          amdgpu_vm_size);
1269                 amdgpu_vm_size = -1;
1270         }
1271 }
1272
1273 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1274 {
1275         struct sysinfo si;
1276         bool is_os_64 = (sizeof(void *) == 8);
1277         uint64_t total_memory;
1278         uint64_t dram_size_seven_GB = 0x1B8000000;
1279         uint64_t dram_size_three_GB = 0xB8000000;
1280
1281         if (amdgpu_smu_memory_pool_size == 0)
1282                 return;
1283
1284         if (!is_os_64) {
1285                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1286                 goto def_value;
1287         }
1288         si_meminfo(&si);
1289         total_memory = (uint64_t)si.totalram * si.mem_unit;
1290
1291         if ((amdgpu_smu_memory_pool_size == 1) ||
1292                 (amdgpu_smu_memory_pool_size == 2)) {
1293                 if (total_memory < dram_size_three_GB)
1294                         goto def_value1;
1295         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1296                 (amdgpu_smu_memory_pool_size == 8)) {
1297                 if (total_memory < dram_size_seven_GB)
1298                         goto def_value1;
1299         } else {
1300                 DRM_WARN("Smu memory pool size not supported\n");
1301                 goto def_value;
1302         }
1303         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1304
1305         return;
1306
1307 def_value1:
1308         DRM_WARN("No enough system memory\n");
1309 def_value:
1310         adev->pm.smu_prv_buffer_size = 0;
1311 }
1312
1313 /**
1314  * amdgpu_device_check_arguments - validate module params
1315  *
1316  * @adev: amdgpu_device pointer
1317  *
1318  * Validates certain module parameters and updates
1319  * the associated values used by the driver (all asics).
1320  */
1321 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1322 {
1323         if (amdgpu_sched_jobs < 4) {
1324                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1325                          amdgpu_sched_jobs);
1326                 amdgpu_sched_jobs = 4;
1327         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1328                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1329                          amdgpu_sched_jobs);
1330                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1331         }
1332
1333         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1334                 /* gart size must be greater or equal to 32M */
1335                 dev_warn(adev->dev, "gart size (%d) too small\n",
1336                          amdgpu_gart_size);
1337                 amdgpu_gart_size = -1;
1338         }
1339
1340         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1341                 /* gtt size must be greater or equal to 32M */
1342                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1343                                  amdgpu_gtt_size);
1344                 amdgpu_gtt_size = -1;
1345         }
1346
1347         /* valid range is between 4 and 9 inclusive */
1348         if (amdgpu_vm_fragment_size != -1 &&
1349             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1350                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1351                 amdgpu_vm_fragment_size = -1;
1352         }
1353
1354         if (amdgpu_sched_hw_submission < 2) {
1355                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1356                          amdgpu_sched_hw_submission);
1357                 amdgpu_sched_hw_submission = 2;
1358         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1359                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1360                          amdgpu_sched_hw_submission);
1361                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1362         }
1363
1364         amdgpu_device_check_smu_prv_buffer_size(adev);
1365
1366         amdgpu_device_check_vm_size(adev);
1367
1368         amdgpu_device_check_block_size(adev);
1369
1370         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1371
1372         amdgpu_gmc_tmz_set(adev);
1373
1374         if (amdgpu_num_kcq == -1) {
1375                 amdgpu_num_kcq = 8;
1376         } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1377                 amdgpu_num_kcq = 8;
1378                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1379         }
1380
1381         amdgpu_gmc_noretry_set(adev);
1382
1383         return 0;
1384 }
1385
1386 /**
1387  * amdgpu_switcheroo_set_state - set switcheroo state
1388  *
1389  * @pdev: pci dev pointer
1390  * @state: vga_switcheroo state
1391  *
1392  * Callback for the switcheroo driver.  Suspends or resumes the
1393  * the asics before or after it is powered up using ACPI methods.
1394  */
1395 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1396                                         enum vga_switcheroo_state state)
1397 {
1398         struct drm_device *dev = pci_get_drvdata(pdev);
1399         int r;
1400
1401         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1402                 return;
1403
1404         if (state == VGA_SWITCHEROO_ON) {
1405                 pr_info("switched on\n");
1406                 /* don't suspend or resume card normally */
1407                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1408
1409                 pci_set_power_state(dev->pdev, PCI_D0);
1410                 amdgpu_device_load_pci_state(dev->pdev);
1411                 r = pci_enable_device(dev->pdev);
1412                 if (r)
1413                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1414                 amdgpu_device_resume(dev, true);
1415
1416                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1417                 drm_kms_helper_poll_enable(dev);
1418         } else {
1419                 pr_info("switched off\n");
1420                 drm_kms_helper_poll_disable(dev);
1421                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1422                 amdgpu_device_suspend(dev, true);
1423                 amdgpu_device_cache_pci_state(dev->pdev);
1424                 /* Shut down the device */
1425                 pci_disable_device(dev->pdev);
1426                 pci_set_power_state(dev->pdev, PCI_D3cold);
1427                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1428         }
1429 }
1430
1431 /**
1432  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1433  *
1434  * @pdev: pci dev pointer
1435  *
1436  * Callback for the switcheroo driver.  Check of the switcheroo
1437  * state can be changed.
1438  * Returns true if the state can be changed, false if not.
1439  */
1440 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1441 {
1442         struct drm_device *dev = pci_get_drvdata(pdev);
1443
1444         /*
1445         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1446         * locking inversion with the driver load path. And the access here is
1447         * completely racy anyway. So don't bother with locking for now.
1448         */
1449         return atomic_read(&dev->open_count) == 0;
1450 }
1451
1452 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1453         .set_gpu_state = amdgpu_switcheroo_set_state,
1454         .reprobe = NULL,
1455         .can_switch = amdgpu_switcheroo_can_switch,
1456 };
1457
1458 /**
1459  * amdgpu_device_ip_set_clockgating_state - set the CG state
1460  *
1461  * @dev: amdgpu_device pointer
1462  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1463  * @state: clockgating state (gate or ungate)
1464  *
1465  * Sets the requested clockgating state for all instances of
1466  * the hardware IP specified.
1467  * Returns the error code from the last instance.
1468  */
1469 int amdgpu_device_ip_set_clockgating_state(void *dev,
1470                                            enum amd_ip_block_type block_type,
1471                                            enum amd_clockgating_state state)
1472 {
1473         struct amdgpu_device *adev = dev;
1474         int i, r = 0;
1475
1476         for (i = 0; i < adev->num_ip_blocks; i++) {
1477                 if (!adev->ip_blocks[i].status.valid)
1478                         continue;
1479                 if (adev->ip_blocks[i].version->type != block_type)
1480                         continue;
1481                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1482                         continue;
1483                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1484                         (void *)adev, state);
1485                 if (r)
1486                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1487                                   adev->ip_blocks[i].version->funcs->name, r);
1488         }
1489         return r;
1490 }
1491
1492 /**
1493  * amdgpu_device_ip_set_powergating_state - set the PG state
1494  *
1495  * @dev: amdgpu_device pointer
1496  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1497  * @state: powergating state (gate or ungate)
1498  *
1499  * Sets the requested powergating state for all instances of
1500  * the hardware IP specified.
1501  * Returns the error code from the last instance.
1502  */
1503 int amdgpu_device_ip_set_powergating_state(void *dev,
1504                                            enum amd_ip_block_type block_type,
1505                                            enum amd_powergating_state state)
1506 {
1507         struct amdgpu_device *adev = dev;
1508         int i, r = 0;
1509
1510         for (i = 0; i < adev->num_ip_blocks; i++) {
1511                 if (!adev->ip_blocks[i].status.valid)
1512                         continue;
1513                 if (adev->ip_blocks[i].version->type != block_type)
1514                         continue;
1515                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1516                         continue;
1517                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1518                         (void *)adev, state);
1519                 if (r)
1520                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1521                                   adev->ip_blocks[i].version->funcs->name, r);
1522         }
1523         return r;
1524 }
1525
1526 /**
1527  * amdgpu_device_ip_get_clockgating_state - get the CG state
1528  *
1529  * @adev: amdgpu_device pointer
1530  * @flags: clockgating feature flags
1531  *
1532  * Walks the list of IPs on the device and updates the clockgating
1533  * flags for each IP.
1534  * Updates @flags with the feature flags for each hardware IP where
1535  * clockgating is enabled.
1536  */
1537 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1538                                             u32 *flags)
1539 {
1540         int i;
1541
1542         for (i = 0; i < adev->num_ip_blocks; i++) {
1543                 if (!adev->ip_blocks[i].status.valid)
1544                         continue;
1545                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1546                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1547         }
1548 }
1549
1550 /**
1551  * amdgpu_device_ip_wait_for_idle - wait for idle
1552  *
1553  * @adev: amdgpu_device pointer
1554  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1555  *
1556  * Waits for the request hardware IP to be idle.
1557  * Returns 0 for success or a negative error code on failure.
1558  */
1559 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1560                                    enum amd_ip_block_type block_type)
1561 {
1562         int i, r;
1563
1564         for (i = 0; i < adev->num_ip_blocks; i++) {
1565                 if (!adev->ip_blocks[i].status.valid)
1566                         continue;
1567                 if (adev->ip_blocks[i].version->type == block_type) {
1568                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1569                         if (r)
1570                                 return r;
1571                         break;
1572                 }
1573         }
1574         return 0;
1575
1576 }
1577
1578 /**
1579  * amdgpu_device_ip_is_idle - is the hardware IP idle
1580  *
1581  * @adev: amdgpu_device pointer
1582  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1583  *
1584  * Check if the hardware IP is idle or not.
1585  * Returns true if it the IP is idle, false if not.
1586  */
1587 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1588                               enum amd_ip_block_type block_type)
1589 {
1590         int i;
1591
1592         for (i = 0; i < adev->num_ip_blocks; i++) {
1593                 if (!adev->ip_blocks[i].status.valid)
1594                         continue;
1595                 if (adev->ip_blocks[i].version->type == block_type)
1596                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1597         }
1598         return true;
1599
1600 }
1601
1602 /**
1603  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1604  *
1605  * @adev: amdgpu_device pointer
1606  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1607  *
1608  * Returns a pointer to the hardware IP block structure
1609  * if it exists for the asic, otherwise NULL.
1610  */
1611 struct amdgpu_ip_block *
1612 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1613                               enum amd_ip_block_type type)
1614 {
1615         int i;
1616
1617         for (i = 0; i < adev->num_ip_blocks; i++)
1618                 if (adev->ip_blocks[i].version->type == type)
1619                         return &adev->ip_blocks[i];
1620
1621         return NULL;
1622 }
1623
1624 /**
1625  * amdgpu_device_ip_block_version_cmp
1626  *
1627  * @adev: amdgpu_device pointer
1628  * @type: enum amd_ip_block_type
1629  * @major: major version
1630  * @minor: minor version
1631  *
1632  * return 0 if equal or greater
1633  * return 1 if smaller or the ip_block doesn't exist
1634  */
1635 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1636                                        enum amd_ip_block_type type,
1637                                        u32 major, u32 minor)
1638 {
1639         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1640
1641         if (ip_block && ((ip_block->version->major > major) ||
1642                         ((ip_block->version->major == major) &&
1643                         (ip_block->version->minor >= minor))))
1644                 return 0;
1645
1646         return 1;
1647 }
1648
1649 /**
1650  * amdgpu_device_ip_block_add
1651  *
1652  * @adev: amdgpu_device pointer
1653  * @ip_block_version: pointer to the IP to add
1654  *
1655  * Adds the IP block driver information to the collection of IPs
1656  * on the asic.
1657  */
1658 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1659                                const struct amdgpu_ip_block_version *ip_block_version)
1660 {
1661         if (!ip_block_version)
1662                 return -EINVAL;
1663
1664         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1665                   ip_block_version->funcs->name);
1666
1667         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1668
1669         return 0;
1670 }
1671
1672 /**
1673  * amdgpu_device_enable_virtual_display - enable virtual display feature
1674  *
1675  * @adev: amdgpu_device pointer
1676  *
1677  * Enabled the virtual display feature if the user has enabled it via
1678  * the module parameter virtual_display.  This feature provides a virtual
1679  * display hardware on headless boards or in virtualized environments.
1680  * This function parses and validates the configuration string specified by
1681  * the user and configues the virtual display configuration (number of
1682  * virtual connectors, crtcs, etc.) specified.
1683  */
1684 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1685 {
1686         adev->enable_virtual_display = false;
1687
1688         if (amdgpu_virtual_display) {
1689                 struct drm_device *ddev = adev_to_drm(adev);
1690                 const char *pci_address_name = pci_name(ddev->pdev);
1691                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1692
1693                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1694                 pciaddstr_tmp = pciaddstr;
1695                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1696                         pciaddname = strsep(&pciaddname_tmp, ",");
1697                         if (!strcmp("all", pciaddname)
1698                             || !strcmp(pci_address_name, pciaddname)) {
1699                                 long num_crtc;
1700                                 int res = -1;
1701
1702                                 adev->enable_virtual_display = true;
1703
1704                                 if (pciaddname_tmp)
1705                                         res = kstrtol(pciaddname_tmp, 10,
1706                                                       &num_crtc);
1707
1708                                 if (!res) {
1709                                         if (num_crtc < 1)
1710                                                 num_crtc = 1;
1711                                         if (num_crtc > 6)
1712                                                 num_crtc = 6;
1713                                         adev->mode_info.num_crtc = num_crtc;
1714                                 } else {
1715                                         adev->mode_info.num_crtc = 1;
1716                                 }
1717                                 break;
1718                         }
1719                 }
1720
1721                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1722                          amdgpu_virtual_display, pci_address_name,
1723                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1724
1725                 kfree(pciaddstr);
1726         }
1727 }
1728
1729 /**
1730  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1731  *
1732  * @adev: amdgpu_device pointer
1733  *
1734  * Parses the asic configuration parameters specified in the gpu info
1735  * firmware and makes them available to the driver for use in configuring
1736  * the asic.
1737  * Returns 0 on success, -EINVAL on failure.
1738  */
1739 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1740 {
1741         const char *chip_name;
1742         char fw_name[40];
1743         int err;
1744         const struct gpu_info_firmware_header_v1_0 *hdr;
1745
1746         adev->firmware.gpu_info_fw = NULL;
1747
1748         if (adev->mman.discovery_bin) {
1749                 amdgpu_discovery_get_gfx_info(adev);
1750
1751                 /*
1752                  * FIXME: The bounding box is still needed by Navi12, so
1753                  * temporarily read it from gpu_info firmware. Should be droped
1754                  * when DAL no longer needs it.
1755                  */
1756                 if (adev->asic_type != CHIP_NAVI12)
1757                         return 0;
1758         }
1759
1760         switch (adev->asic_type) {
1761 #ifdef CONFIG_DRM_AMDGPU_SI
1762         case CHIP_VERDE:
1763         case CHIP_TAHITI:
1764         case CHIP_PITCAIRN:
1765         case CHIP_OLAND:
1766         case CHIP_HAINAN:
1767 #endif
1768 #ifdef CONFIG_DRM_AMDGPU_CIK
1769         case CHIP_BONAIRE:
1770         case CHIP_HAWAII:
1771         case CHIP_KAVERI:
1772         case CHIP_KABINI:
1773         case CHIP_MULLINS:
1774 #endif
1775         case CHIP_TOPAZ:
1776         case CHIP_TONGA:
1777         case CHIP_FIJI:
1778         case CHIP_POLARIS10:
1779         case CHIP_POLARIS11:
1780         case CHIP_POLARIS12:
1781         case CHIP_VEGAM:
1782         case CHIP_CARRIZO:
1783         case CHIP_STONEY:
1784         case CHIP_VEGA20:
1785         case CHIP_SIENNA_CICHLID:
1786         case CHIP_NAVY_FLOUNDER:
1787         default:
1788                 return 0;
1789         case CHIP_VEGA10:
1790                 chip_name = "vega10";
1791                 break;
1792         case CHIP_VEGA12:
1793                 chip_name = "vega12";
1794                 break;
1795         case CHIP_RAVEN:
1796                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1797                         chip_name = "raven2";
1798                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1799                         chip_name = "picasso";
1800                 else
1801                         chip_name = "raven";
1802                 break;
1803         case CHIP_ARCTURUS:
1804                 chip_name = "arcturus";
1805                 break;
1806         case CHIP_RENOIR:
1807                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1808                         chip_name = "renoir";
1809                 else
1810                         chip_name = "green_sardine";
1811                 break;
1812         case CHIP_NAVI10:
1813                 chip_name = "navi10";
1814                 break;
1815         case CHIP_NAVI14:
1816                 chip_name = "navi14";
1817                 break;
1818         case CHIP_NAVI12:
1819                 chip_name = "navi12";
1820                 break;
1821         }
1822
1823         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1824         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1825         if (err) {
1826                 dev_err(adev->dev,
1827                         "Failed to load gpu_info firmware \"%s\"\n",
1828                         fw_name);
1829                 goto out;
1830         }
1831         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1832         if (err) {
1833                 dev_err(adev->dev,
1834                         "Failed to validate gpu_info firmware \"%s\"\n",
1835                         fw_name);
1836                 goto out;
1837         }
1838
1839         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1840         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1841
1842         switch (hdr->version_major) {
1843         case 1:
1844         {
1845                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1846                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1847                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1848
1849                 /*
1850                  * Should be droped when DAL no longer needs it.
1851                  */
1852                 if (adev->asic_type == CHIP_NAVI12)
1853                         goto parse_soc_bounding_box;
1854
1855                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1856                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1857                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1858                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1859                 adev->gfx.config.max_texture_channel_caches =
1860                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1861                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1862                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1863                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1864                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1865                 adev->gfx.config.double_offchip_lds_buf =
1866                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1867                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1868                 adev->gfx.cu_info.max_waves_per_simd =
1869                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1870                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1871                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1872                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1873                 if (hdr->version_minor >= 1) {
1874                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1875                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1876                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1877                         adev->gfx.config.num_sc_per_sh =
1878                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1879                         adev->gfx.config.num_packer_per_sc =
1880                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1881                 }
1882
1883 parse_soc_bounding_box:
1884                 /*
1885                  * soc bounding box info is not integrated in disocovery table,
1886                  * we always need to parse it from gpu info firmware if needed.
1887                  */
1888                 if (hdr->version_minor == 2) {
1889                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1890                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1891                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1892                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1893                 }
1894                 break;
1895         }
1896         default:
1897                 dev_err(adev->dev,
1898                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1899                 err = -EINVAL;
1900                 goto out;
1901         }
1902 out:
1903         return err;
1904 }
1905
1906 /**
1907  * amdgpu_device_ip_early_init - run early init for hardware IPs
1908  *
1909  * @adev: amdgpu_device pointer
1910  *
1911  * Early initialization pass for hardware IPs.  The hardware IPs that make
1912  * up each asic are discovered each IP's early_init callback is run.  This
1913  * is the first stage in initializing the asic.
1914  * Returns 0 on success, negative error code on failure.
1915  */
1916 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1917 {
1918         int i, r;
1919
1920         amdgpu_device_enable_virtual_display(adev);
1921
1922         if (amdgpu_sriov_vf(adev)) {
1923                 r = amdgpu_virt_request_full_gpu(adev, true);
1924                 if (r)
1925                         return r;
1926         }
1927
1928         switch (adev->asic_type) {
1929 #ifdef CONFIG_DRM_AMDGPU_SI
1930         case CHIP_VERDE:
1931         case CHIP_TAHITI:
1932         case CHIP_PITCAIRN:
1933         case CHIP_OLAND:
1934         case CHIP_HAINAN:
1935                 adev->family = AMDGPU_FAMILY_SI;
1936                 r = si_set_ip_blocks(adev);
1937                 if (r)
1938                         return r;
1939                 break;
1940 #endif
1941 #ifdef CONFIG_DRM_AMDGPU_CIK
1942         case CHIP_BONAIRE:
1943         case CHIP_HAWAII:
1944         case CHIP_KAVERI:
1945         case CHIP_KABINI:
1946         case CHIP_MULLINS:
1947                 if (adev->flags & AMD_IS_APU)
1948                         adev->family = AMDGPU_FAMILY_KV;
1949                 else
1950                         adev->family = AMDGPU_FAMILY_CI;
1951
1952                 r = cik_set_ip_blocks(adev);
1953                 if (r)
1954                         return r;
1955                 break;
1956 #endif
1957         case CHIP_TOPAZ:
1958         case CHIP_TONGA:
1959         case CHIP_FIJI:
1960         case CHIP_POLARIS10:
1961         case CHIP_POLARIS11:
1962         case CHIP_POLARIS12:
1963         case CHIP_VEGAM:
1964         case CHIP_CARRIZO:
1965         case CHIP_STONEY:
1966                 if (adev->flags & AMD_IS_APU)
1967                         adev->family = AMDGPU_FAMILY_CZ;
1968                 else
1969                         adev->family = AMDGPU_FAMILY_VI;
1970
1971                 r = vi_set_ip_blocks(adev);
1972                 if (r)
1973                         return r;
1974                 break;
1975         case CHIP_VEGA10:
1976         case CHIP_VEGA12:
1977         case CHIP_VEGA20:
1978         case CHIP_RAVEN:
1979         case CHIP_ARCTURUS:
1980         case CHIP_RENOIR:
1981                 if (adev->flags & AMD_IS_APU)
1982                         adev->family = AMDGPU_FAMILY_RV;
1983                 else
1984                         adev->family = AMDGPU_FAMILY_AI;
1985
1986                 r = soc15_set_ip_blocks(adev);
1987                 if (r)
1988                         return r;
1989                 break;
1990         case  CHIP_NAVI10:
1991         case  CHIP_NAVI14:
1992         case  CHIP_NAVI12:
1993         case  CHIP_SIENNA_CICHLID:
1994         case  CHIP_NAVY_FLOUNDER:
1995                 adev->family = AMDGPU_FAMILY_NV;
1996
1997                 r = nv_set_ip_blocks(adev);
1998                 if (r)
1999                         return r;
2000                 break;
2001         default:
2002                 /* FIXME: not supported yet */
2003                 return -EINVAL;
2004         }
2005
2006         amdgpu_amdkfd_device_probe(adev);
2007
2008         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2009         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2010                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2011
2012         for (i = 0; i < adev->num_ip_blocks; i++) {
2013                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2014                         DRM_ERROR("disabled ip block: %d <%s>\n",
2015                                   i, adev->ip_blocks[i].version->funcs->name);
2016                         adev->ip_blocks[i].status.valid = false;
2017                 } else {
2018                         if (adev->ip_blocks[i].version->funcs->early_init) {
2019                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2020                                 if (r == -ENOENT) {
2021                                         adev->ip_blocks[i].status.valid = false;
2022                                 } else if (r) {
2023                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2024                                                   adev->ip_blocks[i].version->funcs->name, r);
2025                                         return r;
2026                                 } else {
2027                                         adev->ip_blocks[i].status.valid = true;
2028                                 }
2029                         } else {
2030                                 adev->ip_blocks[i].status.valid = true;
2031                         }
2032                 }
2033                 /* get the vbios after the asic_funcs are set up */
2034                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2035                         r = amdgpu_device_parse_gpu_info_fw(adev);
2036                         if (r)
2037                                 return r;
2038
2039                         /* Read BIOS */
2040                         if (!amdgpu_get_bios(adev))
2041                                 return -EINVAL;
2042
2043                         r = amdgpu_atombios_init(adev);
2044                         if (r) {
2045                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2046                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2047                                 return r;
2048                         }
2049                 }
2050         }
2051
2052         adev->cg_flags &= amdgpu_cg_mask;
2053         adev->pg_flags &= amdgpu_pg_mask;
2054
2055         return 0;
2056 }
2057
2058 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2059 {
2060         int i, r;
2061
2062         for (i = 0; i < adev->num_ip_blocks; i++) {
2063                 if (!adev->ip_blocks[i].status.sw)
2064                         continue;
2065                 if (adev->ip_blocks[i].status.hw)
2066                         continue;
2067                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2068                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2069                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2070                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2071                         if (r) {
2072                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2073                                           adev->ip_blocks[i].version->funcs->name, r);
2074                                 return r;
2075                         }
2076                         adev->ip_blocks[i].status.hw = true;
2077                 }
2078         }
2079
2080         return 0;
2081 }
2082
2083 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2084 {
2085         int i, r;
2086
2087         for (i = 0; i < adev->num_ip_blocks; i++) {
2088                 if (!adev->ip_blocks[i].status.sw)
2089                         continue;
2090                 if (adev->ip_blocks[i].status.hw)
2091                         continue;
2092                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2093                 if (r) {
2094                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2095                                   adev->ip_blocks[i].version->funcs->name, r);
2096                         return r;
2097                 }
2098                 adev->ip_blocks[i].status.hw = true;
2099         }
2100
2101         return 0;
2102 }
2103
2104 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2105 {
2106         int r = 0;
2107         int i;
2108         uint32_t smu_version;
2109
2110         if (adev->asic_type >= CHIP_VEGA10) {
2111                 for (i = 0; i < adev->num_ip_blocks; i++) {
2112                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2113                                 continue;
2114
2115                         /* no need to do the fw loading again if already done*/
2116                         if (adev->ip_blocks[i].status.hw == true)
2117                                 break;
2118
2119                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2120                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2121                                 if (r) {
2122                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2123                                                           adev->ip_blocks[i].version->funcs->name, r);
2124                                         return r;
2125                                 }
2126                         } else {
2127                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2128                                 if (r) {
2129                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2130                                                           adev->ip_blocks[i].version->funcs->name, r);
2131                                         return r;
2132                                 }
2133                         }
2134
2135                         adev->ip_blocks[i].status.hw = true;
2136                         break;
2137                 }
2138         }
2139
2140         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2141                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2142
2143         return r;
2144 }
2145
2146 /**
2147  * amdgpu_device_ip_init - run init for hardware IPs
2148  *
2149  * @adev: amdgpu_device pointer
2150  *
2151  * Main initialization pass for hardware IPs.  The list of all the hardware
2152  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2153  * are run.  sw_init initializes the software state associated with each IP
2154  * and hw_init initializes the hardware associated with each IP.
2155  * Returns 0 on success, negative error code on failure.
2156  */
2157 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2158 {
2159         int i, r;
2160
2161         r = amdgpu_ras_init(adev);
2162         if (r)
2163                 return r;
2164
2165         for (i = 0; i < adev->num_ip_blocks; i++) {
2166                 if (!adev->ip_blocks[i].status.valid)
2167                         continue;
2168                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2169                 if (r) {
2170                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2171                                   adev->ip_blocks[i].version->funcs->name, r);
2172                         goto init_failed;
2173                 }
2174                 adev->ip_blocks[i].status.sw = true;
2175
2176                 /* need to do gmc hw init early so we can allocate gpu mem */
2177                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2178                         r = amdgpu_device_vram_scratch_init(adev);
2179                         if (r) {
2180                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2181                                 goto init_failed;
2182                         }
2183                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2184                         if (r) {
2185                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2186                                 goto init_failed;
2187                         }
2188                         r = amdgpu_device_wb_init(adev);
2189                         if (r) {
2190                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2191                                 goto init_failed;
2192                         }
2193                         adev->ip_blocks[i].status.hw = true;
2194
2195                         /* right after GMC hw init, we create CSA */
2196                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2197                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2198                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2199                                                                 AMDGPU_CSA_SIZE);
2200                                 if (r) {
2201                                         DRM_ERROR("allocate CSA failed %d\n", r);
2202                                         goto init_failed;
2203                                 }
2204                         }
2205                 }
2206         }
2207
2208         if (amdgpu_sriov_vf(adev))
2209                 amdgpu_virt_init_data_exchange(adev);
2210
2211         r = amdgpu_ib_pool_init(adev);
2212         if (r) {
2213                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2214                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2215                 goto init_failed;
2216         }
2217
2218         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2219         if (r)
2220                 goto init_failed;
2221
2222         r = amdgpu_device_ip_hw_init_phase1(adev);
2223         if (r)
2224                 goto init_failed;
2225
2226         r = amdgpu_device_fw_loading(adev);
2227         if (r)
2228                 goto init_failed;
2229
2230         r = amdgpu_device_ip_hw_init_phase2(adev);
2231         if (r)
2232                 goto init_failed;
2233
2234         /*
2235          * retired pages will be loaded from eeprom and reserved here,
2236          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2237          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2238          * for I2C communication which only true at this point.
2239          *
2240          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2241          * failure from bad gpu situation and stop amdgpu init process
2242          * accordingly. For other failed cases, it will still release all
2243          * the resource and print error message, rather than returning one
2244          * negative value to upper level.
2245          *
2246          * Note: theoretically, this should be called before all vram allocations
2247          * to protect retired page from abusing
2248          */
2249         r = amdgpu_ras_recovery_init(adev);
2250         if (r)
2251                 goto init_failed;
2252
2253         if (adev->gmc.xgmi.num_physical_nodes > 1)
2254                 amdgpu_xgmi_add_device(adev);
2255         amdgpu_amdkfd_device_init(adev);
2256
2257         amdgpu_fru_get_product_info(adev);
2258
2259 init_failed:
2260         if (amdgpu_sriov_vf(adev))
2261                 amdgpu_virt_release_full_gpu(adev, true);
2262
2263         return r;
2264 }
2265
2266 /**
2267  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2268  *
2269  * @adev: amdgpu_device pointer
2270  *
2271  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2272  * this function before a GPU reset.  If the value is retained after a
2273  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2274  */
2275 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2276 {
2277         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2278 }
2279
2280 /**
2281  * amdgpu_device_check_vram_lost - check if vram is valid
2282  *
2283  * @adev: amdgpu_device pointer
2284  *
2285  * Checks the reset magic value written to the gart pointer in VRAM.
2286  * The driver calls this after a GPU reset to see if the contents of
2287  * VRAM is lost or now.
2288  * returns true if vram is lost, false if not.
2289  */
2290 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2291 {
2292         if (memcmp(adev->gart.ptr, adev->reset_magic,
2293                         AMDGPU_RESET_MAGIC_NUM))
2294                 return true;
2295
2296         if (!amdgpu_in_reset(adev))
2297                 return false;
2298
2299         /*
2300          * For all ASICs with baco/mode1 reset, the VRAM is
2301          * always assumed to be lost.
2302          */
2303         switch (amdgpu_asic_reset_method(adev)) {
2304         case AMD_RESET_METHOD_BACO:
2305         case AMD_RESET_METHOD_MODE1:
2306                 return true;
2307         default:
2308                 return false;
2309         }
2310 }
2311
2312 /**
2313  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2314  *
2315  * @adev: amdgpu_device pointer
2316  * @state: clockgating state (gate or ungate)
2317  *
2318  * The list of all the hardware IPs that make up the asic is walked and the
2319  * set_clockgating_state callbacks are run.
2320  * Late initialization pass enabling clockgating for hardware IPs.
2321  * Fini or suspend, pass disabling clockgating for hardware IPs.
2322  * Returns 0 on success, negative error code on failure.
2323  */
2324
2325 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2326                                                 enum amd_clockgating_state state)
2327 {
2328         int i, j, r;
2329
2330         if (amdgpu_emu_mode == 1)
2331                 return 0;
2332
2333         for (j = 0; j < adev->num_ip_blocks; j++) {
2334                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2335                 if (!adev->ip_blocks[i].status.late_initialized)
2336                         continue;
2337                 /* skip CG for VCE/UVD, it's handled specially */
2338                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2339                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2340                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2341                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2342                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2343                         /* enable clockgating to save power */
2344                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2345                                                                                      state);
2346                         if (r) {
2347                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2348                                           adev->ip_blocks[i].version->funcs->name, r);
2349                                 return r;
2350                         }
2351                 }
2352         }
2353
2354         return 0;
2355 }
2356
2357 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2358 {
2359         int i, j, r;
2360
2361         if (amdgpu_emu_mode == 1)
2362                 return 0;
2363
2364         for (j = 0; j < adev->num_ip_blocks; j++) {
2365                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2366                 if (!adev->ip_blocks[i].status.late_initialized)
2367                         continue;
2368                 /* skip CG for VCE/UVD, it's handled specially */
2369                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2370                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2371                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2372                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2373                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2374                         /* enable powergating to save power */
2375                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2376                                                                                         state);
2377                         if (r) {
2378                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2379                                           adev->ip_blocks[i].version->funcs->name, r);
2380                                 return r;
2381                         }
2382                 }
2383         }
2384         return 0;
2385 }
2386
2387 static int amdgpu_device_enable_mgpu_fan_boost(void)
2388 {
2389         struct amdgpu_gpu_instance *gpu_ins;
2390         struct amdgpu_device *adev;
2391         int i, ret = 0;
2392
2393         mutex_lock(&mgpu_info.mutex);
2394
2395         /*
2396          * MGPU fan boost feature should be enabled
2397          * only when there are two or more dGPUs in
2398          * the system
2399          */
2400         if (mgpu_info.num_dgpu < 2)
2401                 goto out;
2402
2403         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2404                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2405                 adev = gpu_ins->adev;
2406                 if (!(adev->flags & AMD_IS_APU) &&
2407                     !gpu_ins->mgpu_fan_enabled) {
2408                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2409                         if (ret)
2410                                 break;
2411
2412                         gpu_ins->mgpu_fan_enabled = 1;
2413                 }
2414         }
2415
2416 out:
2417         mutex_unlock(&mgpu_info.mutex);
2418
2419         return ret;
2420 }
2421
2422 /**
2423  * amdgpu_device_ip_late_init - run late init for hardware IPs
2424  *
2425  * @adev: amdgpu_device pointer
2426  *
2427  * Late initialization pass for hardware IPs.  The list of all the hardware
2428  * IPs that make up the asic is walked and the late_init callbacks are run.
2429  * late_init covers any special initialization that an IP requires
2430  * after all of the have been initialized or something that needs to happen
2431  * late in the init process.
2432  * Returns 0 on success, negative error code on failure.
2433  */
2434 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2435 {
2436         struct amdgpu_gpu_instance *gpu_instance;
2437         int i = 0, r;
2438
2439         for (i = 0; i < adev->num_ip_blocks; i++) {
2440                 if (!adev->ip_blocks[i].status.hw)
2441                         continue;
2442                 if (adev->ip_blocks[i].version->funcs->late_init) {
2443                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2444                         if (r) {
2445                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2446                                           adev->ip_blocks[i].version->funcs->name, r);
2447                                 return r;
2448                         }
2449                 }
2450                 adev->ip_blocks[i].status.late_initialized = true;
2451         }
2452
2453         amdgpu_ras_set_error_query_ready(adev, true);
2454
2455         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2456         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2457
2458         amdgpu_device_fill_reset_magic(adev);
2459
2460         r = amdgpu_device_enable_mgpu_fan_boost();
2461         if (r)
2462                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2463
2464
2465         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2466                 mutex_lock(&mgpu_info.mutex);
2467
2468                 /*
2469                  * Reset device p-state to low as this was booted with high.
2470                  *
2471                  * This should be performed only after all devices from the same
2472                  * hive get initialized.
2473                  *
2474                  * However, it's unknown how many device in the hive in advance.
2475                  * As this is counted one by one during devices initializations.
2476                  *
2477                  * So, we wait for all XGMI interlinked devices initialized.
2478                  * This may bring some delays as those devices may come from
2479                  * different hives. But that should be OK.
2480                  */
2481                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2482                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2483                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2484                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2485                                         continue;
2486
2487                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2488                                                 AMDGPU_XGMI_PSTATE_MIN);
2489                                 if (r) {
2490                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2491                                         break;
2492                                 }
2493                         }
2494                 }
2495
2496                 mutex_unlock(&mgpu_info.mutex);
2497         }
2498
2499         return 0;
2500 }
2501
2502 /**
2503  * amdgpu_device_ip_fini - run fini for hardware IPs
2504  *
2505  * @adev: amdgpu_device pointer
2506  *
2507  * Main teardown pass for hardware IPs.  The list of all the hardware
2508  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2509  * are run.  hw_fini tears down the hardware associated with each IP
2510  * and sw_fini tears down any software state associated with each IP.
2511  * Returns 0 on success, negative error code on failure.
2512  */
2513 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2514 {
2515         int i, r;
2516
2517         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2518                 amdgpu_virt_release_ras_err_handler_data(adev);
2519
2520         amdgpu_ras_pre_fini(adev);
2521
2522         if (adev->gmc.xgmi.num_physical_nodes > 1)
2523                 amdgpu_xgmi_remove_device(adev);
2524
2525         amdgpu_amdkfd_device_fini(adev);
2526
2527         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2528         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2529
2530         /* need to disable SMC first */
2531         for (i = 0; i < adev->num_ip_blocks; i++) {
2532                 if (!adev->ip_blocks[i].status.hw)
2533                         continue;
2534                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2535                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2536                         /* XXX handle errors */
2537                         if (r) {
2538                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2539                                           adev->ip_blocks[i].version->funcs->name, r);
2540                         }
2541                         adev->ip_blocks[i].status.hw = false;
2542                         break;
2543                 }
2544         }
2545
2546         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2547                 if (!adev->ip_blocks[i].status.hw)
2548                         continue;
2549
2550                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2551                 /* XXX handle errors */
2552                 if (r) {
2553                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2554                                   adev->ip_blocks[i].version->funcs->name, r);
2555                 }
2556
2557                 adev->ip_blocks[i].status.hw = false;
2558         }
2559
2560
2561         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2562                 if (!adev->ip_blocks[i].status.sw)
2563                         continue;
2564
2565                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2566                         amdgpu_ucode_free_bo(adev);
2567                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2568                         amdgpu_device_wb_fini(adev);
2569                         amdgpu_device_vram_scratch_fini(adev);
2570                         amdgpu_ib_pool_fini(adev);
2571                 }
2572
2573                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2574                 /* XXX handle errors */
2575                 if (r) {
2576                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2577                                   adev->ip_blocks[i].version->funcs->name, r);
2578                 }
2579                 adev->ip_blocks[i].status.sw = false;
2580                 adev->ip_blocks[i].status.valid = false;
2581         }
2582
2583         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2584                 if (!adev->ip_blocks[i].status.late_initialized)
2585                         continue;
2586                 if (adev->ip_blocks[i].version->funcs->late_fini)
2587                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2588                 adev->ip_blocks[i].status.late_initialized = false;
2589         }
2590
2591         amdgpu_ras_fini(adev);
2592
2593         if (amdgpu_sriov_vf(adev))
2594                 if (amdgpu_virt_release_full_gpu(adev, false))
2595                         DRM_ERROR("failed to release exclusive mode on fini\n");
2596
2597         return 0;
2598 }
2599
2600 /**
2601  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2602  *
2603  * @work: work_struct.
2604  */
2605 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2606 {
2607         struct amdgpu_device *adev =
2608                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2609         int r;
2610
2611         r = amdgpu_ib_ring_tests(adev);
2612         if (r)
2613                 DRM_ERROR("ib ring test failed (%d).\n", r);
2614 }
2615
2616 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2617 {
2618         struct amdgpu_device *adev =
2619                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2620
2621         mutex_lock(&adev->gfx.gfx_off_mutex);
2622         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2623                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2624                         adev->gfx.gfx_off_state = true;
2625         }
2626         mutex_unlock(&adev->gfx.gfx_off_mutex);
2627 }
2628
2629 /**
2630  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2631  *
2632  * @adev: amdgpu_device pointer
2633  *
2634  * Main suspend function for hardware IPs.  The list of all the hardware
2635  * IPs that make up the asic is walked, clockgating is disabled and the
2636  * suspend callbacks are run.  suspend puts the hardware and software state
2637  * in each IP into a state suitable for suspend.
2638  * Returns 0 on success, negative error code on failure.
2639  */
2640 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2641 {
2642         int i, r;
2643
2644         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2645         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2646
2647         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2648                 if (!adev->ip_blocks[i].status.valid)
2649                         continue;
2650
2651                 /* displays are handled separately */
2652                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2653                         continue;
2654
2655                 /* XXX handle errors */
2656                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2657                 /* XXX handle errors */
2658                 if (r) {
2659                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2660                                   adev->ip_blocks[i].version->funcs->name, r);
2661                         return r;
2662                 }
2663
2664                 adev->ip_blocks[i].status.hw = false;
2665         }
2666
2667         return 0;
2668 }
2669
2670 /**
2671  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2672  *
2673  * @adev: amdgpu_device pointer
2674  *
2675  * Main suspend function for hardware IPs.  The list of all the hardware
2676  * IPs that make up the asic is walked, clockgating is disabled and the
2677  * suspend callbacks are run.  suspend puts the hardware and software state
2678  * in each IP into a state suitable for suspend.
2679  * Returns 0 on success, negative error code on failure.
2680  */
2681 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2682 {
2683         int i, r;
2684
2685         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2686                 if (!adev->ip_blocks[i].status.valid)
2687                         continue;
2688                 /* displays are handled in phase1 */
2689                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2690                         continue;
2691                 /* PSP lost connection when err_event_athub occurs */
2692                 if (amdgpu_ras_intr_triggered() &&
2693                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2694                         adev->ip_blocks[i].status.hw = false;
2695                         continue;
2696                 }
2697                 /* XXX handle errors */
2698                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2699                 /* XXX handle errors */
2700                 if (r) {
2701                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2702                                   adev->ip_blocks[i].version->funcs->name, r);
2703                 }
2704                 adev->ip_blocks[i].status.hw = false;
2705                 /* handle putting the SMC in the appropriate state */
2706                 if(!amdgpu_sriov_vf(adev)){
2707                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2708                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2709                                 if (r) {
2710                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2711                                                         adev->mp1_state, r);
2712                                         return r;
2713                                 }
2714                         }
2715                 }
2716                 adev->ip_blocks[i].status.hw = false;
2717         }
2718
2719         return 0;
2720 }
2721
2722 /**
2723  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2724  *
2725  * @adev: amdgpu_device pointer
2726  *
2727  * Main suspend function for hardware IPs.  The list of all the hardware
2728  * IPs that make up the asic is walked, clockgating is disabled and the
2729  * suspend callbacks are run.  suspend puts the hardware and software state
2730  * in each IP into a state suitable for suspend.
2731  * Returns 0 on success, negative error code on failure.
2732  */
2733 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2734 {
2735         int r;
2736
2737         if (amdgpu_sriov_vf(adev))
2738                 amdgpu_virt_request_full_gpu(adev, false);
2739
2740         r = amdgpu_device_ip_suspend_phase1(adev);
2741         if (r)
2742                 return r;
2743         r = amdgpu_device_ip_suspend_phase2(adev);
2744
2745         if (amdgpu_sriov_vf(adev))
2746                 amdgpu_virt_release_full_gpu(adev, false);
2747
2748         return r;
2749 }
2750
2751 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2752 {
2753         int i, r;
2754
2755         static enum amd_ip_block_type ip_order[] = {
2756                 AMD_IP_BLOCK_TYPE_GMC,
2757                 AMD_IP_BLOCK_TYPE_COMMON,
2758                 AMD_IP_BLOCK_TYPE_PSP,
2759                 AMD_IP_BLOCK_TYPE_IH,
2760         };
2761
2762         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2763                 int j;
2764                 struct amdgpu_ip_block *block;
2765
2766                 block = &adev->ip_blocks[i];
2767                 block->status.hw = false;
2768
2769                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2770
2771                         if (block->version->type != ip_order[j] ||
2772                                 !block->status.valid)
2773                                 continue;
2774
2775                         r = block->version->funcs->hw_init(adev);
2776                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2777                         if (r)
2778                                 return r;
2779                         block->status.hw = true;
2780                 }
2781         }
2782
2783         return 0;
2784 }
2785
2786 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2787 {
2788         int i, r;
2789
2790         static enum amd_ip_block_type ip_order[] = {
2791                 AMD_IP_BLOCK_TYPE_SMC,
2792                 AMD_IP_BLOCK_TYPE_DCE,
2793                 AMD_IP_BLOCK_TYPE_GFX,
2794                 AMD_IP_BLOCK_TYPE_SDMA,
2795                 AMD_IP_BLOCK_TYPE_UVD,
2796                 AMD_IP_BLOCK_TYPE_VCE,
2797                 AMD_IP_BLOCK_TYPE_VCN
2798         };
2799
2800         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2801                 int j;
2802                 struct amdgpu_ip_block *block;
2803
2804                 for (j = 0; j < adev->num_ip_blocks; j++) {
2805                         block = &adev->ip_blocks[j];
2806
2807                         if (block->version->type != ip_order[i] ||
2808                                 !block->status.valid ||
2809                                 block->status.hw)
2810                                 continue;
2811
2812                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2813                                 r = block->version->funcs->resume(adev);
2814                         else
2815                                 r = block->version->funcs->hw_init(adev);
2816
2817                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2818                         if (r)
2819                                 return r;
2820                         block->status.hw = true;
2821                 }
2822         }
2823
2824         return 0;
2825 }
2826
2827 /**
2828  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2829  *
2830  * @adev: amdgpu_device pointer
2831  *
2832  * First resume function for hardware IPs.  The list of all the hardware
2833  * IPs that make up the asic is walked and the resume callbacks are run for
2834  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2835  * after a suspend and updates the software state as necessary.  This
2836  * function is also used for restoring the GPU after a GPU reset.
2837  * Returns 0 on success, negative error code on failure.
2838  */
2839 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2840 {
2841         int i, r;
2842
2843         for (i = 0; i < adev->num_ip_blocks; i++) {
2844                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2845                         continue;
2846                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2847                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2848                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2849
2850                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2851                         if (r) {
2852                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2853                                           adev->ip_blocks[i].version->funcs->name, r);
2854                                 return r;
2855                         }
2856                         adev->ip_blocks[i].status.hw = true;
2857                 }
2858         }
2859
2860         return 0;
2861 }
2862
2863 /**
2864  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2865  *
2866  * @adev: amdgpu_device pointer
2867  *
2868  * First resume function for hardware IPs.  The list of all the hardware
2869  * IPs that make up the asic is walked and the resume callbacks are run for
2870  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2871  * functional state after a suspend and updates the software state as
2872  * necessary.  This function is also used for restoring the GPU after a GPU
2873  * reset.
2874  * Returns 0 on success, negative error code on failure.
2875  */
2876 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2877 {
2878         int i, r;
2879
2880         for (i = 0; i < adev->num_ip_blocks; i++) {
2881                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2882                         continue;
2883                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2884                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2885                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2886                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2887                         continue;
2888                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2889                 if (r) {
2890                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2891                                   adev->ip_blocks[i].version->funcs->name, r);
2892                         return r;
2893                 }
2894                 adev->ip_blocks[i].status.hw = true;
2895         }
2896
2897         return 0;
2898 }
2899
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The hardware IPs are split into
 * two resume functions because they are also used in recovering from a GPU
 * reset, where some additional steps need to be taken between them.  In
 * this case (S3/S4) they are run sequentially, with firmware loading in
 * between.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
        int ret;

        ret = amdgpu_device_ip_resume_phase1(adev);
        if (ret)
                return ret;

        ret = amdgpu_device_fw_loading(adev);
        if (ret)
                return ret;

        return amdgpu_device_ip_resume_phase2(adev);
}
2928
2929 /**
2930  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2931  *
2932  * @adev: amdgpu_device pointer
2933  *
2934  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2935  */
2936 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2937 {
2938         if (amdgpu_sriov_vf(adev)) {
2939                 if (adev->is_atom_fw) {
2940                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2941                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2942                 } else {
2943                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2944                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2945                 }
2946
2947                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2948                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2949         }
2950 }
2951
/**
 * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
 *
 * @asic_type: AMD asic type
 *
 * Check if there is DC (new modesetting infrastructure) support for an asic.
 * Which asics are listed depends on the CONFIG_DRM_AMD_DC* options the
 * kernel was built with; without CONFIG_DRM_AMD_DC only the default case
 * remains.
 *
 * The result also depends on the amdgpu_dc setting:
 *  - asics in the first group use DC only when amdgpu_dc is greater than 0;
 *  - asics in the second group use DC unless amdgpu_dc is 0;
 *  - any other asic never uses DC, and a message is logged when amdgpu_dc
 *    is greater than 0.
 *
 * Returns true if DC has support, false if not.
 */
bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
{
        switch (asic_type) {
#if defined(CONFIG_DRM_AMD_DC)
#if defined(CONFIG_DRM_AMD_DC_SI)
        case CHIP_TAHITI:
        case CHIP_PITCAIRN:
        case CHIP_VERDE:
        case CHIP_OLAND:
#endif
        case CHIP_BONAIRE:
        case CHIP_KAVERI:
        case CHIP_KABINI:
        case CHIP_MULLINS:
                /*
                 * We have systems in the wild with these ASICs that require
                 * LVDS and VGA support which is not supported with DC.
                 *
                 * Fallback to the non-DC driver here by default so as not to
                 * cause regressions.
                 */
                return amdgpu_dc > 0;
        case CHIP_HAWAII:
        case CHIP_CARRIZO:
        case CHIP_STONEY:
        case CHIP_POLARIS10:
        case CHIP_POLARIS11:
        case CHIP_POLARIS12:
        case CHIP_VEGAM:
        case CHIP_TONGA:
        case CHIP_FIJI:
        case CHIP_VEGA10:
        case CHIP_VEGA12:
        case CHIP_VEGA20:
#if defined(CONFIG_DRM_AMD_DC_DCN)
        case CHIP_RAVEN:
        case CHIP_NAVI10:
        case CHIP_NAVI14:
        case CHIP_NAVI12:
        case CHIP_RENOIR:
#endif
#if defined(CONFIG_DRM_AMD_DC_DCN3_0)
        case CHIP_SIENNA_CICHLID:
        case CHIP_NAVY_FLOUNDER:
#endif
                /* DC is used here unless amdgpu_dc is 0 */
                return amdgpu_dc != 0;
#endif
        default:
                /* no DC for this asic; only tell the user if it was asked for */
                if (amdgpu_dc > 0)
                        DRM_INFO("Display Core has been requested via kernel parameter "
                                         "but isn't supported by ASIC, ignoring\n");
                return false;
        }
}
3014
3015 /**
3016  * amdgpu_device_has_dc_support - check if dc is supported
3017  *
3018  * @adev: amdgpu_device_pointer
3019  *
3020  * Returns true for supported, false for not supported
3021  */
3022 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3023 {
3024         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3025                 return false;
3026
3027         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3028 }
3029
3030
3031 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3032 {
3033         struct amdgpu_device *adev =
3034                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3035         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3036
3037         /* It's a bug to not have a hive within this function */
3038         if (WARN_ON(!hive))
3039                 return;
3040
3041         /*
3042          * Use task barrier to synchronize all xgmi reset works across the
3043          * hive. task_barrier_enter and task_barrier_exit will block
3044          * until all the threads running the xgmi reset works reach
3045          * those points. task_barrier_full will do both blocks.
3046          */
3047         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3048
3049                 task_barrier_enter(&hive->tb);
3050                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3051
3052                 if (adev->asic_reset_res)
3053                         goto fail;
3054
3055                 task_barrier_exit(&hive->tb);
3056                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3057
3058                 if (adev->asic_reset_res)
3059                         goto fail;
3060
3061                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3062                         adev->mmhub.funcs->reset_ras_error_count(adev);
3063         } else {
3064
3065                 task_barrier_full(&hive->tb);
3066                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3067         }
3068
3069 fail:
3070         if (adev->asic_reset_res)
3071                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3072                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3073         amdgpu_put_xgmi_hive(hive);
3074 }
3075
3076 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3077 {
3078         char *input = amdgpu_lockup_timeout;
3079         char *timeout_setting = NULL;
3080         int index = 0;
3081         long timeout;
3082         int ret = 0;
3083
3084         /*
3085          * By default timeout for non compute jobs is 10000.
3086          * And there is no timeout enforced on compute jobs.
3087          * In SR-IOV or passthrough mode, timeout for compute
3088          * jobs are 60000 by default.
3089          */
3090         adev->gfx_timeout = msecs_to_jiffies(10000);
3091         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3092         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3093                 adev->compute_timeout =  msecs_to_jiffies(60000);
3094         else
3095                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3096
3097         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3098                 while ((timeout_setting = strsep(&input, ",")) &&
3099                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3100                         ret = kstrtol(timeout_setting, 0, &timeout);
3101                         if (ret)
3102                                 return ret;
3103
3104                         if (timeout == 0) {
3105                                 index++;
3106                                 continue;
3107                         } else if (timeout < 0) {
3108                                 timeout = MAX_SCHEDULE_TIMEOUT;
3109                         } else {
3110                                 timeout = msecs_to_jiffies(timeout);
3111                         }
3112
3113                         switch (index++) {
3114                         case 0:
3115                                 adev->gfx_timeout = timeout;
3116                                 break;
3117                         case 1:
3118                                 adev->compute_timeout = timeout;
3119                                 break;
3120                         case 2:
3121                                 adev->sdma_timeout = timeout;
3122                                 break;
3123                         case 3:
3124                                 adev->video_timeout = timeout;
3125                                 break;
3126                         default:
3127                                 break;
3128                         }
3129                 }
3130                 /*
3131                  * There is only one value specified and
3132                  * it should apply to all non-compute jobs.
3133                  */
3134                 if (index == 1) {
3135                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3136                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3137                                 adev->compute_timeout = adev->gfx_timeout;
3138                 }
3139         }
3140
3141         return ret;
3142 }
3143
/*
 * NULL-terminated list of the device attributes defined for the product
 * name, product number, serial number and PCIe replay count.
 */
static const struct attribute *amdgpu_dev_attributes[] = {
        &dev_attr_product_name.attr,
        &dev_attr_product_number.attr,
        &dev_attr_serial_number.attr,
        &dev_attr_pcie_replay_count.attr,
        NULL
};
3151
3152
3153 /**
3154  * amdgpu_device_init - initialize the driver
3155  *
3156  * @adev: amdgpu_device pointer
3157  * @flags: driver flags
3158  *
3159  * Initializes the driver info and hw (all asics).
3160  * Returns 0 for success or an error on failure.
3161  * Called at driver startup.
3162  */
3163 int amdgpu_device_init(struct amdgpu_device *adev,
3164                        uint32_t flags)
3165 {
3166         struct drm_device *ddev = adev_to_drm(adev);
3167         struct pci_dev *pdev = adev->pdev;
3168         int r, i;
3169         bool boco = false;
3170         u32 max_MBps;
3171
3172         adev->shutdown = false;
3173         adev->flags = flags;
3174
3175         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3176                 adev->asic_type = amdgpu_force_asic_type;
3177         else
3178                 adev->asic_type = flags & AMD_ASIC_MASK;
3179
3180         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3181         if (amdgpu_emu_mode == 1)
3182                 adev->usec_timeout *= 10;
3183         adev->gmc.gart_size = 512 * 1024 * 1024;
3184         adev->accel_working = false;
3185         adev->num_rings = 0;
3186         adev->mman.buffer_funcs = NULL;
3187         adev->mman.buffer_funcs_ring = NULL;
3188         adev->vm_manager.vm_pte_funcs = NULL;
3189         adev->vm_manager.vm_pte_num_scheds = 0;
3190         adev->gmc.gmc_funcs = NULL;
3191         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3192         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3193
3194         adev->smc_rreg = &amdgpu_invalid_rreg;
3195         adev->smc_wreg = &amdgpu_invalid_wreg;
3196         adev->pcie_rreg = &amdgpu_invalid_rreg;
3197         adev->pcie_wreg = &amdgpu_invalid_wreg;
3198         adev->pciep_rreg = &amdgpu_invalid_rreg;
3199         adev->pciep_wreg = &amdgpu_invalid_wreg;
3200         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3201         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3202         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3203         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3204         adev->didt_rreg = &amdgpu_invalid_rreg;
3205         adev->didt_wreg = &amdgpu_invalid_wreg;
3206         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3207         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3208         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3209         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3210
3211         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3212                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3213                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3214
3215         /* mutex initialization are all done here so we
3216          * can recall function without having locking issues */
3217         atomic_set(&adev->irq.ih.lock, 0);
3218         mutex_init(&adev->firmware.mutex);
3219         mutex_init(&adev->pm.mutex);
3220         mutex_init(&adev->gfx.gpu_clock_mutex);
3221         mutex_init(&adev->srbm_mutex);
3222         mutex_init(&adev->gfx.pipe_reserve_mutex);
3223         mutex_init(&adev->gfx.gfx_off_mutex);
3224         mutex_init(&adev->grbm_idx_mutex);
3225         mutex_init(&adev->mn_lock);
3226         mutex_init(&adev->virt.vf_errors.lock);
3227         hash_init(adev->mn_hash);
3228         atomic_set(&adev->in_gpu_reset, 0);
3229         init_rwsem(&adev->reset_sem);
3230         mutex_init(&adev->psp.mutex);
3231         mutex_init(&adev->notifier_lock);
3232
3233         r = amdgpu_device_check_arguments(adev);
3234         if (r)
3235                 return r;
3236
3237         spin_lock_init(&adev->mmio_idx_lock);
3238         spin_lock_init(&adev->smc_idx_lock);
3239         spin_lock_init(&adev->pcie_idx_lock);
3240         spin_lock_init(&adev->uvd_ctx_idx_lock);
3241         spin_lock_init(&adev->didt_idx_lock);
3242         spin_lock_init(&adev->gc_cac_idx_lock);
3243         spin_lock_init(&adev->se_cac_idx_lock);
3244         spin_lock_init(&adev->audio_endpt_idx_lock);
3245         spin_lock_init(&adev->mm_stats.lock);
3246
3247         INIT_LIST_HEAD(&adev->shadow_list);
3248         mutex_init(&adev->shadow_list_lock);
3249
3250         INIT_DELAYED_WORK(&adev->delayed_init_work,
3251                           amdgpu_device_delayed_init_work_handler);
3252         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3253                           amdgpu_device_delay_enable_gfx_off);
3254
3255         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3256
3257         adev->gfx.gfx_off_req_count = 1;
3258         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3259
3260         atomic_set(&adev->throttling_logging_enabled, 1);
3261         /*
3262          * If throttling continues, logging will be performed every minute
3263          * to avoid log flooding. "-1" is subtracted since the thermal
3264          * throttling interrupt comes every second. Thus, the total logging
3265          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3266          * for throttling interrupt) = 60 seconds.
3267          */
3268         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3269         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3270
3271         /* Registers mapping */
3272         /* TODO: block userspace mapping of io register */
3273         if (adev->asic_type >= CHIP_BONAIRE) {
3274                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3275                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3276         } else {
3277                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3278                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3279         }
3280
3281         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3282         if (adev->rmmio == NULL) {
3283                 return -ENOMEM;
3284         }
3285         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3286         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3287
3288         /* io port mapping */
3289         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3290                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3291                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3292                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3293                         break;
3294                 }
3295         }
3296         if (adev->rio_mem == NULL)
3297                 DRM_INFO("PCI I/O BAR is not found.\n");
3298
3299         /* enable PCIE atomic ops */
3300         r = pci_enable_atomic_ops_to_root(adev->pdev,
3301                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3302                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3303         if (r) {
3304                 adev->have_atomics_support = false;
3305                 DRM_INFO("PCIE atomic ops is not supported\n");
3306         } else {
3307                 adev->have_atomics_support = true;
3308         }
3309
3310         amdgpu_device_get_pcie_info(adev);
3311
3312         if (amdgpu_mcbp)
3313                 DRM_INFO("MCBP is enabled\n");
3314
3315         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3316                 adev->enable_mes = true;
3317
3318         /* detect hw virtualization here */
3319         amdgpu_detect_virtualization(adev);
3320
3321         r = amdgpu_device_get_job_timeout_settings(adev);
3322         if (r) {
3323                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3324                 goto failed_unmap;
3325         }
3326
3327         /* early init functions */
3328         r = amdgpu_device_ip_early_init(adev);
3329         if (r)
3330                 goto failed_unmap;
3331
3332         /* doorbell bar mapping and doorbell index init*/
3333         amdgpu_device_doorbell_init(adev);
3334
3335         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3336         /* this will fail for cards that aren't VGA class devices, just
3337          * ignore it */
3338         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3339
3340         if (amdgpu_device_supports_boco(ddev))
3341                 boco = true;
3342         if (amdgpu_has_atpx() &&
3343             (amdgpu_is_atpx_hybrid() ||
3344              amdgpu_has_atpx_dgpu_power_cntl()) &&
3345             !pci_is_thunderbolt_attached(adev->pdev))
3346                 vga_switcheroo_register_client(adev->pdev,
3347                                                &amdgpu_switcheroo_ops, boco);
3348         if (boco)
3349                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3350
3351         if (amdgpu_emu_mode == 1) {
3352                 /* post the asic on emulation mode */
3353                 emu_soc_asic_init(adev);
3354                 goto fence_driver_init;
3355         }
3356
3357         /* detect if we are with an SRIOV vbios */
3358         amdgpu_device_detect_sriov_bios(adev);
3359
3360         /* check if we need to reset the asic
3361          *  E.g., driver was not cleanly unloaded previously, etc.
3362          */
3363         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3364                 r = amdgpu_asic_reset(adev);
3365                 if (r) {
3366                         dev_err(adev->dev, "asic reset on init failed\n");
3367                         goto failed;
3368                 }
3369         }
3370
3371         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3372
3373         /* Post card if necessary */
3374         if (amdgpu_device_need_post(adev)) {
3375                 if (!adev->bios) {
3376                         dev_err(adev->dev, "no vBIOS found\n");
3377                         r = -EINVAL;
3378                         goto failed;
3379                 }
3380                 DRM_INFO("GPU posting now...\n");
3381                 r = amdgpu_device_asic_init(adev);
3382                 if (r) {
3383                         dev_err(adev->dev, "gpu post error!\n");
3384                         goto failed;
3385                 }
3386         }
3387
3388         if (adev->is_atom_fw) {
3389                 /* Initialize clocks */
3390                 r = amdgpu_atomfirmware_get_clock_info(adev);
3391                 if (r) {
3392                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3393                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3394                         goto failed;
3395                 }
3396         } else {
3397                 /* Initialize clocks */
3398                 r = amdgpu_atombios_get_clock_info(adev);
3399                 if (r) {
3400                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3401                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3402                         goto failed;
3403                 }
3404                 /* init i2c buses */
3405                 if (!amdgpu_device_has_dc_support(adev))
3406                         amdgpu_atombios_i2c_init(adev);
3407         }
3408
3409 fence_driver_init:
3410         /* Fence driver */
3411         r = amdgpu_fence_driver_init(adev);
3412         if (r) {
3413                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3414                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3415                 goto failed;
3416         }
3417
3418         /* init the mode config */
3419         drm_mode_config_init(adev_to_drm(adev));
3420
3421         r = amdgpu_device_ip_init(adev);
3422         if (r) {
3423                 /* failed in exclusive mode due to timeout */
3424                 if (amdgpu_sriov_vf(adev) &&
3425                     !amdgpu_sriov_runtime(adev) &&
3426                     amdgpu_virt_mmio_blocked(adev) &&
3427                     !amdgpu_virt_wait_reset(adev)) {
3428                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3429                         /* Don't send request since VF is inactive. */
3430                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3431                         adev->virt.ops = NULL;
3432                         r = -EAGAIN;
3433                         goto failed;
3434                 }
3435                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3436                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3437                 goto failed;
3438         }
3439
3440         dev_info(adev->dev,
3441                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3442                         adev->gfx.config.max_shader_engines,
3443                         adev->gfx.config.max_sh_per_se,
3444                         adev->gfx.config.max_cu_per_sh,
3445                         adev->gfx.cu_info.number);
3446
3447         adev->accel_working = true;
3448
3449         amdgpu_vm_check_compute_bug(adev);
3450
3451         /* Initialize the buffer migration limit. */
3452         if (amdgpu_moverate >= 0)
3453                 max_MBps = amdgpu_moverate;
3454         else
3455                 max_MBps = 8; /* Allow 8 MB/s. */
3456         /* Get a log2 for easy divisions. */
3457         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3458
3459         amdgpu_fbdev_init(adev);
3460
3461         r = amdgpu_pm_sysfs_init(adev);
3462         if (r) {
3463                 adev->pm_sysfs_en = false;
3464                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3465         } else
3466                 adev->pm_sysfs_en = true;
3467
3468         r = amdgpu_ucode_sysfs_init(adev);
3469         if (r) {
3470                 adev->ucode_sysfs_en = false;
3471                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3472         } else
3473                 adev->ucode_sysfs_en = true;
3474
3475         if ((amdgpu_testing & 1)) {
3476                 if (adev->accel_working)
3477                         amdgpu_test_moves(adev);
3478                 else
3479                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3480         }
3481         if (amdgpu_benchmarking) {
3482                 if (adev->accel_working)
3483                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3484                 else
3485                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3486         }
3487
3488         /*
3489          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3490          * Otherwise the mgpu fan boost feature will be skipped due to the
3491          * gpu instance is counted less.
3492          */
3493         amdgpu_register_gpu_instance(adev);
3494
3495         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3496          * explicit gating rather than handling it automatically.
3497          */
3498         r = amdgpu_device_ip_late_init(adev);
3499         if (r) {
3500                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3501                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3502                 goto failed;
3503         }
3504
3505         /* must succeed. */
3506         amdgpu_ras_resume(adev);
3507
3508         queue_delayed_work(system_wq, &adev->delayed_init_work,
3509                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3510
3511         if (amdgpu_sriov_vf(adev))
3512                 flush_delayed_work(&adev->delayed_init_work);
3513
3514         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3515         if (r)
3516                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3517
3518         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3519                 r = amdgpu_pmu_init(adev);
3520         if (r)
3521                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3522
3523         /* Have stored pci confspace at hand for restore in sudden PCI error */
3524         if (amdgpu_device_cache_pci_state(adev->pdev))
3525                 pci_restore_state(pdev);
3526
3527         return 0;
3528
3529 failed:
3530         amdgpu_vf_error_trans_all(adev);
3531         if (boco)
3532                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3533
3534 failed_unmap:
3535         iounmap(adev->rmmio);
3536         adev->rmmio = NULL;
3537
3538         return r;
3539 }
3540
3541 /**
3542  * amdgpu_device_fini - tear down the driver
3543  *
3544  * @adev: amdgpu_device pointer
3545  *
3546  * Tear down the driver info (all asics).
3547  * Called at driver shutdown.
3548  */
3549 void amdgpu_device_fini(struct amdgpu_device *adev)
3550 {
3551         dev_info(adev->dev, "amdgpu: finishing device.\n");
3552         flush_delayed_work(&adev->delayed_init_work);
3553         adev->shutdown = true;
3554
3555         kfree(adev->pci_state);
3556
3557         /* make sure IB test finished before entering exclusive mode
3558          * to avoid preemption on IB test
3559          * */
3560         if (amdgpu_sriov_vf(adev)) {
3561                 amdgpu_virt_request_full_gpu(adev, false);
3562                 amdgpu_virt_fini_data_exchange(adev);
3563         }
3564
3565         /* disable all interrupts */
3566         amdgpu_irq_disable_all(adev);
3567         if (adev->mode_info.mode_config_initialized){
3568                 if (!amdgpu_device_has_dc_support(adev))
3569                         drm_helper_force_disable_all(adev_to_drm(adev));
3570                 else
3571                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3572         }
3573         amdgpu_fence_driver_fini(adev);
3574         if (adev->pm_sysfs_en)
3575                 amdgpu_pm_sysfs_fini(adev);
3576         amdgpu_fbdev_fini(adev);
3577         amdgpu_device_ip_fini(adev);
3578         release_firmware(adev->firmware.gpu_info_fw);
3579         adev->firmware.gpu_info_fw = NULL;
3580         adev->accel_working = false;
3581         /* free i2c buses */
3582         if (!amdgpu_device_has_dc_support(adev))
3583                 amdgpu_i2c_fini(adev);
3584
3585         if (amdgpu_emu_mode != 1)
3586                 amdgpu_atombios_fini(adev);
3587
3588         kfree(adev->bios);
3589         adev->bios = NULL;
3590         if (amdgpu_has_atpx() &&
3591             (amdgpu_is_atpx_hybrid() ||
3592              amdgpu_has_atpx_dgpu_power_cntl()) &&
3593             !pci_is_thunderbolt_attached(adev->pdev))
3594                 vga_switcheroo_unregister_client(adev->pdev);
3595         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3596                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3597         vga_client_register(adev->pdev, NULL, NULL, NULL);
3598         if (adev->rio_mem)
3599                 pci_iounmap(adev->pdev, adev->rio_mem);
3600         adev->rio_mem = NULL;
3601         iounmap(adev->rmmio);
3602         adev->rmmio = NULL;
3603         amdgpu_device_doorbell_fini(adev);
3604
3605         if (adev->ucode_sysfs_en)
3606                 amdgpu_ucode_sysfs_fini(adev);
3607
3608         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3609         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3610                 amdgpu_pmu_fini(adev);
3611         if (adev->mman.discovery_bin)
3612                 amdgpu_discovery_fini(adev);
3613 }
3614
3615
3616 /*
3617  * Suspend & resume.
3618  */
3619 /**
3620  * amdgpu_device_suspend - initiate device suspend
3621  *
3622  * @dev: drm dev pointer
3623  * @fbcon : notify the fbdev of suspend
3624  *
3625  * Puts the hw in the suspend state (all asics).
3626  * Returns 0 for success or an error on failure.
3627  * Called at driver suspend.
3628  */
3629 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3630 {
3631         struct amdgpu_device *adev;
3632         struct drm_crtc *crtc;
3633         struct drm_connector *connector;
3634         struct drm_connector_list_iter iter;
3635         int r;
3636
3637         adev = drm_to_adev(dev);
3638
3639         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3640                 return 0;
3641
3642         adev->in_suspend = true;
3643         drm_kms_helper_poll_disable(dev);
3644
3645         if (fbcon)
3646                 amdgpu_fbdev_set_suspend(adev, 1);
3647
3648         cancel_delayed_work_sync(&adev->delayed_init_work);
3649
3650         if (!amdgpu_device_has_dc_support(adev)) {
3651                 /* turn off display hw */
3652                 drm_modeset_lock_all(dev);
3653                 drm_connector_list_iter_begin(dev, &iter);
3654                 drm_for_each_connector_iter(connector, &iter)
3655                         drm_helper_connector_dpms(connector,
3656                                                   DRM_MODE_DPMS_OFF);
3657                 drm_connector_list_iter_end(&iter);
3658                 drm_modeset_unlock_all(dev);
3659                         /* unpin the front buffers and cursors */
3660                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3661                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3662                         struct drm_framebuffer *fb = crtc->primary->fb;
3663                         struct amdgpu_bo *robj;
3664
3665                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3666                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3667                                 r = amdgpu_bo_reserve(aobj, true);
3668                                 if (r == 0) {
3669                                         amdgpu_bo_unpin(aobj);
3670                                         amdgpu_bo_unreserve(aobj);
3671                                 }
3672                         }
3673
3674                         if (fb == NULL || fb->obj[0] == NULL) {
3675                                 continue;
3676                         }
3677                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3678                         /* don't unpin kernel fb objects */
3679                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3680                                 r = amdgpu_bo_reserve(robj, true);
3681                                 if (r == 0) {
3682                                         amdgpu_bo_unpin(robj);
3683                                         amdgpu_bo_unreserve(robj);
3684                                 }
3685                         }
3686                 }
3687         }
3688
3689         amdgpu_ras_suspend(adev);
3690
3691         r = amdgpu_device_ip_suspend_phase1(adev);
3692
3693         amdgpu_amdkfd_suspend(adev, !fbcon);
3694
3695         /* evict vram memory */
3696         amdgpu_bo_evict_vram(adev);
3697
3698         amdgpu_fence_driver_suspend(adev);
3699
3700         r = amdgpu_device_ip_suspend_phase2(adev);
3701
3702         /* evict remaining vram memory
3703          * This second call to evict vram is to evict the gart page table
3704          * using the CPU.
3705          */
3706         amdgpu_bo_evict_vram(adev);
3707
3708         return 0;
3709 }
3710
3711 /**
3712  * amdgpu_device_resume - initiate device resume
3713  *
3714  * @dev: drm dev pointer
3715  * @fbcon : notify the fbdev of resume
3716  *
3717  * Bring the hw back to operating state (all asics).
3718  * Returns 0 for success or an error on failure.
3719  * Called at driver resume.
3720  */
3721 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3722 {
3723         struct drm_connector *connector;
3724         struct drm_connector_list_iter iter;
3725         struct amdgpu_device *adev = drm_to_adev(dev);
3726         struct drm_crtc *crtc;
3727         int r = 0;
3728
3729         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3730                 return 0;
3731
3732         /* post card */
3733         if (amdgpu_device_need_post(adev)) {
3734                 r = amdgpu_device_asic_init(adev);
3735                 if (r)
3736                         dev_err(adev->dev, "amdgpu asic init failed\n");
3737         }
3738
3739         r = amdgpu_device_ip_resume(adev);
3740         if (r) {
3741                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3742                 return r;
3743         }
3744         amdgpu_fence_driver_resume(adev);
3745
3746
3747         r = amdgpu_device_ip_late_init(adev);
3748         if (r)
3749                 return r;
3750
3751         queue_delayed_work(system_wq, &adev->delayed_init_work,
3752                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3753
3754         if (!amdgpu_device_has_dc_support(adev)) {
3755                 /* pin cursors */
3756                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3757                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3758
3759                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3760                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3761                                 r = amdgpu_bo_reserve(aobj, true);
3762                                 if (r == 0) {
3763                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3764                                         if (r != 0)
3765                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3766                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3767                                         amdgpu_bo_unreserve(aobj);
3768                                 }
3769                         }
3770                 }
3771         }
3772         r = amdgpu_amdkfd_resume(adev, !fbcon);
3773         if (r)
3774                 return r;
3775
3776         /* Make sure IB tests flushed */
3777         flush_delayed_work(&adev->delayed_init_work);
3778
3779         /* blat the mode back in */
3780         if (fbcon) {
3781                 if (!amdgpu_device_has_dc_support(adev)) {
3782                         /* pre DCE11 */
3783                         drm_helper_resume_force_mode(dev);
3784
3785                         /* turn on display hw */
3786                         drm_modeset_lock_all(dev);
3787
3788                         drm_connector_list_iter_begin(dev, &iter);
3789                         drm_for_each_connector_iter(connector, &iter)
3790                                 drm_helper_connector_dpms(connector,
3791                                                           DRM_MODE_DPMS_ON);
3792                         drm_connector_list_iter_end(&iter);
3793
3794                         drm_modeset_unlock_all(dev);
3795                 }
3796                 amdgpu_fbdev_set_suspend(adev, 0);
3797         }
3798
3799         drm_kms_helper_poll_enable(dev);
3800
3801         amdgpu_ras_resume(adev);
3802
3803         /*
3804          * Most of the connector probing functions try to acquire runtime pm
3805          * refs to ensure that the GPU is powered on when connector polling is
3806          * performed. Since we're calling this from a runtime PM callback,
3807          * trying to acquire rpm refs will cause us to deadlock.
3808          *
3809          * Since we're guaranteed to be holding the rpm lock, it's safe to
3810          * temporarily disable the rpm helpers so this doesn't deadlock us.
3811          */
3812 #ifdef CONFIG_PM
3813         dev->dev->power.disable_depth++;
3814 #endif
3815         if (!amdgpu_device_has_dc_support(adev))
3816                 drm_helper_hpd_irq_event(dev);
3817         else
3818                 drm_kms_helper_hotplug_event(dev);
3819 #ifdef CONFIG_PM
3820         dev->dev->power.disable_depth--;
3821 #endif
3822         adev->in_suspend = false;
3823
3824         return 0;
3825 }
3826
3827 /**
3828  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3829  *
3830  * @adev: amdgpu_device pointer
3831  *
3832  * The list of all the hardware IPs that make up the asic is walked and
3833  * the check_soft_reset callbacks are run.  check_soft_reset determines
3834  * if the asic is still hung or not.
3835  * Returns true if any of the IPs are still in a hung state, false if not.
3836  */
3837 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3838 {
3839         int i;
3840         bool asic_hang = false;
3841
3842         if (amdgpu_sriov_vf(adev))
3843                 return true;
3844
3845         if (amdgpu_asic_need_full_reset(adev))
3846                 return true;
3847
3848         for (i = 0; i < adev->num_ip_blocks; i++) {
3849                 if (!adev->ip_blocks[i].status.valid)
3850                         continue;
3851                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3852                         adev->ip_blocks[i].status.hang =
3853                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3854                 if (adev->ip_blocks[i].status.hang) {
3855                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3856                         asic_hang = true;
3857                 }
3858         }
3859         return asic_hang;
3860 }
3861
3862 /**
3863  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3864  *
3865  * @adev: amdgpu_device pointer
3866  *
3867  * The list of all the hardware IPs that make up the asic is walked and the
3868  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3869  * handles any IP specific hardware or software state changes that are
3870  * necessary for a soft reset to succeed.
3871  * Returns 0 on success, negative error code on failure.
3872  */
3873 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3874 {
3875         int i, r = 0;
3876
3877         for (i = 0; i < adev->num_ip_blocks; i++) {
3878                 if (!adev->ip_blocks[i].status.valid)
3879                         continue;
3880                 if (adev->ip_blocks[i].status.hang &&
3881                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3882                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3883                         if (r)
3884                                 return r;
3885                 }
3886         }
3887
3888         return 0;
3889 }
3890
3891 /**
3892  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3893  *
3894  * @adev: amdgpu_device pointer
3895  *
3896  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3897  * reset is necessary to recover.
3898  * Returns true if a full asic reset is required, false if not.
3899  */
3900 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3901 {
3902         int i;
3903
3904         if (amdgpu_asic_need_full_reset(adev))
3905                 return true;
3906
3907         for (i = 0; i < adev->num_ip_blocks; i++) {
3908                 if (!adev->ip_blocks[i].status.valid)
3909                         continue;
3910                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3911                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3912                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3913                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3914                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3915                         if (adev->ip_blocks[i].status.hang) {
3916                                 dev_info(adev->dev, "Some block need full reset!\n");
3917                                 return true;
3918                         }
3919                 }
3920         }
3921         return false;
3922 }
3923
3924 /**
3925  * amdgpu_device_ip_soft_reset - do a soft reset
3926  *
3927  * @adev: amdgpu_device pointer
3928  *
3929  * The list of all the hardware IPs that make up the asic is walked and the
3930  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3931  * IP specific hardware or software state changes that are necessary to soft
3932  * reset the IP.
3933  * Returns 0 on success, negative error code on failure.
3934  */
3935 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3936 {
3937         int i, r = 0;
3938
3939         for (i = 0; i < adev->num_ip_blocks; i++) {
3940                 if (!adev->ip_blocks[i].status.valid)
3941                         continue;
3942                 if (adev->ip_blocks[i].status.hang &&
3943                     adev->ip_blocks[i].version->funcs->soft_reset) {
3944                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3945                         if (r)
3946                                 return r;
3947                 }
3948         }
3949
3950         return 0;
3951 }
3952
3953 /**
3954  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3955  *
3956  * @adev: amdgpu_device pointer
3957  *
3958  * The list of all the hardware IPs that make up the asic is walked and the
3959  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3960  * handles any IP specific hardware or software state changes that are
3961  * necessary after the IP has been soft reset.
3962  * Returns 0 on success, negative error code on failure.
3963  */
3964 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3965 {
3966         int i, r = 0;
3967
3968         for (i = 0; i < adev->num_ip_blocks; i++) {
3969                 if (!adev->ip_blocks[i].status.valid)
3970                         continue;
3971                 if (adev->ip_blocks[i].status.hang &&
3972                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3973                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3974                 if (r)
3975                         return r;
3976         }
3977
3978         return 0;
3979 }
3980
3981 /**
3982  * amdgpu_device_recover_vram - Recover some VRAM contents
3983  *
3984  * @adev: amdgpu_device pointer
3985  *
3986  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3987  * restore things like GPUVM page tables after a GPU reset where
3988  * the contents of VRAM might be lost.
3989  *
3990  * Returns:
3991  * 0 on success, negative error code on failure.
3992  */
3993 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3994 {
3995         struct dma_fence *fence = NULL, *next = NULL;
3996         struct amdgpu_bo *shadow;
3997         long r = 1, tmo;
3998
3999         if (amdgpu_sriov_runtime(adev))
4000                 tmo = msecs_to_jiffies(8000);
4001         else
4002                 tmo = msecs_to_jiffies(100);
4003
4004         dev_info(adev->dev, "recover vram bo from shadow start\n");
4005         mutex_lock(&adev->shadow_list_lock);
4006         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4007
4008                 /* No need to recover an evicted BO */
4009                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4010                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4011                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4012                         continue;
4013
4014                 r = amdgpu_bo_restore_shadow(shadow, &next);
4015                 if (r)
4016                         break;
4017
4018                 if (fence) {
4019                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4020                         dma_fence_put(fence);
4021                         fence = next;
4022                         if (tmo == 0) {
4023                                 r = -ETIMEDOUT;
4024                                 break;
4025                         } else if (tmo < 0) {
4026                                 r = tmo;
4027                                 break;
4028                         }
4029                 } else {
4030                         fence = next;
4031                 }
4032         }
4033         mutex_unlock(&adev->shadow_list_lock);
4034
4035         if (fence)
4036                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4037         dma_fence_put(fence);
4038
4039         if (r < 0 || tmo <= 0) {
4040                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4041                 return -EIO;
4042         }
4043
4044         dev_info(adev->dev, "recover vram bo from shadow done\n");
4045         return 0;
4046 }
4047
4048
4049 /**
4050  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4051  *
4052  * @adev: amdgpu device pointer
4053  * @from_hypervisor: request from hypervisor
4054  *
4055  * do VF FLR and reinitialize Asic
4056  * return 0 means succeeded otherwise failed
4057  */
4058 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4059                                      bool from_hypervisor)
4060 {
4061         int r;
4062
4063         if (from_hypervisor)
4064                 r = amdgpu_virt_request_full_gpu(adev, true);
4065         else
4066                 r = amdgpu_virt_reset_gpu(adev);
4067         if (r)
4068                 return r;
4069
4070         amdgpu_amdkfd_pre_reset(adev);
4071
4072         /* Resume IP prior to SMC */
4073         r = amdgpu_device_ip_reinit_early_sriov(adev);
4074         if (r)
4075                 goto error;
4076
4077         amdgpu_virt_init_data_exchange(adev);
4078         /* we need recover gart prior to run SMC/CP/SDMA resume */
4079         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4080
4081         r = amdgpu_device_fw_loading(adev);
4082         if (r)
4083                 return r;
4084
4085         /* now we are okay to resume SMC/CP/SDMA */
4086         r = amdgpu_device_ip_reinit_late_sriov(adev);
4087         if (r)
4088                 goto error;
4089
4090         amdgpu_irq_gpu_reset_resume_helper(adev);
4091         r = amdgpu_ib_ring_tests(adev);
4092         amdgpu_amdkfd_post_reset(adev);
4093
4094 error:
4095         amdgpu_virt_release_full_gpu(adev, true);
4096         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4097                 amdgpu_inc_vram_lost(adev);
4098                 r = amdgpu_device_recover_vram(adev);
4099         }
4100
4101         return r;
4102 }
4103
4104 /**
4105  * amdgpu_device_has_job_running - check if there is any job in mirror list
4106  *
4107  * @adev: amdgpu device pointer
4108  *
4109  * check if there is any job in mirror list
4110  */
4111 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4112 {
4113         int i;
4114         struct drm_sched_job *job;
4115
4116         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4117                 struct amdgpu_ring *ring = adev->rings[i];
4118
4119                 if (!ring || !ring->sched.thread)
4120                         continue;
4121
4122                 spin_lock(&ring->sched.job_list_lock);
4123                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4124                                 struct drm_sched_job, node);
4125                 spin_unlock(&ring->sched.job_list_lock);
4126                 if (job)
4127                         return true;
4128         }
4129         return false;
4130 }
4131
4132 /**
4133  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4134  *
4135  * @adev: amdgpu device pointer
4136  *
4137  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4138  * a hung GPU.
4139  */
4140 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4141 {
4142         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4143                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4144                 return false;
4145         }
4146
4147         if (amdgpu_gpu_recovery == 0)
4148                 goto disabled;
4149
4150         if (amdgpu_sriov_vf(adev))
4151                 return true;
4152
4153         if (amdgpu_gpu_recovery == -1) {
4154                 switch (adev->asic_type) {
4155                 case CHIP_BONAIRE:
4156                 case CHIP_HAWAII:
4157                 case CHIP_TOPAZ:
4158                 case CHIP_TONGA:
4159                 case CHIP_FIJI:
4160                 case CHIP_POLARIS10:
4161                 case CHIP_POLARIS11:
4162                 case CHIP_POLARIS12:
4163                 case CHIP_VEGAM:
4164                 case CHIP_VEGA20:
4165                 case CHIP_VEGA10:
4166                 case CHIP_VEGA12:
4167                 case CHIP_RAVEN:
4168                 case CHIP_ARCTURUS:
4169                 case CHIP_RENOIR:
4170                 case CHIP_NAVI10:
4171                 case CHIP_NAVI14:
4172                 case CHIP_NAVI12:
4173                 case CHIP_SIENNA_CICHLID:
4174                         break;
4175                 default:
4176                         goto disabled;
4177                 }
4178         }
4179
4180         return true;
4181
4182 disabled:
4183                 dev_info(adev->dev, "GPU recovery disabled.\n");
4184                 return false;
4185 }
4186
4187
4188 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4189                                         struct amdgpu_job *job,
4190                                         bool *need_full_reset_arg)
4191 {
4192         int i, r = 0;
4193         bool need_full_reset  = *need_full_reset_arg;
4194
4195         amdgpu_debugfs_wait_dump(adev);
4196
4197         if (amdgpu_sriov_vf(adev)) {
4198                 /* stop the data exchange thread */
4199                 amdgpu_virt_fini_data_exchange(adev);
4200         }
4201
4202         /* block all schedulers and reset given job's ring */
4203         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4204                 struct amdgpu_ring *ring = adev->rings[i];
4205
4206                 if (!ring || !ring->sched.thread)
4207                         continue;
4208
4209                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4210                 amdgpu_fence_driver_force_completion(ring);
4211         }
4212
4213         if(job)
4214                 drm_sched_increase_karma(&job->base);
4215
4216         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4217         if (!amdgpu_sriov_vf(adev)) {
4218
4219                 if (!need_full_reset)
4220                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4221
4222                 if (!need_full_reset) {
4223                         amdgpu_device_ip_pre_soft_reset(adev);
4224                         r = amdgpu_device_ip_soft_reset(adev);
4225                         amdgpu_device_ip_post_soft_reset(adev);
4226                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4227                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4228                                 need_full_reset = true;
4229                         }
4230                 }
4231
4232                 if (need_full_reset)
4233                         r = amdgpu_device_ip_suspend(adev);
4234
4235                 *need_full_reset_arg = need_full_reset;
4236         }
4237
4238         return r;
4239 }
4240
4241 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4242                                struct list_head *device_list_handle,
4243                                bool *need_full_reset_arg,
4244                                bool skip_hw_reset)
4245 {
4246         struct amdgpu_device *tmp_adev = NULL;
4247         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4248         int r = 0;
4249
4250         /*
4251          * ASIC reset has to be done on all HGMI hive nodes ASAP
4252          * to allow proper links negotiation in FW (within 1 sec)
4253          */
4254         if (!skip_hw_reset && need_full_reset) {
4255                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4256                         /* For XGMI run all resets in parallel to speed up the process */
4257                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4258                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4259                                         r = -EALREADY;
4260                         } else
4261                                 r = amdgpu_asic_reset(tmp_adev);
4262
4263                         if (r) {
4264                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4265                                          r, adev_to_drm(tmp_adev)->unique);
4266                                 break;
4267                         }
4268                 }
4269
4270                 /* For XGMI wait for all resets to complete before proceed */
4271                 if (!r) {
4272                         list_for_each_entry(tmp_adev, device_list_handle,
4273                                             gmc.xgmi.head) {
4274                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4275                                         flush_work(&tmp_adev->xgmi_reset_work);
4276                                         r = tmp_adev->asic_reset_res;
4277                                         if (r)
4278                                                 break;
4279                                 }
4280                         }
4281                 }
4282         }
4283
4284         if (!r && amdgpu_ras_intr_triggered()) {
4285                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4286                         if (tmp_adev->mmhub.funcs &&
4287                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4288                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4289                 }
4290
4291                 amdgpu_ras_intr_cleared();
4292         }
4293
4294         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4295                 if (need_full_reset) {
4296                         /* post card */
4297                         if (amdgpu_device_asic_init(tmp_adev))
4298                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4299
4300                         if (!r) {
4301                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4302                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4303                                 if (r)
4304                                         goto out;
4305
4306                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4307                                 if (vram_lost) {
4308                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4309                                         amdgpu_inc_vram_lost(tmp_adev);
4310                                 }
4311
4312                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4313                                 if (r)
4314                                         goto out;
4315
4316                                 r = amdgpu_device_fw_loading(tmp_adev);
4317                                 if (r)
4318                                         return r;
4319
4320                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4321                                 if (r)
4322                                         goto out;
4323
4324                                 if (vram_lost)
4325                                         amdgpu_device_fill_reset_magic(tmp_adev);
4326
4327                                 /*
4328                                  * Add this ASIC as tracked as reset was already
4329                                  * complete successfully.
4330                                  */
4331                                 amdgpu_register_gpu_instance(tmp_adev);
4332
4333                                 r = amdgpu_device_ip_late_init(tmp_adev);
4334                                 if (r)
4335                                         goto out;
4336
4337                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4338
4339                                 /*
4340                                  * The GPU enters bad state once faulty pages
4341                                  * by ECC has reached the threshold, and ras
4342                                  * recovery is scheduled next. So add one check
4343                                  * here to break recovery if it indeed exceeds
4344                                  * bad page threshold, and remind user to
4345                                  * retire this GPU or setting one bigger
4346                                  * bad_page_threshold value to fix this once
4347                                  * probing driver again.
4348                                  */
4349                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4350                                         /* must succeed. */
4351                                         amdgpu_ras_resume(tmp_adev);
4352                                 } else {
4353                                         r = -EINVAL;
4354                                         goto out;
4355                                 }
4356
4357                                 /* Update PSP FW topology after reset */
4358                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4359                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4360                         }
4361                 }
4362
4363 out:
4364                 if (!r) {
4365                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4366                         r = amdgpu_ib_ring_tests(tmp_adev);
4367                         if (r) {
4368                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4369                                 r = amdgpu_device_ip_suspend(tmp_adev);
4370                                 need_full_reset = true;
4371                                 r = -EAGAIN;
4372                                 goto end;
4373                         }
4374                 }
4375
4376                 if (!r)
4377                         r = amdgpu_device_recover_vram(tmp_adev);
4378                 else
4379                         tmp_adev->asic_reset_res = r;
4380         }
4381
4382 end:
4383         *need_full_reset_arg = need_full_reset;
4384         return r;
4385 }
4386
4387 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4388                                 struct amdgpu_hive_info *hive)
4389 {
4390         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4391                 return false;
4392
4393         if (hive) {
4394                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4395         } else {
4396                 down_write(&adev->reset_sem);
4397         }
4398
4399         atomic_inc(&adev->gpu_reset_counter);
4400         switch (amdgpu_asic_reset_method(adev)) {
4401         case AMD_RESET_METHOD_MODE1:
4402                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4403                 break;
4404         case AMD_RESET_METHOD_MODE2:
4405                 adev->mp1_state = PP_MP1_STATE_RESET;
4406                 break;
4407         default:
4408                 adev->mp1_state = PP_MP1_STATE_NONE;
4409                 break;
4410         }
4411
4412         return true;
4413 }
4414
4415 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4416 {
4417         amdgpu_vf_error_trans_all(adev);
4418         adev->mp1_state = PP_MP1_STATE_NONE;
4419         atomic_set(&adev->in_gpu_reset, 0);
4420         up_write(&adev->reset_sem);
4421 }
4422
4423 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4424 {
4425         struct pci_dev *p = NULL;
4426
4427         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4428                         adev->pdev->bus->number, 1);
4429         if (p) {
4430                 pm_runtime_enable(&(p->dev));
4431                 pm_runtime_resume(&(p->dev));
4432         }
4433 }
4434
4435 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4436 {
4437         enum amd_reset_method reset_method;
4438         struct pci_dev *p = NULL;
4439         u64 expires;
4440
4441         /*
4442          * For now, only BACO and mode1 reset are confirmed
4443          * to suffer the audio issue without proper suspended.
4444          */
4445         reset_method = amdgpu_asic_reset_method(adev);
4446         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4447              (reset_method != AMD_RESET_METHOD_MODE1))
4448                 return -EINVAL;
4449
4450         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4451                         adev->pdev->bus->number, 1);
4452         if (!p)
4453                 return -ENODEV;
4454
4455         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4456         if (!expires)
4457                 /*
4458                  * If we cannot get the audio device autosuspend delay,
4459                  * a fixed 4S interval will be used. Considering 3S is
4460                  * the audio controller default autosuspend delay setting.
4461                  * 4S used here is guaranteed to cover that.
4462                  */
4463                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4464
4465         while (!pm_runtime_status_suspended(&(p->dev))) {
4466                 if (!pm_runtime_suspend(&(p->dev)))
4467                         break;
4468
4469                 if (expires < ktime_get_mono_fast_ns()) {
4470                         dev_warn(adev->dev, "failed to suspend display audio\n");
4471                         /* TODO: abort the succeeding gpu reset? */
4472                         return -ETIMEDOUT;
4473                 }
4474         }
4475
4476         pm_runtime_disable(&(p->dev));
4477
4478         return 0;
4479 }
4480
4481 /**
4482  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4483  *
4484  * @adev: amdgpu device pointer
4485  * @job: which job trigger hang
4486  *
4487  * Attempt to reset the GPU if it has hung (all asics).
4488  * Attempt to do soft-reset or full-reset and reinitialize Asic
4489  * Returns 0 for success or an error on failure.
4490  */
4491
4492 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4493                               struct amdgpu_job *job)
4494 {
4495         struct list_head device_list, *device_list_handle =  NULL;
4496         bool need_full_reset = false;
4497         bool job_signaled = false;
4498         struct amdgpu_hive_info *hive = NULL;
4499         struct amdgpu_device *tmp_adev = NULL;
4500         int i, r = 0;
4501         bool need_emergency_restart = false;
4502         bool audio_suspended = false;
4503
4504         /**
4505          * Special case: RAS triggered and full reset isn't supported
4506          */
4507         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4508
4509         /*
4510          * Flush RAM to disk so that after reboot
4511          * the user can read log and see why the system rebooted.
4512          */
4513         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4514                 DRM_WARN("Emergency reboot.");
4515
4516                 ksys_sync_helper();
4517                 emergency_restart();
4518         }
4519
4520         dev_info(adev->dev, "GPU %s begin!\n",
4521                 need_emergency_restart ? "jobs stop":"reset");
4522
4523         /*
4524          * Here we trylock to avoid chain of resets executing from
4525          * either trigger by jobs on different adevs in XGMI hive or jobs on
4526          * different schedulers for same device while this TO handler is running.
4527          * We always reset all schedulers for device and all devices for XGMI
4528          * hive so that should take care of them too.
4529          */
4530         hive = amdgpu_get_xgmi_hive(adev);
4531         if (hive) {
4532                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4533                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4534                                 job ? job->base.id : -1, hive->hive_id);
4535                         amdgpu_put_xgmi_hive(hive);
4536                         return 0;
4537                 }
4538                 mutex_lock(&hive->hive_lock);
4539         }
4540
4541         /*
4542          * Build list of devices to reset.
4543          * In case we are in XGMI hive mode, resort the device list
4544          * to put adev in the 1st position.
4545          */
4546         INIT_LIST_HEAD(&device_list);
4547         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4548                 if (!hive)
4549                         return -ENODEV;
4550                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4551                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4552                 device_list_handle = &hive->device_list;
4553         } else {
4554                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4555                 device_list_handle = &device_list;
4556         }
4557
4558         /* block all schedulers and reset given job's ring */
4559         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4560                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4561                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4562                                   job ? job->base.id : -1);
4563                         r = 0;
4564                         goto skip_recovery;
4565                 }
4566
4567                 /*
4568                  * Try to put the audio codec into suspend state
4569                  * before gpu reset started.
4570                  *
4571                  * Due to the power domain of the graphics device
4572                  * is shared with AZ power domain. Without this,
4573                  * we may change the audio hardware from behind
4574                  * the audio driver's back. That will trigger
4575                  * some audio codec errors.
4576                  */
4577                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4578                         audio_suspended = true;
4579
4580                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4581
4582                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4583
4584                 if (!amdgpu_sriov_vf(tmp_adev))
4585                         amdgpu_amdkfd_pre_reset(tmp_adev);
4586
4587                 /*
4588                  * Mark these ASICs to be reseted as untracked first
4589                  * And add them back after reset completed
4590                  */
4591                 amdgpu_unregister_gpu_instance(tmp_adev);
4592
4593                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4594
4595                 /* disable ras on ALL IPs */
4596                 if (!need_emergency_restart &&
4597                       amdgpu_device_ip_need_full_reset(tmp_adev))
4598                         amdgpu_ras_suspend(tmp_adev);
4599
4600                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4601                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4602
4603                         if (!ring || !ring->sched.thread)
4604                                 continue;
4605
4606                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4607
4608                         if (need_emergency_restart)
4609                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4610                 }
4611         }
4612
4613         if (need_emergency_restart)
4614                 goto skip_sched_resume;
4615
4616         /*
4617          * Must check guilty signal here since after this point all old
4618          * HW fences are force signaled.
4619          *
4620          * job->base holds a reference to parent fence
4621          */
4622         if (job && job->base.s_fence->parent &&
4623             dma_fence_is_signaled(job->base.s_fence->parent)) {
4624                 job_signaled = true;
4625                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4626                 goto skip_hw_reset;
4627         }
4628
4629 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4630         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4631                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4632                                                  (tmp_adev == adev) ? job : NULL,
4633                                                  &need_full_reset);
4634                 /*TODO Should we stop ?*/
4635                 if (r) {
4636                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4637                                   r, adev_to_drm(tmp_adev)->unique);
4638                         tmp_adev->asic_reset_res = r;
4639                 }
4640         }
4641
4642         /* Actual ASIC resets if needed.*/
4643         /* TODO Implement XGMI hive reset logic for SRIOV */
4644         if (amdgpu_sriov_vf(adev)) {
4645                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4646                 if (r)
4647                         adev->asic_reset_res = r;
4648         } else {
4649                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4650                 if (r && r == -EAGAIN)
4651                         goto retry;
4652         }
4653
4654 skip_hw_reset:
4655
4656         /* Post ASIC reset for all devs .*/
4657         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4658
4659                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4660                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4661
4662                         if (!ring || !ring->sched.thread)
4663                                 continue;
4664
4665                         /* No point to resubmit jobs if we didn't HW reset*/
4666                         if (!tmp_adev->asic_reset_res && !job_signaled)
4667                                 drm_sched_resubmit_jobs(&ring->sched);
4668
4669                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4670                 }
4671
4672                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4673                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4674                 }
4675
4676                 tmp_adev->asic_reset_res = 0;
4677
4678                 if (r) {
4679                         /* bad news, how to tell it to userspace ? */
4680                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4681                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4682                 } else {
4683                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4684                 }
4685         }
4686
4687 skip_sched_resume:
4688         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4689                 /*unlock kfd: SRIOV would do it separately */
4690                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4691                         amdgpu_amdkfd_post_reset(tmp_adev);
4692                 if (audio_suspended)
4693                         amdgpu_device_resume_display_audio(tmp_adev);
4694                 amdgpu_device_unlock_adev(tmp_adev);
4695         }
4696
4697 skip_recovery:
4698         if (hive) {
4699                 atomic_set(&hive->in_reset, 0);
4700                 mutex_unlock(&hive->hive_lock);
4701                 amdgpu_put_xgmi_hive(hive);
4702         }
4703
4704         if (r)
4705                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4706         return r;
4707 }
4708
/**
 * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIe capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIe config space may not be available.
 */
4718 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4719 {
4720         struct pci_dev *pdev;
4721         enum pci_bus_speed speed_cap, platform_speed_cap;
4722         enum pcie_link_width platform_link_width;
4723
4724         if (amdgpu_pcie_gen_cap)
4725                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4726
4727         if (amdgpu_pcie_lane_cap)
4728                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4729
4730         /* covers APUs as well */
4731         if (pci_is_root_bus(adev->pdev->bus)) {
4732                 if (adev->pm.pcie_gen_mask == 0)
4733                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4734                 if (adev->pm.pcie_mlw_mask == 0)
4735                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4736                 return;
4737         }
4738
4739         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4740                 return;
4741
4742         pcie_bandwidth_available(adev->pdev, NULL,
4743                                  &platform_speed_cap, &platform_link_width);
4744
4745         if (adev->pm.pcie_gen_mask == 0) {
4746                 /* asic caps */
4747                 pdev = adev->pdev;
4748                 speed_cap = pcie_get_speed_cap(pdev);
4749                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4750                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4751                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4752                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4753                 } else {
4754                         if (speed_cap == PCIE_SPEED_16_0GT)
4755                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4756                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4757                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4758                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4759                         else if (speed_cap == PCIE_SPEED_8_0GT)
4760                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4761                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4762                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4763                         else if (speed_cap == PCIE_SPEED_5_0GT)
4764                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4765                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4766                         else
4767                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4768                 }
4769                 /* platform caps */
4770                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4771                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4772                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4773                 } else {
4774                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4775                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4776                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4777                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4778                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4779                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4780                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4781                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4782                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4783                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4784                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4785                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4786                         else
4787                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4788
4789                 }
4790         }
4791         if (adev->pm.pcie_mlw_mask == 0) {
4792                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4793                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4794                 } else {
4795                         switch (platform_link_width) {
4796                         case PCIE_LNK_X32:
4797                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4798                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4799                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4800                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4801                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4802                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4803                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4804                                 break;
4805                         case PCIE_LNK_X16:
4806                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4807                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4808                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4809                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4810                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4811                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4812                                 break;
4813                         case PCIE_LNK_X12:
4814                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4815                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4816                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4817                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4818                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4819                                 break;
4820                         case PCIE_LNK_X8:
4821                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4822                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4823                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4824                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4825                                 break;
4826                         case PCIE_LNK_X4:
4827                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4828                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4829                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4830                                 break;
4831                         case PCIE_LNK_X2:
4832                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4833                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4834                                 break;
4835                         case PCIE_LNK_X1:
4836                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4837                                 break;
4838                         default:
4839                                 break;
4840                         }
4841                 }
4842         }
4843 }
4844
4845 int amdgpu_device_baco_enter(struct drm_device *dev)
4846 {
4847         struct amdgpu_device *adev = drm_to_adev(dev);
4848         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4849
4850         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4851                 return -ENOTSUPP;
4852
4853         if (ras && ras->supported)
4854                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4855
4856         return amdgpu_dpm_baco_enter(adev);
4857 }
4858
4859 int amdgpu_device_baco_exit(struct drm_device *dev)
4860 {
4861         struct amdgpu_device *adev = drm_to_adev(dev);
4862         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4863         int ret = 0;
4864
4865         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4866                 return -ENOTSUPP;
4867
4868         ret = amdgpu_dpm_baco_exit(adev);
4869         if (ret)
4870                 return ret;
4871
4872         if (ras && ras->supported)
4873                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4874
4875         return 0;
4876 }
4877
4878 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4879 {
4880         int i;
4881
4882         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4883                 struct amdgpu_ring *ring = adev->rings[i];
4884
4885                 if (!ring || !ring->sched.thread)
4886                         continue;
4887
4888                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4889         }
4890 }
4891
4892 /**
4893  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4894  * @pdev: PCI device struct
4895  * @state: PCI channel state
4896  *
4897  * Description: Called when a PCI error is detected.
4898  *
4899  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4900  */
4901 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4902 {
4903         struct drm_device *dev = pci_get_drvdata(pdev);
4904         struct amdgpu_device *adev = drm_to_adev(dev);
4905         int i;
4906
4907         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4908
4909         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4910                 DRM_WARN("No support for XGMI hive yet...");
4911                 return PCI_ERS_RESULT_DISCONNECT;
4912         }
4913
4914         switch (state) {
4915         case pci_channel_io_normal:
4916                 return PCI_ERS_RESULT_CAN_RECOVER;
4917         /* Fatal error, prepare for slot reset */
4918         case pci_channel_io_frozen:
4919                 /*
4920                  * Cancel and wait for all TDRs in progress if failing to
4921                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4922                  *
4923                  * Locking adev->reset_sem will prevent any external access
4924                  * to GPU during PCI error recovery
4925                  */
4926                 while (!amdgpu_device_lock_adev(adev, NULL))
4927                         amdgpu_cancel_all_tdr(adev);
4928
4929                 /*
4930                  * Block any work scheduling as we do for regular GPU reset
4931                  * for the duration of the recovery
4932                  */
4933                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4934                         struct amdgpu_ring *ring = adev->rings[i];
4935
4936                         if (!ring || !ring->sched.thread)
4937                                 continue;
4938
4939                         drm_sched_stop(&ring->sched, NULL);
4940                 }
4941                 return PCI_ERS_RESULT_NEED_RESET;
4942         case pci_channel_io_perm_failure:
4943                 /* Permanent error, prepare for device removal */
4944                 return PCI_ERS_RESULT_DISCONNECT;
4945         }
4946
4947         return PCI_ERS_RESULT_NEED_RESET;
4948 }
4949
4950 /**
4951  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4952  * @pdev: pointer to PCI device
4953  */
4954 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4955 {
4956
4957         DRM_INFO("PCI error: mmio enabled callback!!\n");
4958
4959         /* TODO - dump whatever for debugging purposes */
4960
4961         /* This called only if amdgpu_pci_error_detected returns
4962          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4963          * works, no need to reset slot.
4964          */
4965
4966         return PCI_ERS_RESULT_RECOVERED;
4967 }
4968
4969 /**
4970  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4971  * @pdev: PCI device struct
4972  *
4973  * Description: This routine is called by the pci error recovery
4974  * code after the PCI slot has been reset, just before we
4975  * should resume normal operations.
4976  */
4977 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4978 {
4979         struct drm_device *dev = pci_get_drvdata(pdev);
4980         struct amdgpu_device *adev = drm_to_adev(dev);
4981         int r, i;
4982         bool need_full_reset = true;
4983         u32 memsize;
4984         struct list_head device_list;
4985
4986         DRM_INFO("PCI error: slot reset callback!!\n");
4987
4988         INIT_LIST_HEAD(&device_list);
4989         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4990
4991         /* wait for asic to come out of reset */
4992         msleep(500);
4993
4994         /* Restore PCI confspace */
4995         amdgpu_device_load_pci_state(pdev);
4996
4997         /* confirm  ASIC came out of reset */
4998         for (i = 0; i < adev->usec_timeout; i++) {
4999                 memsize = amdgpu_asic_get_config_memsize(adev);
5000
5001                 if (memsize != 0xffffffff)
5002                         break;
5003                 udelay(1);
5004         }
5005         if (memsize == 0xffffffff) {
5006                 r = -ETIME;
5007                 goto out;
5008         }
5009
5010         adev->in_pci_err_recovery = true;
5011         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5012         adev->in_pci_err_recovery = false;
5013         if (r)
5014                 goto out;
5015
5016         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5017
5018 out:
5019         if (!r) {
5020                 if (amdgpu_device_cache_pci_state(adev->pdev))
5021                         pci_restore_state(adev->pdev);
5022
5023                 DRM_INFO("PCIe error recovery succeeded\n");
5024         } else {
5025                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5026                 amdgpu_device_unlock_adev(adev);
5027         }
5028
5029         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5030 }
5031
5032 /**
5033  * amdgpu_pci_resume() - resume normal ops after PCI reset
5034  * @pdev: pointer to PCI device
5035  *
5036  * Called when the error recovery driver tells us that its
5037  * OK to resume normal operation. Use completion to allow
5038  * halted scsi ops to resume.
5039  */
5040 void amdgpu_pci_resume(struct pci_dev *pdev)
5041 {
5042         struct drm_device *dev = pci_get_drvdata(pdev);
5043         struct amdgpu_device *adev = drm_to_adev(dev);
5044         int i;
5045
5046
5047         DRM_INFO("PCI error: resume callback!!\n");
5048
5049         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5050                 struct amdgpu_ring *ring = adev->rings[i];
5051
5052                 if (!ring || !ring->sched.thread)
5053                         continue;
5054
5055
5056                 drm_sched_resubmit_jobs(&ring->sched);
5057                 drm_sched_start(&ring->sched, true);
5058         }
5059
5060         amdgpu_device_unlock_adev(adev);
5061 }
5062
5063 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5064 {
5065         struct drm_device *dev = pci_get_drvdata(pdev);
5066         struct amdgpu_device *adev = drm_to_adev(dev);
5067         int r;
5068
5069         r = pci_save_state(pdev);
5070         if (!r) {
5071                 kfree(adev->pci_state);
5072
5073                 adev->pci_state = pci_store_saved_state(pdev);
5074
5075                 if (!adev->pci_state) {
5076                         DRM_ERROR("Failed to store PCI saved state");
5077                         return false;
5078                 }
5079         } else {
5080                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5081                 return false;
5082         }
5083
5084         return true;
5085 }
5086
5087 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5088 {
5089         struct drm_device *dev = pci_get_drvdata(pdev);
5090         struct amdgpu_device *adev = drm_to_adev(dev);
5091         int r;
5092
5093         if (!adev->pci_state)
5094                 return false;
5095
5096         r = pci_load_saved_state(pdev, adev->pci_state);
5097
5098         if (!r) {
5099                 pci_restore_state(pdev);
5100         } else {
5101                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5102                 return false;
5103         }
5104
5105         return true;
5106 }
5107
5108