drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
  73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
  80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
  81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
  84 MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");
  85
  86 #define AMDGPU_RESUME_MS                2000
  87
  88 const char *amdgpu_asic_name[] = {
  89         "TAHITI",
  90         "PITCAIRN",
  91         "VERDE",
  92         "OLAND",
  93         "HAINAN",
  94         "BONAIRE",
  95         "KAVERI",
  96         "KABINI",
  97         "HAWAII",
  98         "MULLINS",
  99         "TOPAZ",
 100         "TONGA",
 101         "FIJI",
 102         "CARRIZO",
 103         "STONEY",
 104         "POLARIS10",
 105         "POLARIS11",
 106         "POLARIS12",
 107         "VEGAM",
 108         "VEGA10",
 109         "VEGA12",
 110         "VEGA20",
 111         "RAVEN",
 112         "ARCTURUS",
 113         "RENOIR",
 114         "NAVI10",
 115         "NAVI14",
 116         "NAVI12",
 117         "SIENNA_CICHLID",
 118         "NAVY_FLOUNDER",
 119         "VANGOGH",
 120         "DIMGREY_CAVEFISH",
 121         "LAST",
 122 };
 123
 124 /**
 125  * DOC: pcie_replay_count
 126  *
 127  * The amdgpu driver provides a sysfs API for reporting the total number
 128  * of PCIe replays (NAKs)
 129  * The file pcie_replay_count is used for this and returns the total
 130  * number of replays as a sum of the NAKs generated and NAKs received
 131  */
 132
 133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 134                 struct device_attribute *attr, char *buf)
 135 {
 136         struct drm_device *ddev = dev_get_drvdata(dev);
 137         struct amdgpu_device *adev = drm_to_adev(ddev);
 138         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 139
 140         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 141 }
 142
 143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 144                 amdgpu_device_get_pcie_replay_count, NULL);
 145
 146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 147
 148 /**
 149  * DOC: product_name
 150  *
 151  * The amdgpu driver provides a sysfs API for reporting the product name
 152  * for the device
 153  * The file serial_number is used for this and returns the product name
 154  * as returned from the FRU.
 155  * NOTE: This is only available for certain server cards
 156  */
 157
 158 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 159                 struct device_attribute *attr, char *buf)
 160 {
 161         struct drm_device *ddev = dev_get_drvdata(dev);
 162         struct amdgpu_device *adev = drm_to_adev(ddev);
 163
 164         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 165 }
 166
 167 static DEVICE_ATTR(product_name, S_IRUGO,
 168                 amdgpu_device_get_product_name, NULL);
 169
 170 /**
 171  * DOC: product_number
 172  *
 173  * The amdgpu driver provides a sysfs API for reporting the part number
 174  * for the device
 175  * The file serial_number is used for this and returns the part number
 176  * as returned from the FRU.
 177  * NOTE: This is only available for certain server cards
 178  */
 179
 180 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 181                 struct device_attribute *attr, char *buf)
 182 {
 183         struct drm_device *ddev = dev_get_drvdata(dev);
 184         struct amdgpu_device *adev = drm_to_adev(ddev);
 185
 186         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 187 }
 188
 189 static DEVICE_ATTR(product_number, S_IRUGO,
 190                 amdgpu_device_get_product_number, NULL);
 191
 192 /**
 193  * DOC: serial_number
 194  *
 195  * The amdgpu driver provides a sysfs API for reporting the serial number
 196  * for the device
 197  * The file serial_number is used for this and returns the serial number
 198  * as returned from the FRU.
 199  * NOTE: This is only available for certain server cards
 200  */
 201
 202 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 203                 struct device_attribute *attr, char *buf)
 204 {
 205         struct drm_device *ddev = dev_get_drvdata(dev);
 206         struct amdgpu_device *adev = drm_to_adev(ddev);
 207
 208         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 209 }
 210
 211 static DEVICE_ATTR(serial_number, S_IRUGO,
 212                 amdgpu_device_get_serial_number, NULL);
 213
 214 /**
 215  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 216  *
 217  * @dev: drm_device pointer
 218  *
 219  * Returns true if the device is a dGPU with HG/PX power control,
 220  * otherwise return false.
 221  */
 222 bool amdgpu_device_supports_boco(struct drm_device *dev)
 223 {
 224         struct amdgpu_device *adev = drm_to_adev(dev);
 225
 226         if (adev->flags & AMD_IS_PX)
 227                 return true;
 228         return false;
 229 }
 230
 231 /**
 232  * amdgpu_device_supports_baco - Does the device support BACO
 233  *
 234  * @dev: drm_device pointer
 235  *
 236  * Returns true if the device supporte BACO,
 237  * otherwise return false.
 238  */
 239 bool amdgpu_device_supports_baco(struct drm_device *dev)
 240 {
 241         struct amdgpu_device *adev = drm_to_adev(dev);
 242
 243         return amdgpu_asic_supports_baco(adev);
 244 }
 245
 246 /*
 247  * VRAM access helper functions
 248  */
 249
 250 /**
 251  * amdgpu_device_vram_access - read/write a buffer in vram
 252  *
 253  * @adev: amdgpu_device pointer
 254  * @pos: offset of the buffer in vram
 255  * @buf: virtual address of the buffer in system memory
 256  * @size: read/write size, sizeof(@buf) must > @size
 257  * @write: true - write to vram, otherwise - read from vram
 258  */
 259 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 260                                uint32_t *buf, size_t size, bool write)
 261 {
 262         unsigned long flags;
 263         uint32_t hi = ~0;
 264         uint64_t last;
 265
 266
 267 #ifdef CONFIG_64BIT
 268         last = min(pos + size, adev->gmc.visible_vram_size);
 269         if (last > pos) {
 270                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 271                 size_t count = last - pos;
 272
 273                 if (write) {
 274                         memcpy_toio(addr, buf, count);
 275                         mb();
 276                         amdgpu_asic_flush_hdp(adev, NULL);
 277                 } else {
 278                         amdgpu_asic_invalidate_hdp(adev, NULL);
 279                         mb();
 280                         memcpy_fromio(buf, addr, count);
 281                 }
 282
 283                 if (count == size)
 284                         return;
 285
 286                 pos += count;
 287                 buf += count / 4;
 288                 size -= count;
 289         }
 290 #endif
 291
 292         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 293         for (last = pos + size; pos < last; pos += 4) {
 294                 uint32_t tmp = pos >> 31;
 295
 296                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 297                 if (tmp != hi) {
 298                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 299                         hi = tmp;
 300                 }
 301                 if (write)
 302                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 303                 else
 304                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 305         }
 306         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 307 }
 308
 309 /*
 310  * register access helper functions.
 311  */
 312 /**
 313  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 314  *
 315  * @adev: amdgpu_device pointer
 316  * @reg: dword aligned register offset
 317  * @acc_flags: access flags which require special behavior
 318  *
 319  * Returns the 32 bit value from the offset specified.
 320  */
 321 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 322                             uint32_t reg, uint32_t acc_flags)
 323 {
 324         uint32_t ret;
 325
 326         if (adev->in_pci_err_recovery)
 327                 return 0;
 328
 329         if ((reg * 4) < adev->rmmio_size) {
 330                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 331                     amdgpu_sriov_runtime(adev) &&
 332                     down_read_trylock(&adev->reset_sem)) {
 333                         ret = amdgpu_kiq_rreg(adev, reg);
 334                         up_read(&adev->reset_sem);
 335                 } else {
 336                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 337                 }
 338         } else {
 339                 ret = adev->pcie_rreg(adev, reg * 4);
 340         }
 341
 342         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 343
 344         return ret;
 345 }
 346
 347 /*
 348  * MMIO register read with bytes helper functions
 349  * @offset:bytes offset from MMIO start
 350  *
 351 */
 352
 353 /**
 354  * amdgpu_mm_rreg8 - read a memory mapped IO register
 355  *
 356  * @adev: amdgpu_device pointer
 357  * @offset: byte aligned register offset
 358  *
 359  * Returns the 8 bit value from the offset specified.
 360  */
 361 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 362 {
 363         if (adev->in_pci_err_recovery)
 364                 return 0;
 365
 366         if (offset < adev->rmmio_size)
 367                 return (readb(adev->rmmio + offset));
 368         BUG();
 369 }
 370
 371 /*
 372  * MMIO register write with bytes helper functions
 373  * @offset:bytes offset from MMIO start
 374  * @value: the value want to be written to the register
 375  *
 376 */
 377 /**
 378  * amdgpu_mm_wreg8 - read a memory mapped IO register
 379  *
 380  * @adev: amdgpu_device pointer
 381  * @offset: byte aligned register offset
 382  * @value: 8 bit value to write
 383  *
 384  * Writes the value specified to the offset specified.
 385  */
 386 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 387 {
 388         if (adev->in_pci_err_recovery)
 389                 return;
 390
 391         if (offset < adev->rmmio_size)
 392                 writeb(value, adev->rmmio + offset);
 393         else
 394                 BUG();
 395 }
 396
 397 /**
 398  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 399  *
 400  * @adev: amdgpu_device pointer
 401  * @reg: dword aligned register offset
 402  * @v: 32 bit value to write to the register
 403  * @acc_flags: access flags which require special behavior
 404  *
 405  * Writes the value specified to the offset specified.
 406  */
 407 void amdgpu_device_wreg(struct amdgpu_device *adev,
 408                         uint32_t reg, uint32_t v,
 409                         uint32_t acc_flags)
 410 {
 411         if (adev->in_pci_err_recovery)
 412                 return;
 413
 414         if ((reg * 4) < adev->rmmio_size) {
 415                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 416                     amdgpu_sriov_runtime(adev) &&
 417                     down_read_trylock(&adev->reset_sem)) {
 418                         amdgpu_kiq_wreg(adev, reg, v);
 419                         up_read(&adev->reset_sem);
 420                 } else {
 421                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 422                 }
 423         } else {
 424                 adev->pcie_wreg(adev, reg * 4, v);
 425         }
 426
 427         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 428 }
 429
 430 /*
 431  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 432  *
 433  * this function is invoked only the debugfs register access
 434  * */
 435 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 436                              uint32_t reg, uint32_t v)
 437 {
 438         if (adev->in_pci_err_recovery)
 439                 return;
 440
 441         if (amdgpu_sriov_fullaccess(adev) &&
 442             adev->gfx.rlc.funcs &&
 443             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 444                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 445                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 446         } else {
 447                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 448         }
 449 }
 450
 451 /**
 452  * amdgpu_io_rreg - read an IO register
 453  *
 454  * @adev: amdgpu_device pointer
 455  * @reg: dword aligned register offset
 456  *
 457  * Returns the 32 bit value from the offset specified.
 458  */
 459 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 460 {
 461         if (adev->in_pci_err_recovery)
 462                 return 0;
 463
 464         if ((reg * 4) < adev->rio_mem_size)
 465                 return ioread32(adev->rio_mem + (reg * 4));
 466         else {
 467                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 468                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 469         }
 470 }
 471
 472 /**
 473  * amdgpu_io_wreg - write to an IO register
 474  *
 475  * @adev: amdgpu_device pointer
 476  * @reg: dword aligned register offset
 477  * @v: 32 bit value to write to the register
 478  *
 479  * Writes the value specified to the offset specified.
 480  */
 481 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 482 {
 483         if (adev->in_pci_err_recovery)
 484                 return;
 485
 486         if ((reg * 4) < adev->rio_mem_size)
 487                 iowrite32(v, adev->rio_mem + (reg * 4));
 488         else {
 489                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 490                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 491         }
 492 }
 493
 494 /**
 495  * amdgpu_mm_rdoorbell - read a doorbell dword
 496  *
 497  * @adev: amdgpu_device pointer
 498  * @index: doorbell index
 499  *
 500  * Returns the value in the doorbell aperture at the
 501  * requested doorbell index (CIK).
 502  */
 503 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 504 {
 505         if (adev->in_pci_err_recovery)
 506                 return 0;
 507
 508         if (index < adev->doorbell.num_doorbells) {
 509                 return readl(adev->doorbell.ptr + index);
 510         } else {
 511                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 512                 return 0;
 513         }
 514 }
 515
 516 /**
 517  * amdgpu_mm_wdoorbell - write a doorbell dword
 518  *
 519  * @adev: amdgpu_device pointer
 520  * @index: doorbell index
 521  * @v: value to write
 522  *
 523  * Writes @v to the doorbell aperture at the
 524  * requested doorbell index (CIK).
 525  */
 526 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 527 {
 528         if (adev->in_pci_err_recovery)
 529                 return;
 530
 531         if (index < adev->doorbell.num_doorbells) {
 532                 writel(v, adev->doorbell.ptr + index);
 533         } else {
 534                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 535         }
 536 }
 537
 538 /**
 539  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 540  *
 541  * @adev: amdgpu_device pointer
 542  * @index: doorbell index
 543  *
 544  * Returns the value in the doorbell aperture at the
 545  * requested doorbell index (VEGA10+).
 546  */
 547 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 548 {
 549         if (adev->in_pci_err_recovery)
 550                 return 0;
 551
 552         if (index < adev->doorbell.num_doorbells) {
 553                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 554         } else {
 555                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 556                 return 0;
 557         }
 558 }
 559
 560 /**
 561  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 562  *
 563  * @adev: amdgpu_device pointer
 564  * @index: doorbell index
 565  * @v: value to write
 566  *
 567  * Writes @v to the doorbell aperture at the
 568  * requested doorbell index (VEGA10+).
 569  */
 570 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 571 {
 572         if (adev->in_pci_err_recovery)
 573                 return;
 574
 575         if (index < adev->doorbell.num_doorbells) {
 576                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 577         } else {
 578                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 579         }
 580 }
 581
 582 /**
 583  * amdgpu_device_indirect_rreg - read an indirect register
 584  *
 585  * @adev: amdgpu_device pointer
 586  * @pcie_index: mmio register offset
 587  * @pcie_data: mmio register offset
 588  *
 589  * Returns the value of indirect register @reg_addr
 590  */
 591 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 592                                 u32 pcie_index, u32 pcie_data,
 593                                 u32 reg_addr)
 594 {
 595         unsigned long flags;
 596         u32 r;
 597         void __iomem *pcie_index_offset;
 598         void __iomem *pcie_data_offset;
 599
 600         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 601         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 602         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 603
 604         writel(reg_addr, pcie_index_offset);
 605         readl(pcie_index_offset);
 606         r = readl(pcie_data_offset);
 607         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 608
 609         return r;
 610 }
 611
 612 /**
 613  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 614  *
 615  * @adev: amdgpu_device pointer
 616  * @pcie_index: mmio register offset
 617  * @pcie_data: mmio register offset
 618  *
 619  * Returns the value of indirect register @reg_addr
 620  */
 621 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 622                                   u32 pcie_index, u32 pcie_data,
 623                                   u32 reg_addr)
 624 {
 625         unsigned long flags;
 626         u64 r;
 627         void __iomem *pcie_index_offset;
 628         void __iomem *pcie_data_offset;
 629
 630         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 631         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 632         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 633
 634         /* read low 32 bits */
 635         writel(reg_addr, pcie_index_offset);
 636         readl(pcie_index_offset);
 637         r = readl(pcie_data_offset);
 638         /* read high 32 bits */
 639         writel(reg_addr + 4, pcie_index_offset);
 640         readl(pcie_index_offset);
 641         r |= ((u64)readl(pcie_data_offset) << 32);
 642         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 643
 644         return r;
 645 }
 646
 647 /**
 648  * amdgpu_device_indirect_wreg - write an indirect register address
 649  *
 650  * @adev: amdgpu_device pointer
 651  * @pcie_index: mmio register offset
 652  * @pcie_data: mmio register offset
 653  * @reg_addr: indirect register offset
 654  * @reg_data: indirect register data
 655  *
 656  */
 657 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 658                                  u32 pcie_index, u32 pcie_data,
 659                                  u32 reg_addr, u32 reg_data)
 660 {
 661         unsigned long flags;
 662         void __iomem *pcie_index_offset;
 663         void __iomem *pcie_data_offset;
 664
 665         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 666         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 667         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 668
 669         writel(reg_addr, pcie_index_offset);
 670         readl(pcie_index_offset);
 671         writel(reg_data, pcie_data_offset);
 672         readl(pcie_data_offset);
 673         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 674 }
 675
 676 /**
 677  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 678  *
 679  * @adev: amdgpu_device pointer
 680  * @pcie_index: mmio register offset
 681  * @pcie_data: mmio register offset
 682  * @reg_addr: indirect register offset
 683  * @reg_data: indirect register data
 684  *
 685  */
 686 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 687                                    u32 pcie_index, u32 pcie_data,
 688                                    u32 reg_addr, u64 reg_data)
 689 {
 690         unsigned long flags;
 691         void __iomem *pcie_index_offset;
 692         void __iomem *pcie_data_offset;
 693
 694         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 695         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 696         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 697
 698         /* write low 32 bits */
 699         writel(reg_addr, pcie_index_offset);
 700         readl(pcie_index_offset);
 701         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 702         readl(pcie_data_offset);
 703         /* write high 32 bits */
 704         writel(reg_addr + 4, pcie_index_offset);
 705         readl(pcie_index_offset);
 706         writel((u32)(reg_data >> 32), pcie_data_offset);
 707         readl(pcie_data_offset);
 708         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 709 }
 710
 711 /**
 712  * amdgpu_invalid_rreg - dummy reg read function
 713  *
 714  * @adev: amdgpu_device pointer
 715  * @reg: offset of register
 716  *
 717  * Dummy register read function.  Used for register blocks
 718  * that certain asics don't have (all asics).
 719  * Returns the value in the register.
 720  */
 721 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 722 {
 723         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 724         BUG();
 725         return 0;
 726 }
 727
 728 /**
 729  * amdgpu_invalid_wreg - dummy reg write function
 730  *
 731  * @adev: amdgpu_device pointer
 732  * @reg: offset of register
 733  * @v: value to write to the register
 734  *
 735  * Dummy register read function.  Used for register blocks
 736  * that certain asics don't have (all asics).
 737  */
 738 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 739 {
 740         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 741                   reg, v);
 742         BUG();
 743 }
 744
 745 /**
 746  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 747  *
 748  * @adev: amdgpu_device pointer
 749  * @reg: offset of register
 750  *
 751  * Dummy register read function.  Used for register blocks
 752  * that certain asics don't have (all asics).
 753  * Returns the value in the register.
 754  */
 755 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 756 {
 757         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 758         BUG();
 759         return 0;
 760 }
 761
 762 /**
 763  * amdgpu_invalid_wreg64 - dummy reg write function
 764  *
 765  * @adev: amdgpu_device pointer
 766  * @reg: offset of register
 767  * @v: value to write to the register
 768  *
 769  * Dummy register read function.  Used for register blocks
 770  * that certain asics don't have (all asics).
 771  */
 772 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 773 {
 774         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 775                   reg, v);
 776         BUG();
 777 }
 778
 779 /**
 780  * amdgpu_block_invalid_rreg - dummy reg read function
 781  *
 782  * @adev: amdgpu_device pointer
 783  * @block: offset of instance
 784  * @reg: offset of register
 785  *
 786  * Dummy register read function.  Used for register blocks
 787  * that certain asics don't have (all asics).
 788  * Returns the value in the register.
 789  */
 790 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 791                                           uint32_t block, uint32_t reg)
 792 {
 793         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 794                   reg, block);
 795         BUG();
 796         return 0;
 797 }
 798
 799 /**
 800  * amdgpu_block_invalid_wreg - dummy reg write function
 801  *
 802  * @adev: amdgpu_device pointer
 803  * @block: offset of instance
 804  * @reg: offset of register
 805  * @v: value to write to the register
 806  *
 807  * Dummy register read function.  Used for register blocks
 808  * that certain asics don't have (all asics).
 809  */
 810 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 811                                       uint32_t block,
 812                                       uint32_t reg, uint32_t v)
 813 {
 814         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 815                   reg, block, v);
 816         BUG();
 817 }
 818
 819 /**
 820  * amdgpu_device_asic_init - Wrapper for atom asic_init
 821  *
 822  * @adev: amdgpu_device pointer
 823  *
 824  * Does any asic specific work and then calls atom asic init.
 825  */
 826 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 827 {
 828         amdgpu_asic_pre_asic_init(adev);
 829
 830         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 831 }
 832
 833 /**
 834  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 835  *
 836  * @adev: amdgpu_device pointer
 837  *
 838  * Allocates a scratch page of VRAM for use by various things in the
 839  * driver.
 840  */
 841 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 842 {
 843         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 844                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 845                                        &adev->vram_scratch.robj,
 846                                        &adev->vram_scratch.gpu_addr,
 847                                        (void **)&adev->vram_scratch.ptr);
 848 }
 849
 850 /**
 851  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 852  *
 853  * @adev: amdgpu_device pointer
 854  *
 855  * Frees the VRAM scratch page.
 856  */
 857 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 858 {
 859         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 860 }
 861
 862 /**
 863  * amdgpu_device_program_register_sequence - program an array of registers.
 864  *
 865  * @adev: amdgpu_device pointer
 866  * @registers: pointer to the register array
 867  * @array_size: size of the register array
 868  *
 869  * Programs an array or registers with and and or masks.
 870  * This is a helper for setting golden registers.
 871  */
 872 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 873                                              const u32 *registers,
 874                                              const u32 array_size)
 875 {
 876         u32 tmp, reg, and_mask, or_mask;
 877         int i;
 878
 879         if (array_size % 3)
 880                 return;
 881
 882         for (i = 0; i < array_size; i +=3) {
 883                 reg = registers[i + 0];
 884                 and_mask = registers[i + 1];
 885                 or_mask = registers[i + 2];
 886
 887                 if (and_mask == 0xffffffff) {
 888                         tmp = or_mask;
 889                 } else {
 890                         tmp = RREG32(reg);
 891                         tmp &= ~and_mask;
 892                         if (adev->family >= AMDGPU_FAMILY_AI)
 893                                 tmp |= (or_mask & and_mask);
 894                         else
 895                                 tmp |= or_mask;
 896                 }
 897                 WREG32(reg, tmp);
 898         }
 899 }
 900
 901 /**
 902  * amdgpu_device_pci_config_reset - reset the GPU
 903  *
 904  * @adev: amdgpu_device pointer
 905  *
 906  * Resets the GPU using the pci config reset sequence.
 907  * Only applicable to asics prior to vega10.
 908  */
 909 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 910 {
 911         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 912 }
 913
 914 /*
 915  * GPU doorbell aperture helpers function.
 916  */
 917 /**
 918  * amdgpu_device_doorbell_init - Init doorbell driver information.
 919  *
 920  * @adev: amdgpu_device pointer
 921  *
 922  * Init doorbell driver information (CIK)
 923  * Returns 0 on success, error on failure.
 924  */
 925 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 926 {
 927
 928         /* No doorbell on SI hardware generation */
 929         if (adev->asic_type < CHIP_BONAIRE) {
 930                 adev->doorbell.base = 0;
 931                 adev->doorbell.size = 0;
 932                 adev->doorbell.num_doorbells = 0;
 933                 adev->doorbell.ptr = NULL;
 934                 return 0;
 935         }
 936
 937         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 938                 return -EINVAL;
 939
 940         amdgpu_asic_init_doorbell_index(adev);
 941
 942         /* doorbell bar mapping */
 943         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 944         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 945
 946         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 947                                              adev->doorbell_index.max_assignment+1);
 948         if (adev->doorbell.num_doorbells == 0)
 949                 return -EINVAL;
 950
 951         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 952          * paging queue doorbell use the second page. The
 953          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 954          * doorbells are in the first page. So with paging queue enabled,
 955          * the max num_doorbells should + 1 page (0x400 in dword)
 956          */
 957         if (adev->asic_type >= CHIP_VEGA10)
 958                 adev->doorbell.num_doorbells += 0x400;
 959
 960         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 961                                      adev->doorbell.num_doorbells *
 962                                      sizeof(u32));
 963         if (adev->doorbell.ptr == NULL)
 964                 return -ENOMEM;
 965
 966         return 0;
 967 }
 968
 969 /**
 970  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 971  *
 972  * @adev: amdgpu_device pointer
 973  *
 974  * Tear down doorbell driver information (CIK)
 975  */
 976 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 977 {
 978         iounmap(adev->doorbell.ptr);
 979         adev->doorbell.ptr = NULL;
 980 }
 981
 982
 983
 984 /*
 985  * amdgpu_device_wb_*()
 986  * Writeback is the method by which the GPU updates special pages in memory
 987  * with the status of certain GPU events (fences, ring pointers,etc.).
 988  */
 989
 990 /**
 991  * amdgpu_device_wb_fini - Disable Writeback and free memory
 992  *
 993  * @adev: amdgpu_device pointer
 994  *
 995  * Disables Writeback and frees the Writeback memory (all asics).
 996  * Used at driver shutdown.
 997  */
 998 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 999 {
1000         if (adev->wb.wb_obj) {
1001                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1002                                       &adev->wb.gpu_addr,
1003                                       (void **)&adev->wb.wb);
1004                 adev->wb.wb_obj = NULL;
1005         }
1006 }
1007
1008 /**
1009  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1010  *
1011  * @adev: amdgpu_device pointer
1012  *
1013  * Initializes writeback and allocates writeback memory (all asics).
1014  * Used at driver startup.
1015  * Returns 0 on success or an -error on failure.
1016  */
1017 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1018 {
1019         int r;
1020
1021         if (adev->wb.wb_obj == NULL) {
1022                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1023                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1024                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1025                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1026                                             (void **)&adev->wb.wb);
1027                 if (r) {
1028                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1029                         return r;
1030                 }
1031
1032                 adev->wb.num_wb = AMDGPU_MAX_WB;
1033                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1034
1035                 /* clear wb memory */
1036                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1037         }
1038
1039         return 0;
1040 }
1041
1042 /**
1043  * amdgpu_device_wb_get - Allocate a wb entry
1044  *
1045  * @adev: amdgpu_device pointer
1046  * @wb: wb index
1047  *
1048  * Allocate a wb slot for use by the driver (all asics).
1049  * Returns 0 on success or -EINVAL on failure.
1050  */
1051 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1052 {
1053         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1054
1055         if (offset < adev->wb.num_wb) {
1056                 __set_bit(offset, adev->wb.used);
1057                 *wb = offset << 3; /* convert to dw offset */
1058                 return 0;
1059         } else {
1060                 return -EINVAL;
1061         }
1062 }
1063
1064 /**
1065  * amdgpu_device_wb_free - Free a wb entry
1066  *
1067  * @adev: amdgpu_device pointer
1068  * @wb: wb index
1069  *
1070  * Free a wb slot allocated for use by the driver (all asics)
1071  */
1072 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1073 {
1074         wb >>= 3;
1075         if (wb < adev->wb.num_wb)
1076                 __clear_bit(wb, adev->wb.used);
1077 }
1078
1079 /**
1080  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1081  *
1082  * @adev: amdgpu_device pointer
1083  *
1084  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1085  * to fail, but if any of the BARs is not accessible after the size we abort
1086  * driver loading by returning -ENODEV.
1087  */
1088 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1089 {
1090         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1091         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1092         struct pci_bus *root;
1093         struct resource *res;
1094         unsigned i;
1095         u16 cmd;
1096         int r;
1097
1098         /* Bypass for VF */
1099         if (amdgpu_sriov_vf(adev))
1100                 return 0;
1101
1102         /* skip if the bios has already enabled large BAR */
1103         if (adev->gmc.real_vram_size &&
1104             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1105                 return 0;
1106
1107         /* Check if the root BUS has 64bit memory resources */
1108         root = adev->pdev->bus;
1109         while (root->parent)
1110                 root = root->parent;
1111
1112         pci_bus_for_each_resource(root, res, i) {
1113                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1114                     res->start > 0x100000000ull)
1115                         break;
1116         }
1117
1118         /* Trying to resize is pointless without a root hub window above 4GB */
1119         if (!res)
1120                 return 0;
1121
1122         /* Disable memory decoding while we change the BAR addresses and size */
1123         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1124         pci_write_config_word(adev->pdev, PCI_COMMAND,
1125                               cmd & ~PCI_COMMAND_MEMORY);
1126
1127         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1128         amdgpu_device_doorbell_fini(adev);
1129         if (adev->asic_type >= CHIP_BONAIRE)
1130                 pci_release_resource(adev->pdev, 2);
1131
1132         pci_release_resource(adev->pdev, 0);
1133
1134         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1135         if (r == -ENOSPC)
1136                 DRM_INFO("Not enough PCI address space for a large BAR.");
1137         else if (r && r != -ENOTSUPP)
1138                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1139
1140         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1141
1142         /* When the doorbell or fb BAR isn't available we have no chance of
1143          * using the device.
1144          */
1145         r = amdgpu_device_doorbell_init(adev);
1146         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1147                 return -ENODEV;
1148
1149         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1150
1151         return 0;
1152 }
1153
1154 /*
1155  * GPU helpers function.
1156  */
1157 /**
1158  * amdgpu_device_need_post - check if the hw need post or not
1159  *
1160  * @adev: amdgpu_device pointer
1161  *
1162  * Check if the asic has been initialized (all asics) at driver startup
1163  * or post is needed if  hw reset is performed.
1164  * Returns true if need or false if not.
1165  */
1166 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1167 {
1168         uint32_t reg;
1169
1170         if (amdgpu_sriov_vf(adev))
1171                 return false;
1172
1173         if (amdgpu_passthrough(adev)) {
1174                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1175                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1176                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1177                  * vpost executed for smc version below 22.15
1178                  */
1179                 if (adev->asic_type == CHIP_FIJI) {
1180                         int err;
1181                         uint32_t fw_ver;
1182                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1183                         /* force vPost if error occured */
1184                         if (err)
1185                                 return true;
1186
1187                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1188                         if (fw_ver < 0x00160e00)
1189                                 return true;
1190                 }
1191         }
1192
1193         if (adev->has_hw_reset) {
1194                 adev->has_hw_reset = false;
1195                 return true;
1196         }
1197
1198         /* bios scratch used on CIK+ */
1199         if (adev->asic_type >= CHIP_BONAIRE)
1200                 return amdgpu_atombios_scratch_need_asic_init(adev);
1201
1202         /* check MEM_SIZE for older asics */
1203         reg = amdgpu_asic_get_config_memsize(adev);
1204
1205         if ((reg != 0) && (reg != 0xffffffff))
1206                 return false;
1207
1208         return true;
1209 }
1210
1211 /* if we get transitioned to only one device, take VGA back */
1212 /**
1213  * amdgpu_device_vga_set_decode - enable/disable vga decode
1214  *
1215  * @cookie: amdgpu_device pointer
1216  * @state: enable/disable vga decode
1217  *
1218  * Enable/disable vga decode (all asics).
1219  * Returns VGA resource flags.
1220  */
1221 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1222 {
1223         struct amdgpu_device *adev = cookie;
1224         amdgpu_asic_set_vga_state(adev, state);
1225         if (state)
1226                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1227                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1228         else
1229                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1230 }
1231
1232 /**
1233  * amdgpu_device_check_block_size - validate the vm block size
1234  *
1235  * @adev: amdgpu_device pointer
1236  *
1237  * Validates the vm block size specified via module parameter.
1238  * The vm block size defines number of bits in page table versus page directory,
1239  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1240  * page table and the remaining bits are in the page directory.
1241  */
1242 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1243 {
1244         /* defines number of bits in page table versus page directory,
1245          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1246          * page table and the remaining bits are in the page directory */
1247         if (amdgpu_vm_block_size == -1)
1248                 return;
1249
1250         if (amdgpu_vm_block_size < 9) {
1251                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1252                          amdgpu_vm_block_size);
1253                 amdgpu_vm_block_size = -1;
1254         }
1255 }
1256
1257 /**
1258  * amdgpu_device_check_vm_size - validate the vm size
1259  *
1260  * @adev: amdgpu_device pointer
1261  *
1262  * Validates the vm size in GB specified via module parameter.
1263  * The VM size is the size of the GPU virtual memory space in GB.
1264  */
1265 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1266 {
1267         /* no need to check the default value */
1268         if (amdgpu_vm_size == -1)
1269                 return;
1270
1271         if (amdgpu_vm_size < 1) {
1272                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1273                          amdgpu_vm_size);
1274                 amdgpu_vm_size = -1;
1275         }
1276 }
1277
1278 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1279 {
1280         struct sysinfo si;
1281         bool is_os_64 = (sizeof(void *) == 8);
1282         uint64_t total_memory;
1283         uint64_t dram_size_seven_GB = 0x1B8000000;
1284         uint64_t dram_size_three_GB = 0xB8000000;
1285
1286         if (amdgpu_smu_memory_pool_size == 0)
1287                 return;
1288
1289         if (!is_os_64) {
1290                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1291                 goto def_value;
1292         }
1293         si_meminfo(&si);
1294         total_memory = (uint64_t)si.totalram * si.mem_unit;
1295
1296         if ((amdgpu_smu_memory_pool_size == 1) ||
1297                 (amdgpu_smu_memory_pool_size == 2)) {
1298                 if (total_memory < dram_size_three_GB)
1299                         goto def_value1;
1300         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1301                 (amdgpu_smu_memory_pool_size == 8)) {
1302                 if (total_memory < dram_size_seven_GB)
1303                         goto def_value1;
1304         } else {
1305                 DRM_WARN("Smu memory pool size not supported\n");
1306                 goto def_value;
1307         }
1308         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1309
1310         return;
1311
1312 def_value1:
1313         DRM_WARN("No enough system memory\n");
1314 def_value:
1315         adev->pm.smu_prv_buffer_size = 0;
1316 }
1317
1318 /**
1319  * amdgpu_device_check_arguments - validate module params
1320  *
1321  * @adev: amdgpu_device pointer
1322  *
1323  * Validates certain module parameters and updates
1324  * the associated values used by the driver (all asics).
1325  */
1326 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1327 {
1328         if (amdgpu_sched_jobs < 4) {
1329                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1330                          amdgpu_sched_jobs);
1331                 amdgpu_sched_jobs = 4;
1332         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1333                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1334                          amdgpu_sched_jobs);
1335                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1336         }
1337
1338         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1339                 /* gart size must be greater or equal to 32M */
1340                 dev_warn(adev->dev, "gart size (%d) too small\n",
1341                          amdgpu_gart_size);
1342                 amdgpu_gart_size = -1;
1343         }
1344
1345         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1346                 /* gtt size must be greater or equal to 32M */
1347                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1348                                  amdgpu_gtt_size);
1349                 amdgpu_gtt_size = -1;
1350         }
1351
1352         /* valid range is between 4 and 9 inclusive */
1353         if (amdgpu_vm_fragment_size != -1 &&
1354             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1355                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1356                 amdgpu_vm_fragment_size = -1;
1357         }
1358
1359         if (amdgpu_sched_hw_submission < 2) {
1360                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1361                          amdgpu_sched_hw_submission);
1362                 amdgpu_sched_hw_submission = 2;
1363         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1364                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1365                          amdgpu_sched_hw_submission);
1366                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1367         }
1368
1369         amdgpu_device_check_smu_prv_buffer_size(adev);
1370
1371         amdgpu_device_check_vm_size(adev);
1372
1373         amdgpu_device_check_block_size(adev);
1374
1375         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1376
1377         amdgpu_gmc_tmz_set(adev);
1378
1379         amdgpu_gmc_noretry_set(adev);
1380
1381         return 0;
1382 }
1383
1384 /**
1385  * amdgpu_switcheroo_set_state - set switcheroo state
1386  *
1387  * @pdev: pci dev pointer
1388  * @state: vga_switcheroo state
1389  *
1390  * Callback for the switcheroo driver.  Suspends or resumes the
1391  * the asics before or after it is powered up using ACPI methods.
1392  */
1393 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1394                                         enum vga_switcheroo_state state)
1395 {
1396         struct drm_device *dev = pci_get_drvdata(pdev);
1397         int r;
1398
1399         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1400                 return;
1401
1402         if (state == VGA_SWITCHEROO_ON) {
1403                 pr_info("switched on\n");
1404                 /* don't suspend or resume card normally */
1405                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1406
1407                 pci_set_power_state(dev->pdev, PCI_D0);
1408                 amdgpu_device_load_pci_state(dev->pdev);
1409                 r = pci_enable_device(dev->pdev);
1410                 if (r)
1411                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1412                 amdgpu_device_resume(dev, true);
1413
1414                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1415                 drm_kms_helper_poll_enable(dev);
1416         } else {
1417                 pr_info("switched off\n");
1418                 drm_kms_helper_poll_disable(dev);
1419                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1420                 amdgpu_device_suspend(dev, true);
1421                 amdgpu_device_cache_pci_state(dev->pdev);
1422                 /* Shut down the device */
1423                 pci_disable_device(dev->pdev);
1424                 pci_set_power_state(dev->pdev, PCI_D3cold);
1425                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1426         }
1427 }
1428
1429 /**
1430  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1431  *
1432  * @pdev: pci dev pointer
1433  *
1434  * Callback for the switcheroo driver.  Check of the switcheroo
1435  * state can be changed.
1436  * Returns true if the state can be changed, false if not.
1437  */
1438 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1439 {
1440         struct drm_device *dev = pci_get_drvdata(pdev);
1441
1442         /*
1443         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1444         * locking inversion with the driver load path. And the access here is
1445         * completely racy anyway. So don't bother with locking for now.
1446         */
1447         return atomic_read(&dev->open_count) == 0;
1448 }
1449
1450 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1451         .set_gpu_state = amdgpu_switcheroo_set_state,
1452         .reprobe = NULL,
1453         .can_switch = amdgpu_switcheroo_can_switch,
1454 };
1455
1456 /**
1457  * amdgpu_device_ip_set_clockgating_state - set the CG state
1458  *
1459  * @dev: amdgpu_device pointer
1460  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1461  * @state: clockgating state (gate or ungate)
1462  *
1463  * Sets the requested clockgating state for all instances of
1464  * the hardware IP specified.
1465  * Returns the error code from the last instance.
1466  */
1467 int amdgpu_device_ip_set_clockgating_state(void *dev,
1468                                            enum amd_ip_block_type block_type,
1469                                            enum amd_clockgating_state state)
1470 {
1471         struct amdgpu_device *adev = dev;
1472         int i, r = 0;
1473
1474         for (i = 0; i < adev->num_ip_blocks; i++) {
1475                 if (!adev->ip_blocks[i].status.valid)
1476                         continue;
1477                 if (adev->ip_blocks[i].version->type != block_type)
1478                         continue;
1479                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1480                         continue;
1481                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1482                         (void *)adev, state);
1483                 if (r)
1484                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1485                                   adev->ip_blocks[i].version->funcs->name, r);
1486         }
1487         return r;
1488 }
1489
1490 /**
1491  * amdgpu_device_ip_set_powergating_state - set the PG state
1492  *
1493  * @dev: amdgpu_device pointer
1494  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1495  * @state: powergating state (gate or ungate)
1496  *
1497  * Sets the requested powergating state for all instances of
1498  * the hardware IP specified.
1499  * Returns the error code from the last instance.
1500  */
1501 int amdgpu_device_ip_set_powergating_state(void *dev,
1502                                            enum amd_ip_block_type block_type,
1503                                            enum amd_powergating_state state)
1504 {
1505         struct amdgpu_device *adev = dev;
1506         int i, r = 0;
1507
1508         for (i = 0; i < adev->num_ip_blocks; i++) {
1509                 if (!adev->ip_blocks[i].status.valid)
1510                         continue;
1511                 if (adev->ip_blocks[i].version->type != block_type)
1512                         continue;
1513                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1514                         continue;
1515                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1516                         (void *)adev, state);
1517                 if (r)
1518                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1519                                   adev->ip_blocks[i].version->funcs->name, r);
1520         }
1521         return r;
1522 }
1523
1524 /**
1525  * amdgpu_device_ip_get_clockgating_state - get the CG state
1526  *
1527  * @adev: amdgpu_device pointer
1528  * @flags: clockgating feature flags
1529  *
1530  * Walks the list of IPs on the device and updates the clockgating
1531  * flags for each IP.
1532  * Updates @flags with the feature flags for each hardware IP where
1533  * clockgating is enabled.
1534  */
1535 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1536                                             u32 *flags)
1537 {
1538         int i;
1539
1540         for (i = 0; i < adev->num_ip_blocks; i++) {
1541                 if (!adev->ip_blocks[i].status.valid)
1542                         continue;
1543                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1544                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1545         }
1546 }
1547
1548 /**
1549  * amdgpu_device_ip_wait_for_idle - wait for idle
1550  *
1551  * @adev: amdgpu_device pointer
1552  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1553  *
1554  * Waits for the request hardware IP to be idle.
1555  * Returns 0 for success or a negative error code on failure.
1556  */
1557 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1558                                    enum amd_ip_block_type block_type)
1559 {
1560         int i, r;
1561
1562         for (i = 0; i < adev->num_ip_blocks; i++) {
1563                 if (!adev->ip_blocks[i].status.valid)
1564                         continue;
1565                 if (adev->ip_blocks[i].version->type == block_type) {
1566                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1567                         if (r)
1568                                 return r;
1569                         break;
1570                 }
1571         }
1572         return 0;
1573
1574 }
1575
1576 /**
1577  * amdgpu_device_ip_is_idle - is the hardware IP idle
1578  *
1579  * @adev: amdgpu_device pointer
1580  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1581  *
1582  * Check if the hardware IP is idle or not.
1583  * Returns true if it the IP is idle, false if not.
1584  */
1585 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1586                               enum amd_ip_block_type block_type)
1587 {
1588         int i;
1589
1590         for (i = 0; i < adev->num_ip_blocks; i++) {
1591                 if (!adev->ip_blocks[i].status.valid)
1592                         continue;
1593                 if (adev->ip_blocks[i].version->type == block_type)
1594                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1595         }
1596         return true;
1597
1598 }
1599
1600 /**
1601  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1602  *
1603  * @adev: amdgpu_device pointer
1604  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1605  *
1606  * Returns a pointer to the hardware IP block structure
1607  * if it exists for the asic, otherwise NULL.
1608  */
1609 struct amdgpu_ip_block *
1610 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1611                               enum amd_ip_block_type type)
1612 {
1613         int i;
1614
1615         for (i = 0; i < adev->num_ip_blocks; i++)
1616                 if (adev->ip_blocks[i].version->type == type)
1617                         return &adev->ip_blocks[i];
1618
1619         return NULL;
1620 }
1621
1622 /**
1623  * amdgpu_device_ip_block_version_cmp
1624  *
1625  * @adev: amdgpu_device pointer
1626  * @type: enum amd_ip_block_type
1627  * @major: major version
1628  * @minor: minor version
1629  *
1630  * return 0 if equal or greater
1631  * return 1 if smaller or the ip_block doesn't exist
1632  */
1633 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1634                                        enum amd_ip_block_type type,
1635                                        u32 major, u32 minor)
1636 {
1637         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1638
1639         if (ip_block && ((ip_block->version->major > major) ||
1640                         ((ip_block->version->major == major) &&
1641                         (ip_block->version->minor >= minor))))
1642                 return 0;
1643
1644         return 1;
1645 }
1646
1647 /**
1648  * amdgpu_device_ip_block_add
1649  *
1650  * @adev: amdgpu_device pointer
1651  * @ip_block_version: pointer to the IP to add
1652  *
1653  * Adds the IP block driver information to the collection of IPs
1654  * on the asic.
1655  */
1656 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1657                                const struct amdgpu_ip_block_version *ip_block_version)
1658 {
1659         if (!ip_block_version)
1660                 return -EINVAL;
1661
1662         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1663                   ip_block_version->funcs->name);
1664
1665         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1666
1667         return 0;
1668 }
1669
1670 /**
1671  * amdgpu_device_enable_virtual_display - enable virtual display feature
1672  *
1673  * @adev: amdgpu_device pointer
1674  *
1675  * Enabled the virtual display feature if the user has enabled it via
1676  * the module parameter virtual_display.  This feature provides a virtual
1677  * display hardware on headless boards or in virtualized environments.
1678  * This function parses and validates the configuration string specified by
1679  * the user and configues the virtual display configuration (number of
1680  * virtual connectors, crtcs, etc.) specified.
1681  */
1682 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1683 {
1684         adev->enable_virtual_display = false;
1685
1686         if (amdgpu_virtual_display) {
1687                 struct drm_device *ddev = adev_to_drm(adev);
1688                 const char *pci_address_name = pci_name(ddev->pdev);
1689                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1690
1691                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1692                 pciaddstr_tmp = pciaddstr;
1693                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1694                         pciaddname = strsep(&pciaddname_tmp, ",");
1695                         if (!strcmp("all", pciaddname)
1696                             || !strcmp(pci_address_name, pciaddname)) {
1697                                 long num_crtc;
1698                                 int res = -1;
1699
1700                                 adev->enable_virtual_display = true;
1701
1702                                 if (pciaddname_tmp)
1703                                         res = kstrtol(pciaddname_tmp, 10,
1704                                                       &num_crtc);
1705
1706                                 if (!res) {
1707                                         if (num_crtc < 1)
1708                                                 num_crtc = 1;
1709                                         if (num_crtc > 6)
1710                                                 num_crtc = 6;
1711                                         adev->mode_info.num_crtc = num_crtc;
1712                                 } else {
1713                                         adev->mode_info.num_crtc = 1;
1714                                 }
1715                                 break;
1716                         }
1717                 }
1718
1719                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1720                          amdgpu_virtual_display, pci_address_name,
1721                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1722
1723                 kfree(pciaddstr);
1724         }
1725 }
1726
1727 /**
1728  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1729  *
1730  * @adev: amdgpu_device pointer
1731  *
1732  * Parses the asic configuration parameters specified in the gpu info
1733  * firmware and makes them availale to the driver for use in configuring
1734  * the asic.
1735  * Returns 0 on success, -EINVAL on failure.
1736  */
1737 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1738 {
1739         const char *chip_name;
1740         char fw_name[40];
1741         int err;
1742         const struct gpu_info_firmware_header_v1_0 *hdr;
1743
1744         adev->firmware.gpu_info_fw = NULL;
1745
1746         if (adev->mman.discovery_bin) {
1747                 amdgpu_discovery_get_gfx_info(adev);
1748
1749                 /*
1750                  * FIXME: The bounding box is still needed by Navi12, so
1751                  * temporarily read it from gpu_info firmware. Should be droped
1752                  * when DAL no longer needs it.
1753                  */
1754                 if (adev->asic_type != CHIP_NAVI12)
1755                         return 0;
1756         }
1757
1758         switch (adev->asic_type) {
1759 #ifdef CONFIG_DRM_AMDGPU_SI
1760         case CHIP_VERDE:
1761         case CHIP_TAHITI:
1762         case CHIP_PITCAIRN:
1763         case CHIP_OLAND:
1764         case CHIP_HAINAN:
1765 #endif
1766 #ifdef CONFIG_DRM_AMDGPU_CIK
1767         case CHIP_BONAIRE:
1768         case CHIP_HAWAII:
1769         case CHIP_KAVERI:
1770         case CHIP_KABINI:
1771         case CHIP_MULLINS:
1772 #endif
1773         case CHIP_TOPAZ:
1774         case CHIP_TONGA:
1775         case CHIP_FIJI:
1776         case CHIP_POLARIS10:
1777         case CHIP_POLARIS11:
1778         case CHIP_POLARIS12:
1779         case CHIP_VEGAM:
1780         case CHIP_CARRIZO:
1781         case CHIP_STONEY:
1782         case CHIP_VEGA20:
1783         case CHIP_SIENNA_CICHLID:
1784         case CHIP_NAVY_FLOUNDER:
1785         case CHIP_DIMGREY_CAVEFISH:
1786         default:
1787                 return 0;
1788         case CHIP_VEGA10:
1789                 chip_name = "vega10";
1790                 break;
1791         case CHIP_VEGA12:
1792                 chip_name = "vega12";
1793                 break;
1794         case CHIP_RAVEN:
1795                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1796                         chip_name = "raven2";
1797                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1798                         chip_name = "picasso";
1799                 else
1800                         chip_name = "raven";
1801                 break;
1802         case CHIP_ARCTURUS:
1803                 chip_name = "arcturus";
1804                 break;
1805         case CHIP_RENOIR:
1806                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1807                         chip_name = "renoir";
1808                 else
1809                         chip_name = "green_sardine";
1810                 break;
1811         case CHIP_NAVI10:
1812                 chip_name = "navi10";
1813                 break;
1814         case CHIP_NAVI14:
1815                 chip_name = "navi14";
1816                 break;
1817         case CHIP_NAVI12:
1818                 chip_name = "navi12";
1819                 break;
1820         case CHIP_VANGOGH:
1821                 chip_name = "vangogh";
1822                 break;
1823         }
1824
1825         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1826         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1827         if (err) {
1828                 dev_err(adev->dev,
1829                         "Failed to load gpu_info firmware \"%s\"\n",
1830                         fw_name);
1831                 goto out;
1832         }
1833         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1834         if (err) {
1835                 dev_err(adev->dev,
1836                         "Failed to validate gpu_info firmware \"%s\"\n",
1837                         fw_name);
1838                 goto out;
1839         }
1840
1841         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1842         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1843
1844         switch (hdr->version_major) {
1845         case 1:
1846         {
1847                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1848                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1849                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1850
1851                 /*
1852                  * Should be droped when DAL no longer needs it.
1853                  */
1854                 if (adev->asic_type == CHIP_NAVI12)
1855                         goto parse_soc_bounding_box;
1856
1857                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1858                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1859                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1860                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1861                 adev->gfx.config.max_texture_channel_caches =
1862                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1863                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1864                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1865                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1866                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1867                 adev->gfx.config.double_offchip_lds_buf =
1868                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1869                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1870                 adev->gfx.cu_info.max_waves_per_simd =
1871                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1872                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1873                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1874                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1875                 if (hdr->version_minor >= 1) {
1876                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1877                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1878                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1879                         adev->gfx.config.num_sc_per_sh =
1880                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1881                         adev->gfx.config.num_packer_per_sc =
1882                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1883                 }
1884
1885 parse_soc_bounding_box:
1886                 /*
1887                  * soc bounding box info is not integrated in disocovery table,
1888                  * we always need to parse it from gpu info firmware if needed.
1889                  */
1890                 if (hdr->version_minor == 2) {
1891                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1892                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1893                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1894                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1895                 }
1896                 break;
1897         }
1898         default:
1899                 dev_err(adev->dev,
1900                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1901                 err = -EINVAL;
1902                 goto out;
1903         }
1904 out:
1905         return err;
1906 }
1907
1908 /**
1909  * amdgpu_device_ip_early_init - run early init for hardware IPs
1910  *
1911  * @adev: amdgpu_device pointer
1912  *
1913  * Early initialization pass for hardware IPs.  The hardware IPs that make
1914  * up each asic are discovered each IP's early_init callback is run.  This
1915  * is the first stage in initializing the asic.
1916  * Returns 0 on success, negative error code on failure.
1917  */
1918 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1919 {
1920         int i, r;
1921
1922         amdgpu_device_enable_virtual_display(adev);
1923
1924         if (amdgpu_sriov_vf(adev)) {
1925                 r = amdgpu_virt_request_full_gpu(adev, true);
1926                 if (r)
1927                         return r;
1928         }
1929
1930         switch (adev->asic_type) {
1931 #ifdef CONFIG_DRM_AMDGPU_SI
1932         case CHIP_VERDE:
1933         case CHIP_TAHITI:
1934         case CHIP_PITCAIRN:
1935         case CHIP_OLAND:
1936         case CHIP_HAINAN:
1937                 adev->family = AMDGPU_FAMILY_SI;
1938                 r = si_set_ip_blocks(adev);
1939                 if (r)
1940                         return r;
1941                 break;
1942 #endif
1943 #ifdef CONFIG_DRM_AMDGPU_CIK
1944         case CHIP_BONAIRE:
1945         case CHIP_HAWAII:
1946         case CHIP_KAVERI:
1947         case CHIP_KABINI:
1948         case CHIP_MULLINS:
1949                 if (adev->flags & AMD_IS_APU)
1950                         adev->family = AMDGPU_FAMILY_KV;
1951                 else
1952                         adev->family = AMDGPU_FAMILY_CI;
1953
1954                 r = cik_set_ip_blocks(adev);
1955                 if (r)
1956                         return r;
1957                 break;
1958 #endif
1959         case CHIP_TOPAZ:
1960         case CHIP_TONGA:
1961         case CHIP_FIJI:
1962         case CHIP_POLARIS10:
1963         case CHIP_POLARIS11:
1964         case CHIP_POLARIS12:
1965         case CHIP_VEGAM:
1966         case CHIP_CARRIZO:
1967         case CHIP_STONEY:
1968                 if (adev->flags & AMD_IS_APU)
1969                         adev->family = AMDGPU_FAMILY_CZ;
1970                 else
1971                         adev->family = AMDGPU_FAMILY_VI;
1972
1973                 r = vi_set_ip_blocks(adev);
1974                 if (r)
1975                         return r;
1976                 break;
1977         case CHIP_VEGA10:
1978         case CHIP_VEGA12:
1979         case CHIP_VEGA20:
1980         case CHIP_RAVEN:
1981         case CHIP_ARCTURUS:
1982         case CHIP_RENOIR:
1983                 if (adev->flags & AMD_IS_APU)
1984                         adev->family = AMDGPU_FAMILY_RV;
1985                 else
1986                         adev->family = AMDGPU_FAMILY_AI;
1987
1988                 r = soc15_set_ip_blocks(adev);
1989                 if (r)
1990                         return r;
1991                 break;
1992         case  CHIP_NAVI10:
1993         case  CHIP_NAVI14:
1994         case  CHIP_NAVI12:
1995         case  CHIP_SIENNA_CICHLID:
1996         case  CHIP_NAVY_FLOUNDER:
1997         case  CHIP_DIMGREY_CAVEFISH:
1998         case CHIP_VANGOGH:
1999                 if (adev->asic_type == CHIP_VANGOGH)
2000                         adev->family = AMDGPU_FAMILY_VGH;
2001                 else
2002                         adev->family = AMDGPU_FAMILY_NV;
2003
2004                 r = nv_set_ip_blocks(adev);
2005                 if (r)
2006                         return r;
2007                 break;
2008         default:
2009                 /* FIXME: not supported yet */
2010                 return -EINVAL;
2011         }
2012
2013         amdgpu_amdkfd_device_probe(adev);
2014
2015         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2016         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2017                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2018
2019         for (i = 0; i < adev->num_ip_blocks; i++) {
2020                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2021                         DRM_ERROR("disabled ip block: %d <%s>\n",
2022                                   i, adev->ip_blocks[i].version->funcs->name);
2023                         adev->ip_blocks[i].status.valid = false;
2024                 } else {
2025                         if (adev->ip_blocks[i].version->funcs->early_init) {
2026                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2027                                 if (r == -ENOENT) {
2028                                         adev->ip_blocks[i].status.valid = false;
2029                                 } else if (r) {
2030                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2031                                                   adev->ip_blocks[i].version->funcs->name, r);
2032                                         return r;
2033                                 } else {
2034                                         adev->ip_blocks[i].status.valid = true;
2035                                 }
2036                         } else {
2037                                 adev->ip_blocks[i].status.valid = true;
2038                         }
2039                 }
2040                 /* get the vbios after the asic_funcs are set up */
2041                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2042                         r = amdgpu_device_parse_gpu_info_fw(adev);
2043                         if (r)
2044                                 return r;
2045
2046                         /* Read BIOS */
2047                         if (!amdgpu_get_bios(adev))
2048                                 return -EINVAL;
2049
2050                         r = amdgpu_atombios_init(adev);
2051                         if (r) {
2052                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2053                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2054                                 return r;
2055                         }
2056                 }
2057         }
2058
2059         adev->cg_flags &= amdgpu_cg_mask;
2060         adev->pg_flags &= amdgpu_pg_mask;
2061
2062         return 0;
2063 }
2064
2065 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2066 {
2067         int i, r;
2068
2069         for (i = 0; i < adev->num_ip_blocks; i++) {
2070                 if (!adev->ip_blocks[i].status.sw)
2071                         continue;
2072                 if (adev->ip_blocks[i].status.hw)
2073                         continue;
2074                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2075                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2076                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2077                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2078                         if (r) {
2079                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2080                                           adev->ip_blocks[i].version->funcs->name, r);
2081                                 return r;
2082                         }
2083                         adev->ip_blocks[i].status.hw = true;
2084                 }
2085         }
2086
2087         return 0;
2088 }
2089
2090 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2091 {
2092         int i, r;
2093
2094         for (i = 0; i < adev->num_ip_blocks; i++) {
2095                 if (!adev->ip_blocks[i].status.sw)
2096                         continue;
2097                 if (adev->ip_blocks[i].status.hw)
2098                         continue;
2099                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2100                 if (r) {
2101                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2102                                   adev->ip_blocks[i].version->funcs->name, r);
2103                         return r;
2104                 }
2105                 adev->ip_blocks[i].status.hw = true;
2106         }
2107
2108         return 0;
2109 }
2110
2111 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2112 {
2113         int r = 0;
2114         int i;
2115         uint32_t smu_version;
2116
2117         if (adev->asic_type >= CHIP_VEGA10) {
2118                 for (i = 0; i < adev->num_ip_blocks; i++) {
2119                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2120                                 continue;
2121
2122                         /* no need to do the fw loading again if already done*/
2123                         if (adev->ip_blocks[i].status.hw == true)
2124                                 break;
2125
2126                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2127                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2128                                 if (r) {
2129                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2130                                                           adev->ip_blocks[i].version->funcs->name, r);
2131                                         return r;
2132                                 }
2133                         } else {
2134                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2135                                 if (r) {
2136                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2137                                                           adev->ip_blocks[i].version->funcs->name, r);
2138                                         return r;
2139                                 }
2140                         }
2141
2142                         adev->ip_blocks[i].status.hw = true;
2143                         break;
2144                 }
2145         }
2146
2147         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2148                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2149
2150         return r;
2151 }
2152
2153 /**
2154  * amdgpu_device_ip_init - run init for hardware IPs
2155  *
2156  * @adev: amdgpu_device pointer
2157  *
2158  * Main initialization pass for hardware IPs.  The list of all the hardware
2159  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2160  * are run.  sw_init initializes the software state associated with each IP
2161  * and hw_init initializes the hardware associated with each IP.
2162  * Returns 0 on success, negative error code on failure.
2163  */
2164 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2165 {
2166         int i, r;
2167
2168         r = amdgpu_ras_init(adev);
2169         if (r)
2170                 return r;
2171
2172         for (i = 0; i < adev->num_ip_blocks; i++) {
2173                 if (!adev->ip_blocks[i].status.valid)
2174                         continue;
2175                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2176                 if (r) {
2177                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2178                                   adev->ip_blocks[i].version->funcs->name, r);
2179                         goto init_failed;
2180                 }
2181                 adev->ip_blocks[i].status.sw = true;
2182
2183                 /* need to do gmc hw init early so we can allocate gpu mem */
2184                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2185                         r = amdgpu_device_vram_scratch_init(adev);
2186                         if (r) {
2187                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2188                                 goto init_failed;
2189                         }
2190                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2191                         if (r) {
2192                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2193                                 goto init_failed;
2194                         }
2195                         r = amdgpu_device_wb_init(adev);
2196                         if (r) {
2197                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2198                                 goto init_failed;
2199                         }
2200                         adev->ip_blocks[i].status.hw = true;
2201
2202                         /* right after GMC hw init, we create CSA */
2203                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2204                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2205                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2206                                                                 AMDGPU_CSA_SIZE);
2207                                 if (r) {
2208                                         DRM_ERROR("allocate CSA failed %d\n", r);
2209                                         goto init_failed;
2210                                 }
2211                         }
2212                 }
2213         }
2214
2215         if (amdgpu_sriov_vf(adev))
2216                 amdgpu_virt_init_data_exchange(adev);
2217
2218         r = amdgpu_ib_pool_init(adev);
2219         if (r) {
2220                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2221                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2222                 goto init_failed;
2223         }
2224
2225         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2226         if (r)
2227                 goto init_failed;
2228
2229         r = amdgpu_device_ip_hw_init_phase1(adev);
2230         if (r)
2231                 goto init_failed;
2232
2233         r = amdgpu_device_fw_loading(adev);
2234         if (r)
2235                 goto init_failed;
2236
2237         r = amdgpu_device_ip_hw_init_phase2(adev);
2238         if (r)
2239                 goto init_failed;
2240
2241         /*
2242          * retired pages will be loaded from eeprom and reserved here,
2243          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2244          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2245          * for I2C communication which only true at this point.
2246          *
2247          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2248          * failure from bad gpu situation and stop amdgpu init process
2249          * accordingly. For other failed cases, it will still release all
2250          * the resource and print error message, rather than returning one
2251          * negative value to upper level.
2252          *
2253          * Note: theoretically, this should be called before all vram allocations
2254          * to protect retired page from abusing
2255          */
2256         r = amdgpu_ras_recovery_init(adev);
2257         if (r)
2258                 goto init_failed;
2259
2260         if (adev->gmc.xgmi.num_physical_nodes > 1)
2261                 amdgpu_xgmi_add_device(adev);
2262         amdgpu_amdkfd_device_init(adev);
2263
2264         amdgpu_fru_get_product_info(adev);
2265
2266 init_failed:
2267         if (amdgpu_sriov_vf(adev))
2268                 amdgpu_virt_release_full_gpu(adev, true);
2269
2270         return r;
2271 }
2272
2273 /**
2274  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2275  *
2276  * @adev: amdgpu_device pointer
2277  *
2278  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2279  * this function before a GPU reset.  If the value is retained after a
2280  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2281  */
2282 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2283 {
2284         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2285 }
2286
2287 /**
2288  * amdgpu_device_check_vram_lost - check if vram is valid
2289  *
2290  * @adev: amdgpu_device pointer
2291  *
2292  * Checks the reset magic value written to the gart pointer in VRAM.
2293  * The driver calls this after a GPU reset to see if the contents of
2294  * VRAM is lost or now.
2295  * returns true if vram is lost, false if not.
2296  */
2297 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2298 {
2299         if (memcmp(adev->gart.ptr, adev->reset_magic,
2300                         AMDGPU_RESET_MAGIC_NUM))
2301                 return true;
2302
2303         if (!amdgpu_in_reset(adev))
2304                 return false;
2305
2306         /*
2307          * For all ASICs with baco/mode1 reset, the VRAM is
2308          * always assumed to be lost.
2309          */
2310         switch (amdgpu_asic_reset_method(adev)) {
2311         case AMD_RESET_METHOD_BACO:
2312         case AMD_RESET_METHOD_MODE1:
2313                 return true;
2314         default:
2315                 return false;
2316         }
2317 }
2318
2319 /**
2320  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2321  *
2322  * @adev: amdgpu_device pointer
2323  * @state: clockgating state (gate or ungate)
2324  *
2325  * The list of all the hardware IPs that make up the asic is walked and the
2326  * set_clockgating_state callbacks are run.
2327  * Late initialization pass enabling clockgating for hardware IPs.
2328  * Fini or suspend, pass disabling clockgating for hardware IPs.
2329  * Returns 0 on success, negative error code on failure.
2330  */
2331
2332 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2333                                                 enum amd_clockgating_state state)
2334 {
2335         int i, j, r;
2336
2337         if (amdgpu_emu_mode == 1)
2338                 return 0;
2339
2340         for (j = 0; j < adev->num_ip_blocks; j++) {
2341                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2342                 if (!adev->ip_blocks[i].status.late_initialized)
2343                         continue;
2344                 /* skip CG for VCE/UVD, it's handled specially */
2345                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2346                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2347                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2348                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2349                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2350                         /* enable clockgating to save power */
2351                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2352                                                                                      state);
2353                         if (r) {
2354                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2355                                           adev->ip_blocks[i].version->funcs->name, r);
2356                                 return r;
2357                         }
2358                 }
2359         }
2360
2361         return 0;
2362 }
2363
2364 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2365 {
2366         int i, j, r;
2367
2368         if (amdgpu_emu_mode == 1)
2369                 return 0;
2370
2371         for (j = 0; j < adev->num_ip_blocks; j++) {
2372                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2373                 if (!adev->ip_blocks[i].status.late_initialized)
2374                         continue;
2375                 /* skip CG for VCE/UVD, it's handled specially */
2376                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2377                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2378                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2379                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2380                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2381                         /* enable powergating to save power */
2382                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2383                                                                                         state);
2384                         if (r) {
2385                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2386                                           adev->ip_blocks[i].version->funcs->name, r);
2387                                 return r;
2388                         }
2389                 }
2390         }
2391         return 0;
2392 }
2393
2394 static int amdgpu_device_enable_mgpu_fan_boost(void)
2395 {
2396         struct amdgpu_gpu_instance *gpu_ins;
2397         struct amdgpu_device *adev;
2398         int i, ret = 0;
2399
2400         mutex_lock(&mgpu_info.mutex);
2401
2402         /*
2403          * MGPU fan boost feature should be enabled
2404          * only when there are two or more dGPUs in
2405          * the system
2406          */
2407         if (mgpu_info.num_dgpu < 2)
2408                 goto out;
2409
2410         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2411                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2412                 adev = gpu_ins->adev;
2413                 if (!(adev->flags & AMD_IS_APU) &&
2414                     !gpu_ins->mgpu_fan_enabled) {
2415                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2416                         if (ret)
2417                                 break;
2418
2419                         gpu_ins->mgpu_fan_enabled = 1;
2420                 }
2421         }
2422
2423 out:
2424         mutex_unlock(&mgpu_info.mutex);
2425
2426         return ret;
2427 }
2428
2429 /**
2430  * amdgpu_device_ip_late_init - run late init for hardware IPs
2431  *
2432  * @adev: amdgpu_device pointer
2433  *
2434  * Late initialization pass for hardware IPs.  The list of all the hardware
2435  * IPs that make up the asic is walked and the late_init callbacks are run.
2436  * late_init covers any special initialization that an IP requires
2437  * after all of the have been initialized or something that needs to happen
2438  * late in the init process.
2439  * Returns 0 on success, negative error code on failure.
2440  */
2441 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2442 {
2443         struct amdgpu_gpu_instance *gpu_instance;
2444         int i = 0, r;
2445
2446         for (i = 0; i < adev->num_ip_blocks; i++) {
2447                 if (!adev->ip_blocks[i].status.hw)
2448                         continue;
2449                 if (adev->ip_blocks[i].version->funcs->late_init) {
2450                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2451                         if (r) {
2452                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2453                                           adev->ip_blocks[i].version->funcs->name, r);
2454                                 return r;
2455                         }
2456                 }
2457                 adev->ip_blocks[i].status.late_initialized = true;
2458         }
2459
2460         amdgpu_ras_set_error_query_ready(adev, true);
2461
2462         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2463         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2464
2465         amdgpu_device_fill_reset_magic(adev);
2466
2467         r = amdgpu_device_enable_mgpu_fan_boost();
2468         if (r)
2469                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2470
2471
2472         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2473                 mutex_lock(&mgpu_info.mutex);
2474
2475                 /*
2476                  * Reset device p-state to low as this was booted with high.
2477                  *
2478                  * This should be performed only after all devices from the same
2479                  * hive get initialized.
2480                  *
2481                  * However, it's unknown how many device in the hive in advance.
2482                  * As this is counted one by one during devices initializations.
2483                  *
2484                  * So, we wait for all XGMI interlinked devices initialized.
2485                  * This may bring some delays as those devices may come from
2486                  * different hives. But that should be OK.
2487                  */
2488                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2489                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2490                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2491                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2492                                         continue;
2493
2494                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2495                                                 AMDGPU_XGMI_PSTATE_MIN);
2496                                 if (r) {
2497                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2498                                         break;
2499                                 }
2500                         }
2501                 }
2502
2503                 mutex_unlock(&mgpu_info.mutex);
2504         }
2505
2506         return 0;
2507 }
2508
2509 /**
2510  * amdgpu_device_ip_fini - run fini for hardware IPs
2511  *
2512  * @adev: amdgpu_device pointer
2513  *
2514  * Main teardown pass for hardware IPs.  The list of all the hardware
2515  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2516  * are run.  hw_fini tears down the hardware associated with each IP
2517  * and sw_fini tears down any software state associated with each IP.
2518  * Returns 0 on success, negative error code on failure.
2519  */
2520 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2521 {
2522         int i, r;
2523
2524         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2525                 amdgpu_virt_release_ras_err_handler_data(adev);
2526
2527         amdgpu_ras_pre_fini(adev);
2528
2529         if (adev->gmc.xgmi.num_physical_nodes > 1)
2530                 amdgpu_xgmi_remove_device(adev);
2531
2532         amdgpu_amdkfd_device_fini(adev);
2533
2534         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2535         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2536
2537         /* need to disable SMC first */
2538         for (i = 0; i < adev->num_ip_blocks; i++) {
2539                 if (!adev->ip_blocks[i].status.hw)
2540                         continue;
2541                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2542                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2543                         /* XXX handle errors */
2544                         if (r) {
2545                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2546                                           adev->ip_blocks[i].version->funcs->name, r);
2547                         }
2548                         adev->ip_blocks[i].status.hw = false;
2549                         break;
2550                 }
2551         }
2552
2553         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2554                 if (!adev->ip_blocks[i].status.hw)
2555                         continue;
2556
2557                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2558                 /* XXX handle errors */
2559                 if (r) {
2560                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2561                                   adev->ip_blocks[i].version->funcs->name, r);
2562                 }
2563
2564                 adev->ip_blocks[i].status.hw = false;
2565         }
2566
2567
2568         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2569                 if (!adev->ip_blocks[i].status.sw)
2570                         continue;
2571
2572                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2573                         amdgpu_ucode_free_bo(adev);
2574                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2575                         amdgpu_device_wb_fini(adev);
2576                         amdgpu_device_vram_scratch_fini(adev);
2577                         amdgpu_ib_pool_fini(adev);
2578                 }
2579
2580                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2581                 /* XXX handle errors */
2582                 if (r) {
2583                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2584                                   adev->ip_blocks[i].version->funcs->name, r);
2585                 }
2586                 adev->ip_blocks[i].status.sw = false;
2587                 adev->ip_blocks[i].status.valid = false;
2588         }
2589
2590         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2591                 if (!adev->ip_blocks[i].status.late_initialized)
2592                         continue;
2593                 if (adev->ip_blocks[i].version->funcs->late_fini)
2594                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2595                 adev->ip_blocks[i].status.late_initialized = false;
2596         }
2597
2598         amdgpu_ras_fini(adev);
2599
2600         if (amdgpu_sriov_vf(adev))
2601                 if (amdgpu_virt_release_full_gpu(adev, false))
2602                         DRM_ERROR("failed to release exclusive mode on fini\n");
2603
2604         return 0;
2605 }
2606
2607 /**
2608  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2609  *
2610  * @work: work_struct.
2611  */
2612 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2613 {
2614         struct amdgpu_device *adev =
2615                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2616         int r;
2617
2618         r = amdgpu_ib_ring_tests(adev);
2619         if (r)
2620                 DRM_ERROR("ib ring test failed (%d).\n", r);
2621 }
2622
2623 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2624 {
2625         struct amdgpu_device *adev =
2626                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2627
2628         mutex_lock(&adev->gfx.gfx_off_mutex);
2629         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2630                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2631                         adev->gfx.gfx_off_state = true;
2632         }
2633         mutex_unlock(&adev->gfx.gfx_off_mutex);
2634 }
2635
2636 /**
2637  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2638  *
2639  * @adev: amdgpu_device pointer
2640  *
2641  * Main suspend function for hardware IPs.  The list of all the hardware
2642  * IPs that make up the asic is walked, clockgating is disabled and the
2643  * suspend callbacks are run.  suspend puts the hardware and software state
2644  * in each IP into a state suitable for suspend.
2645  * Returns 0 on success, negative error code on failure.
2646  */
2647 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2648 {
2649         int i, r;
2650
2651         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2652         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2653
2654         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2655                 if (!adev->ip_blocks[i].status.valid)
2656                         continue;
2657
2658                 /* displays are handled separately */
2659                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2660                         continue;
2661
2662                 /* XXX handle errors */
2663                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2664                 /* XXX handle errors */
2665                 if (r) {
2666                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2667                                   adev->ip_blocks[i].version->funcs->name, r);
2668                         return r;
2669                 }
2670
2671                 adev->ip_blocks[i].status.hw = false;
2672         }
2673
2674         return 0;
2675 }
2676
2677 /**
2678  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2679  *
2680  * @adev: amdgpu_device pointer
2681  *
2682  * Main suspend function for hardware IPs.  The list of all the hardware
2683  * IPs that make up the asic is walked, clockgating is disabled and the
2684  * suspend callbacks are run.  suspend puts the hardware and software state
2685  * in each IP into a state suitable for suspend.
2686  * Returns 0 on success, negative error code on failure.
2687  */
2688 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2689 {
2690         int i, r;
2691
2692         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2693                 if (!adev->ip_blocks[i].status.valid)
2694                         continue;
2695                 /* displays are handled in phase1 */
2696                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2697                         continue;
2698                 /* PSP lost connection when err_event_athub occurs */
2699                 if (amdgpu_ras_intr_triggered() &&
2700                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2701                         adev->ip_blocks[i].status.hw = false;
2702                         continue;
2703                 }
2704                 /* XXX handle errors */
2705                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2706                 /* XXX handle errors */
2707                 if (r) {
2708                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2709                                   adev->ip_blocks[i].version->funcs->name, r);
2710                 }
2711                 adev->ip_blocks[i].status.hw = false;
2712                 /* handle putting the SMC in the appropriate state */
2713                 if(!amdgpu_sriov_vf(adev)){
2714                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2715                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2716                                 if (r) {
2717                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2718                                                         adev->mp1_state, r);
2719                                         return r;
2720                                 }
2721                         }
2722                 }
2723                 adev->ip_blocks[i].status.hw = false;
2724         }
2725
2726         return 0;
2727 }
2728
2729 /**
2730  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2731  *
2732  * @adev: amdgpu_device pointer
2733  *
2734  * Main suspend function for hardware IPs.  The list of all the hardware
2735  * IPs that make up the asic is walked, clockgating is disabled and the
2736  * suspend callbacks are run.  suspend puts the hardware and software state
2737  * in each IP into a state suitable for suspend.
2738  * Returns 0 on success, negative error code on failure.
2739  */
2740 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2741 {
2742         int r;
2743
2744         if (amdgpu_sriov_vf(adev))
2745                 amdgpu_virt_request_full_gpu(adev, false);
2746
2747         r = amdgpu_device_ip_suspend_phase1(adev);
2748         if (r)
2749                 return r;
2750         r = amdgpu_device_ip_suspend_phase2(adev);
2751
2752         if (amdgpu_sriov_vf(adev))
2753                 amdgpu_virt_release_full_gpu(adev, false);
2754
2755         return r;
2756 }
2757
2758 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2759 {
2760         int i, r;
2761
2762         static enum amd_ip_block_type ip_order[] = {
2763                 AMD_IP_BLOCK_TYPE_GMC,
2764                 AMD_IP_BLOCK_TYPE_COMMON,
2765                 AMD_IP_BLOCK_TYPE_PSP,
2766                 AMD_IP_BLOCK_TYPE_IH,
2767         };
2768
2769         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2770                 int j;
2771                 struct amdgpu_ip_block *block;
2772
2773                 block = &adev->ip_blocks[i];
2774                 block->status.hw = false;
2775
2776                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2777
2778                         if (block->version->type != ip_order[j] ||
2779                                 !block->status.valid)
2780                                 continue;
2781
2782                         r = block->version->funcs->hw_init(adev);
2783                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2784                         if (r)
2785                                 return r;
2786                         block->status.hw = true;
2787                 }
2788         }
2789
2790         return 0;
2791 }
2792
2793 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2794 {
2795         int i, r;
2796
2797         static enum amd_ip_block_type ip_order[] = {
2798                 AMD_IP_BLOCK_TYPE_SMC,
2799                 AMD_IP_BLOCK_TYPE_DCE,
2800                 AMD_IP_BLOCK_TYPE_GFX,
2801                 AMD_IP_BLOCK_TYPE_SDMA,
2802                 AMD_IP_BLOCK_TYPE_UVD,
2803                 AMD_IP_BLOCK_TYPE_VCE,
2804                 AMD_IP_BLOCK_TYPE_VCN
2805         };
2806
2807         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2808                 int j;
2809                 struct amdgpu_ip_block *block;
2810
2811                 for (j = 0; j < adev->num_ip_blocks; j++) {
2812                         block = &adev->ip_blocks[j];
2813
2814                         if (block->version->type != ip_order[i] ||
2815                                 !block->status.valid ||
2816                                 block->status.hw)
2817                                 continue;
2818
2819                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2820                                 r = block->version->funcs->resume(adev);
2821                         else
2822                                 r = block->version->funcs->hw_init(adev);
2823
2824                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2825                         if (r)
2826                                 return r;
2827                         block->status.hw = true;
2828                 }
2829         }
2830
2831         return 0;
2832 }
2833
2834 /**
2835  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2836  *
2837  * @adev: amdgpu_device pointer
2838  *
2839  * First resume function for hardware IPs.  The list of all the hardware
2840  * IPs that make up the asic is walked and the resume callbacks are run for
2841  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2842  * after a suspend and updates the software state as necessary.  This
2843  * function is also used for restoring the GPU after a GPU reset.
2844  * Returns 0 on success, negative error code on failure.
2845  */
2846 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2847 {
2848         int i, r;
2849
2850         for (i = 0; i < adev->num_ip_blocks; i++) {
2851                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2852                         continue;
2853                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2854                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2855                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2856
2857                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2858                         if (r) {
2859                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2860                                           adev->ip_blocks[i].version->funcs->name, r);
2861                                 return r;
2862                         }
2863                         adev->ip_blocks[i].status.hw = true;
2864                 }
2865         }
2866
2867         return 0;
2868 }
2869
2870 /**
2871  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2872  *
2873  * @adev: amdgpu_device pointer
2874  *
2875  * First resume function for hardware IPs.  The list of all the hardware
2876  * IPs that make up the asic is walked and the resume callbacks are run for
2877  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2878  * functional state after a suspend and updates the software state as
2879  * necessary.  This function is also used for restoring the GPU after a GPU
2880  * reset.
2881  * Returns 0 on success, negative error code on failure.
2882  */
2883 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2884 {
2885         int i, r;
2886
2887         for (i = 0; i < adev->num_ip_blocks; i++) {
2888                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2889                         continue;
2890                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2891                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2892                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2893                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2894                         continue;
2895                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2896                 if (r) {
2897                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2898                                   adev->ip_blocks[i].version->funcs->name, r);
2899                         return r;
2900                 }
2901                 adev->ip_blocks[i].status.hw = true;
2902         }
2903
2904         return 0;
2905 }
2906
2907 /**
2908  * amdgpu_device_ip_resume - run resume for hardware IPs
2909  *
2910  * @adev: amdgpu_device pointer
2911  *
2912  * Main resume function for hardware IPs.  The hardware IPs
2913  * are split into two resume functions because they are
2914  * are also used in in recovering from a GPU reset and some additional
2915  * steps need to be take between them.  In this case (S3/S4) they are
2916  * run sequentially.
2917  * Returns 0 on success, negative error code on failure.
2918  */
2919 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2920 {
2921         int r;
2922
2923         r = amdgpu_device_ip_resume_phase1(adev);
2924         if (r)
2925                 return r;
2926
2927         r = amdgpu_device_fw_loading(adev);
2928         if (r)
2929                 return r;
2930
2931         r = amdgpu_device_ip_resume_phase2(adev);
2932
2933         return r;
2934 }
2935
2936 /**
2937  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2938  *
2939  * @adev: amdgpu_device pointer
2940  *
2941  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2942  */
2943 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2944 {
2945         if (amdgpu_sriov_vf(adev)) {
2946                 if (adev->is_atom_fw) {
2947                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2948                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2949                 } else {
2950                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2951                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2952                 }
2953
2954                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2955                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2956         }
2957 }
2958
2959 /**
2960  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2961  *
2962  * @asic_type: AMD asic type
2963  *
2964  * Check if there is DC (new modesetting infrastructre) support for an asic.
2965  * returns true if DC has support, false if not.
2966  */
2967 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2968 {
2969         switch (asic_type) {
2970 #if defined(CONFIG_DRM_AMD_DC)
2971 #if defined(CONFIG_DRM_AMD_DC_SI)
2972         case CHIP_TAHITI:
2973         case CHIP_PITCAIRN:
2974         case CHIP_VERDE:
2975         case CHIP_OLAND:
2976 #endif
2977         case CHIP_BONAIRE:
2978         case CHIP_KAVERI:
2979         case CHIP_KABINI:
2980         case CHIP_MULLINS:
2981                 /*
2982                  * We have systems in the wild with these ASICs that require
2983                  * LVDS and VGA support which is not supported with DC.
2984                  *
2985                  * Fallback to the non-DC driver here by default so as not to
2986                  * cause regressions.
2987                  */
2988                 return amdgpu_dc > 0;
2989         case CHIP_HAWAII:
2990         case CHIP_CARRIZO:
2991         case CHIP_STONEY:
2992         case CHIP_POLARIS10:
2993         case CHIP_POLARIS11:
2994         case CHIP_POLARIS12:
2995         case CHIP_VEGAM:
2996         case CHIP_TONGA:
2997         case CHIP_FIJI:
2998         case CHIP_VEGA10:
2999         case CHIP_VEGA12:
3000         case CHIP_VEGA20:
3001 #if defined(CONFIG_DRM_AMD_DC_DCN)
3002         case CHIP_RAVEN:
3003         case CHIP_NAVI10:
3004         case CHIP_NAVI14:
3005         case CHIP_NAVI12:
3006         case CHIP_RENOIR:
3007         case CHIP_SIENNA_CICHLID:
3008         case CHIP_NAVY_FLOUNDER:
3009         case CHIP_DIMGREY_CAVEFISH:
3010         case CHIP_VANGOGH:
3011 #endif
3012                 return amdgpu_dc != 0;
3013 #endif
3014         default:
3015                 if (amdgpu_dc > 0)
3016                         DRM_INFO("Display Core has been requested via kernel parameter "
3017                                          "but isn't supported by ASIC, ignoring\n");
3018                 return false;
3019         }
3020 }
3021
3022 /**
3023  * amdgpu_device_has_dc_support - check if dc is supported
3024  *
3025  * @adev: amdgpu_device pointer
3026  *
3027  * Returns true for supported, false for not supported
3028  */
3029 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3030 {
3031         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3032                 return false;
3033
3034         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3035 }
3036
3037
3038 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3039 {
3040         struct amdgpu_device *adev =
3041                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3042         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3043
3044         /* It's a bug to not have a hive within this function */
3045         if (WARN_ON(!hive))
3046                 return;
3047
3048         /*
3049          * Use task barrier to synchronize all xgmi reset works across the
3050          * hive. task_barrier_enter and task_barrier_exit will block
3051          * until all the threads running the xgmi reset works reach
3052          * those points. task_barrier_full will do both blocks.
3053          */
3054         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3055
3056                 task_barrier_enter(&hive->tb);
3057                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3058
3059                 if (adev->asic_reset_res)
3060                         goto fail;
3061
3062                 task_barrier_exit(&hive->tb);
3063                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3064
3065                 if (adev->asic_reset_res)
3066                         goto fail;
3067
3068                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3069                         adev->mmhub.funcs->reset_ras_error_count(adev);
3070         } else {
3071
3072                 task_barrier_full(&hive->tb);
3073                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3074         }
3075
3076 fail:
3077         if (adev->asic_reset_res)
3078                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3079                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3080         amdgpu_put_xgmi_hive(hive);
3081 }
3082
3083 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3084 {
3085         char *input = amdgpu_lockup_timeout;
3086         char *timeout_setting = NULL;
3087         int index = 0;
3088         long timeout;
3089         int ret = 0;
3090
3091         /*
3092          * By default timeout for non compute jobs is 10000.
3093          * And there is no timeout enforced on compute jobs.
3094          * In SR-IOV or passthrough mode, timeout for compute
3095          * jobs are 60000 by default.
3096          */
3097         adev->gfx_timeout = msecs_to_jiffies(10000);
3098         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3099         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3100                 adev->compute_timeout =  msecs_to_jiffies(60000);
3101         else
3102                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3103
3104         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3105                 while ((timeout_setting = strsep(&input, ",")) &&
3106                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3107                         ret = kstrtol(timeout_setting, 0, &timeout);
3108                         if (ret)
3109                                 return ret;
3110
3111                         if (timeout == 0) {
3112                                 index++;
3113                                 continue;
3114                         } else if (timeout < 0) {
3115                                 timeout = MAX_SCHEDULE_TIMEOUT;
3116                         } else {
3117                                 timeout = msecs_to_jiffies(timeout);
3118                         }
3119
3120                         switch (index++) {
3121                         case 0:
3122                                 adev->gfx_timeout = timeout;
3123                                 break;
3124                         case 1:
3125                                 adev->compute_timeout = timeout;
3126                                 break;
3127                         case 2:
3128                                 adev->sdma_timeout = timeout;
3129                                 break;
3130                         case 3:
3131                                 adev->video_timeout = timeout;
3132                                 break;
3133                         default:
3134                                 break;
3135                         }
3136                 }
3137                 /*
3138                  * There is only one value specified and
3139                  * it should apply to all non-compute jobs.
3140                  */
3141                 if (index == 1) {
3142                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3143                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3144                                 adev->compute_timeout = adev->gfx_timeout;
3145                 }
3146         }
3147
3148         return ret;
3149 }
3150
3151 static const struct attribute *amdgpu_dev_attributes[] = {
3152         &dev_attr_product_name.attr,
3153         &dev_attr_product_number.attr,
3154         &dev_attr_serial_number.attr,
3155         &dev_attr_pcie_replay_count.attr,
3156         NULL
3157 };
3158
3159
3160 /**
3161  * amdgpu_device_init - initialize the driver
3162  *
3163  * @adev: amdgpu_device pointer
3164  * @flags: driver flags
3165  *
3166  * Initializes the driver info and hw (all asics).
3167  * Returns 0 for success or an error on failure.
3168  * Called at driver startup.
3169  */
3170 int amdgpu_device_init(struct amdgpu_device *adev,
3171                        uint32_t flags)
3172 {
3173         struct drm_device *ddev = adev_to_drm(adev);
3174         struct pci_dev *pdev = adev->pdev;
3175         int r, i;
3176         bool boco = false;
3177         u32 max_MBps;
3178
3179         adev->shutdown = false;
3180         adev->flags = flags;
3181
3182         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3183                 adev->asic_type = amdgpu_force_asic_type;
3184         else
3185                 adev->asic_type = flags & AMD_ASIC_MASK;
3186
3187         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3188         if (amdgpu_emu_mode == 1)
3189                 adev->usec_timeout *= 10;
3190         adev->gmc.gart_size = 512 * 1024 * 1024;
3191         adev->accel_working = false;
3192         adev->num_rings = 0;
3193         adev->mman.buffer_funcs = NULL;
3194         adev->mman.buffer_funcs_ring = NULL;
3195         adev->vm_manager.vm_pte_funcs = NULL;
3196         adev->vm_manager.vm_pte_num_scheds = 0;
3197         adev->gmc.gmc_funcs = NULL;
3198         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3199         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3200
3201         adev->smc_rreg = &amdgpu_invalid_rreg;
3202         adev->smc_wreg = &amdgpu_invalid_wreg;
3203         adev->pcie_rreg = &amdgpu_invalid_rreg;
3204         adev->pcie_wreg = &amdgpu_invalid_wreg;
3205         adev->pciep_rreg = &amdgpu_invalid_rreg;
3206         adev->pciep_wreg = &amdgpu_invalid_wreg;
3207         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3208         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3209         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3210         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3211         adev->didt_rreg = &amdgpu_invalid_rreg;
3212         adev->didt_wreg = &amdgpu_invalid_wreg;
3213         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3214         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3215         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3216         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3217
3218         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3219                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3220                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3221
3222         /* mutex initialization are all done here so we
3223          * can recall function without having locking issues */
3224         atomic_set(&adev->irq.ih.lock, 0);
3225         mutex_init(&adev->firmware.mutex);
3226         mutex_init(&adev->pm.mutex);
3227         mutex_init(&adev->gfx.gpu_clock_mutex);
3228         mutex_init(&adev->srbm_mutex);
3229         mutex_init(&adev->gfx.pipe_reserve_mutex);
3230         mutex_init(&adev->gfx.gfx_off_mutex);
3231         mutex_init(&adev->grbm_idx_mutex);
3232         mutex_init(&adev->mn_lock);
3233         mutex_init(&adev->virt.vf_errors.lock);
3234         hash_init(adev->mn_hash);
3235         atomic_set(&adev->in_gpu_reset, 0);
3236         init_rwsem(&adev->reset_sem);
3237         mutex_init(&adev->psp.mutex);
3238         mutex_init(&adev->notifier_lock);
3239
3240         r = amdgpu_device_check_arguments(adev);
3241         if (r)
3242                 return r;
3243
3244         spin_lock_init(&adev->mmio_idx_lock);
3245         spin_lock_init(&adev->smc_idx_lock);
3246         spin_lock_init(&adev->pcie_idx_lock);
3247         spin_lock_init(&adev->uvd_ctx_idx_lock);
3248         spin_lock_init(&adev->didt_idx_lock);
3249         spin_lock_init(&adev->gc_cac_idx_lock);
3250         spin_lock_init(&adev->se_cac_idx_lock);
3251         spin_lock_init(&adev->audio_endpt_idx_lock);
3252         spin_lock_init(&adev->mm_stats.lock);
3253
3254         INIT_LIST_HEAD(&adev->shadow_list);
3255         mutex_init(&adev->shadow_list_lock);
3256
3257         INIT_DELAYED_WORK(&adev->delayed_init_work,
3258                           amdgpu_device_delayed_init_work_handler);
3259         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3260                           amdgpu_device_delay_enable_gfx_off);
3261
3262         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3263
3264         adev->gfx.gfx_off_req_count = 1;
3265         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3266
3267         atomic_set(&adev->throttling_logging_enabled, 1);
3268         /*
3269          * If throttling continues, logging will be performed every minute
3270          * to avoid log flooding. "-1" is subtracted since the thermal
3271          * throttling interrupt comes every second. Thus, the total logging
3272          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3273          * for throttling interrupt) = 60 seconds.
3274          */
3275         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3276         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3277
3278         /* Registers mapping */
3279         /* TODO: block userspace mapping of io register */
3280         if (adev->asic_type >= CHIP_BONAIRE) {
3281                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3282                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3283         } else {
3284                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3285                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3286         }
3287
3288         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3289         if (adev->rmmio == NULL) {
3290                 return -ENOMEM;
3291         }
3292         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3293         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3294
3295         /* io port mapping */
3296         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3297                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3298                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3299                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3300                         break;
3301                 }
3302         }
3303         if (adev->rio_mem == NULL)
3304                 DRM_INFO("PCI I/O BAR is not found.\n");
3305
3306         /* enable PCIE atomic ops */
3307         r = pci_enable_atomic_ops_to_root(adev->pdev,
3308                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3309                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3310         if (r) {
3311                 adev->have_atomics_support = false;
3312                 DRM_INFO("PCIE atomic ops is not supported\n");
3313         } else {
3314                 adev->have_atomics_support = true;
3315         }
3316
3317         amdgpu_device_get_pcie_info(adev);
3318
3319         if (amdgpu_mcbp)
3320                 DRM_INFO("MCBP is enabled\n");
3321
3322         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3323                 adev->enable_mes = true;
3324
3325         /* detect hw virtualization here */
3326         amdgpu_detect_virtualization(adev);
3327
3328         r = amdgpu_device_get_job_timeout_settings(adev);
3329         if (r) {
3330                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3331                 goto failed_unmap;
3332         }
3333
3334         /* early init functions */
3335         r = amdgpu_device_ip_early_init(adev);
3336         if (r)
3337                 goto failed_unmap;
3338
3339         /* doorbell bar mapping and doorbell index init*/
3340         amdgpu_device_doorbell_init(adev);
3341
3342         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3343         /* this will fail for cards that aren't VGA class devices, just
3344          * ignore it */
3345         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3346
3347         if (amdgpu_device_supports_boco(ddev))
3348                 boco = true;
3349         if (amdgpu_has_atpx() &&
3350             (amdgpu_is_atpx_hybrid() ||
3351              amdgpu_has_atpx_dgpu_power_cntl()) &&
3352             !pci_is_thunderbolt_attached(adev->pdev))
3353                 vga_switcheroo_register_client(adev->pdev,
3354                                                &amdgpu_switcheroo_ops, boco);
3355         if (boco)
3356                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3357
3358         if (amdgpu_emu_mode == 1) {
3359                 /* post the asic on emulation mode */
3360                 emu_soc_asic_init(adev);
3361                 goto fence_driver_init;
3362         }
3363
3364         /* detect if we are with an SRIOV vbios */
3365         amdgpu_device_detect_sriov_bios(adev);
3366
3367         /* check if we need to reset the asic
3368          *  E.g., driver was not cleanly unloaded previously, etc.
3369          */
3370         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3371                 r = amdgpu_asic_reset(adev);
3372                 if (r) {
3373                         dev_err(adev->dev, "asic reset on init failed\n");
3374                         goto failed;
3375                 }
3376         }
3377
3378         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3379
3380         /* Post card if necessary */
3381         if (amdgpu_device_need_post(adev)) {
3382                 if (!adev->bios) {
3383                         dev_err(adev->dev, "no vBIOS found\n");
3384                         r = -EINVAL;
3385                         goto failed;
3386                 }
3387                 DRM_INFO("GPU posting now...\n");
3388                 r = amdgpu_device_asic_init(adev);
3389                 if (r) {
3390                         dev_err(adev->dev, "gpu post error!\n");
3391                         goto failed;
3392                 }
3393         }
3394
3395         if (adev->is_atom_fw) {
3396                 /* Initialize clocks */
3397                 r = amdgpu_atomfirmware_get_clock_info(adev);
3398                 if (r) {
3399                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3400                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3401                         goto failed;
3402                 }
3403         } else {
3404                 /* Initialize clocks */
3405                 r = amdgpu_atombios_get_clock_info(adev);
3406                 if (r) {
3407                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3408                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3409                         goto failed;
3410                 }
3411                 /* init i2c buses */
3412                 if (!amdgpu_device_has_dc_support(adev))
3413                         amdgpu_atombios_i2c_init(adev);
3414         }
3415
3416 fence_driver_init:
3417         /* Fence driver */
3418         r = amdgpu_fence_driver_init(adev);
3419         if (r) {
3420                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3421                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3422                 goto failed;
3423         }
3424
3425         /* init the mode config */
3426         drm_mode_config_init(adev_to_drm(adev));
3427
3428         r = amdgpu_device_ip_init(adev);
3429         if (r) {
3430                 /* failed in exclusive mode due to timeout */
3431                 if (amdgpu_sriov_vf(adev) &&
3432                     !amdgpu_sriov_runtime(adev) &&
3433                     amdgpu_virt_mmio_blocked(adev) &&
3434                     !amdgpu_virt_wait_reset(adev)) {
3435                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3436                         /* Don't send request since VF is inactive. */
3437                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3438                         adev->virt.ops = NULL;
3439                         r = -EAGAIN;
3440                         goto failed;
3441                 }
3442                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3443                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3444                 goto failed;
3445         }
3446
3447         dev_info(adev->dev,
3448                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3449                         adev->gfx.config.max_shader_engines,
3450                         adev->gfx.config.max_sh_per_se,
3451                         adev->gfx.config.max_cu_per_sh,
3452                         adev->gfx.cu_info.number);
3453
3454         adev->accel_working = true;
3455
3456         amdgpu_vm_check_compute_bug(adev);
3457
3458         /* Initialize the buffer migration limit. */
3459         if (amdgpu_moverate >= 0)
3460                 max_MBps = amdgpu_moverate;
3461         else
3462                 max_MBps = 8; /* Allow 8 MB/s. */
3463         /* Get a log2 for easy divisions. */
3464         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3465
3466         amdgpu_fbdev_init(adev);
3467
3468         r = amdgpu_pm_sysfs_init(adev);
3469         if (r) {
3470                 adev->pm_sysfs_en = false;
3471                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3472         } else
3473                 adev->pm_sysfs_en = true;
3474
3475         r = amdgpu_ucode_sysfs_init(adev);
3476         if (r) {
3477                 adev->ucode_sysfs_en = false;
3478                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3479         } else
3480                 adev->ucode_sysfs_en = true;
3481
3482         if ((amdgpu_testing & 1)) {
3483                 if (adev->accel_working)
3484                         amdgpu_test_moves(adev);
3485                 else
3486                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3487         }
3488         if (amdgpu_benchmarking) {
3489                 if (adev->accel_working)
3490                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3491                 else
3492                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3493         }
3494
3495         /*
3496          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3497          * Otherwise the mgpu fan boost feature will be skipped due to the
3498          * gpu instance is counted less.
3499          */
3500         amdgpu_register_gpu_instance(adev);
3501
3502         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3503          * explicit gating rather than handling it automatically.
3504          */
3505         r = amdgpu_device_ip_late_init(adev);
3506         if (r) {
3507                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3508                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3509                 goto failed;
3510         }
3511
3512         /* must succeed. */
3513         amdgpu_ras_resume(adev);
3514
3515         queue_delayed_work(system_wq, &adev->delayed_init_work,
3516                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3517
3518         if (amdgpu_sriov_vf(adev))
3519                 flush_delayed_work(&adev->delayed_init_work);
3520
3521         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3522         if (r)
3523                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3524
3525         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3526                 r = amdgpu_pmu_init(adev);
3527         if (r)
3528                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3529
3530         /* Have stored pci confspace at hand for restore in sudden PCI error */
3531         if (amdgpu_device_cache_pci_state(adev->pdev))
3532                 pci_restore_state(pdev);
3533
3534         return 0;
3535
3536 failed:
3537         amdgpu_vf_error_trans_all(adev);
3538         if (boco)
3539                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3540
3541 failed_unmap:
3542         iounmap(adev->rmmio);
3543         adev->rmmio = NULL;
3544
3545         return r;
3546 }
3547
3548 /**
3549  * amdgpu_device_fini - tear down the driver
3550  *
3551  * @adev: amdgpu_device pointer
3552  *
3553  * Tear down the driver info (all asics).
3554  * Called at driver shutdown.
3555  */
3556 void amdgpu_device_fini(struct amdgpu_device *adev)
3557 {
3558         dev_info(adev->dev, "amdgpu: finishing device.\n");
3559         flush_delayed_work(&adev->delayed_init_work);
3560         adev->shutdown = true;
3561
3562         kfree(adev->pci_state);
3563
3564         /* make sure IB test finished before entering exclusive mode
3565          * to avoid preemption on IB test
3566          * */
3567         if (amdgpu_sriov_vf(adev)) {
3568                 amdgpu_virt_request_full_gpu(adev, false);
3569                 amdgpu_virt_fini_data_exchange(adev);
3570         }
3571
3572         /* disable all interrupts */
3573         amdgpu_irq_disable_all(adev);
3574         if (adev->mode_info.mode_config_initialized){
3575                 if (!amdgpu_device_has_dc_support(adev))
3576                         drm_helper_force_disable_all(adev_to_drm(adev));
3577                 else
3578                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3579         }
3580         amdgpu_fence_driver_fini(adev);
3581         if (adev->pm_sysfs_en)
3582                 amdgpu_pm_sysfs_fini(adev);
3583         amdgpu_fbdev_fini(adev);
3584         amdgpu_device_ip_fini(adev);
3585         release_firmware(adev->firmware.gpu_info_fw);
3586         adev->firmware.gpu_info_fw = NULL;
3587         adev->accel_working = false;
3588         /* free i2c buses */
3589         if (!amdgpu_device_has_dc_support(adev))
3590                 amdgpu_i2c_fini(adev);
3591
3592         if (amdgpu_emu_mode != 1)
3593                 amdgpu_atombios_fini(adev);
3594
3595         kfree(adev->bios);
3596         adev->bios = NULL;
3597         if (amdgpu_has_atpx() &&
3598             (amdgpu_is_atpx_hybrid() ||
3599              amdgpu_has_atpx_dgpu_power_cntl()) &&
3600             !pci_is_thunderbolt_attached(adev->pdev))
3601                 vga_switcheroo_unregister_client(adev->pdev);
3602         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3603                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3604         vga_client_register(adev->pdev, NULL, NULL, NULL);
3605         if (adev->rio_mem)
3606                 pci_iounmap(adev->pdev, adev->rio_mem);
3607         adev->rio_mem = NULL;
3608         iounmap(adev->rmmio);
3609         adev->rmmio = NULL;
3610         amdgpu_device_doorbell_fini(adev);
3611
3612         if (adev->ucode_sysfs_en)
3613                 amdgpu_ucode_sysfs_fini(adev);
3614
3615         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3616         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3617                 amdgpu_pmu_fini(adev);
3618         if (adev->mman.discovery_bin)
3619                 amdgpu_discovery_fini(adev);
3620 }
3621
3622
3623 /*
3624  * Suspend & resume.
3625  */
3626 /**
3627  * amdgpu_device_suspend - initiate device suspend
3628  *
3629  * @dev: drm dev pointer
3630  * @fbcon : notify the fbdev of suspend
3631  *
3632  * Puts the hw in the suspend state (all asics).
3633  * Returns 0 for success or an error on failure.
3634  * Called at driver suspend.
3635  */
3636 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3637 {
3638         struct amdgpu_device *adev;
3639         struct drm_crtc *crtc;
3640         struct drm_connector *connector;
3641         struct drm_connector_list_iter iter;
3642         int r;
3643
3644         adev = drm_to_adev(dev);
3645
3646         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3647                 return 0;
3648
3649         adev->in_suspend = true;
3650         drm_kms_helper_poll_disable(dev);
3651
3652         if (fbcon)
3653                 amdgpu_fbdev_set_suspend(adev, 1);
3654
3655         cancel_delayed_work_sync(&adev->delayed_init_work);
3656
3657         if (!amdgpu_device_has_dc_support(adev)) {
3658                 /* turn off display hw */
3659                 drm_modeset_lock_all(dev);
3660                 drm_connector_list_iter_begin(dev, &iter);
3661                 drm_for_each_connector_iter(connector, &iter)
3662                         drm_helper_connector_dpms(connector,
3663                                                   DRM_MODE_DPMS_OFF);
3664                 drm_connector_list_iter_end(&iter);
3665                 drm_modeset_unlock_all(dev);
3666                         /* unpin the front buffers and cursors */
3667                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3668                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3669                         struct drm_framebuffer *fb = crtc->primary->fb;
3670                         struct amdgpu_bo *robj;
3671
3672                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3673                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3674                                 r = amdgpu_bo_reserve(aobj, true);
3675                                 if (r == 0) {
3676                                         amdgpu_bo_unpin(aobj);
3677                                         amdgpu_bo_unreserve(aobj);
3678                                 }
3679                         }
3680
3681                         if (fb == NULL || fb->obj[0] == NULL) {
3682                                 continue;
3683                         }
3684                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3685                         /* don't unpin kernel fb objects */
3686                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3687                                 r = amdgpu_bo_reserve(robj, true);
3688                                 if (r == 0) {
3689                                         amdgpu_bo_unpin(robj);
3690                                         amdgpu_bo_unreserve(robj);
3691                                 }
3692                         }
3693                 }
3694         }
3695
3696         amdgpu_ras_suspend(adev);
3697
3698         r = amdgpu_device_ip_suspend_phase1(adev);
3699
3700         amdgpu_amdkfd_suspend(adev, !fbcon);
3701
3702         /* evict vram memory */
3703         amdgpu_bo_evict_vram(adev);
3704
3705         amdgpu_fence_driver_suspend(adev);
3706
3707         r = amdgpu_device_ip_suspend_phase2(adev);
3708
3709         /* evict remaining vram memory
3710          * This second call to evict vram is to evict the gart page table
3711          * using the CPU.
3712          */
3713         amdgpu_bo_evict_vram(adev);
3714
3715         return 0;
3716 }
3717
3718 /**
3719  * amdgpu_device_resume - initiate device resume
3720  *
3721  * @dev: drm dev pointer
3722  * @fbcon : notify the fbdev of resume
3723  *
3724  * Bring the hw back to operating state (all asics).
3725  * Returns 0 for success or an error on failure.
3726  * Called at driver resume.
3727  */
3728 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3729 {
3730         struct drm_connector *connector;
3731         struct drm_connector_list_iter iter;
3732         struct amdgpu_device *adev = drm_to_adev(dev);
3733         struct drm_crtc *crtc;
3734         int r = 0;
3735
3736         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3737                 return 0;
3738
3739         /* post card */
3740         if (amdgpu_device_need_post(adev)) {
3741                 r = amdgpu_device_asic_init(adev);
3742                 if (r)
3743                         dev_err(adev->dev, "amdgpu asic init failed\n");
3744         }
3745
3746         r = amdgpu_device_ip_resume(adev);
3747         if (r) {
3748                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3749                 return r;
3750         }
3751         amdgpu_fence_driver_resume(adev);
3752
3753
3754         r = amdgpu_device_ip_late_init(adev);
3755         if (r)
3756                 return r;
3757
3758         queue_delayed_work(system_wq, &adev->delayed_init_work,
3759                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3760
3761         if (!amdgpu_device_has_dc_support(adev)) {
3762                 /* pin cursors */
3763                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3764                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3765
3766                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3767                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3768                                 r = amdgpu_bo_reserve(aobj, true);
3769                                 if (r == 0) {
3770                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3771                                         if (r != 0)
3772                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3773                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3774                                         amdgpu_bo_unreserve(aobj);
3775                                 }
3776                         }
3777                 }
3778         }
3779         r = amdgpu_amdkfd_resume(adev, !fbcon);
3780         if (r)
3781                 return r;
3782
3783         /* Make sure IB tests flushed */
3784         flush_delayed_work(&adev->delayed_init_work);
3785
3786         /* blat the mode back in */
3787         if (fbcon) {
3788                 if (!amdgpu_device_has_dc_support(adev)) {
3789                         /* pre DCE11 */
3790                         drm_helper_resume_force_mode(dev);
3791
3792                         /* turn on display hw */
3793                         drm_modeset_lock_all(dev);
3794
3795                         drm_connector_list_iter_begin(dev, &iter);
3796                         drm_for_each_connector_iter(connector, &iter)
3797                                 drm_helper_connector_dpms(connector,
3798                                                           DRM_MODE_DPMS_ON);
3799                         drm_connector_list_iter_end(&iter);
3800
3801                         drm_modeset_unlock_all(dev);
3802                 }
3803                 amdgpu_fbdev_set_suspend(adev, 0);
3804         }
3805
3806         drm_kms_helper_poll_enable(dev);
3807
3808         amdgpu_ras_resume(adev);
3809
3810         /*
3811          * Most of the connector probing functions try to acquire runtime pm
3812          * refs to ensure that the GPU is powered on when connector polling is
3813          * performed. Since we're calling this from a runtime PM callback,
3814          * trying to acquire rpm refs will cause us to deadlock.
3815          *
3816          * Since we're guaranteed to be holding the rpm lock, it's safe to
3817          * temporarily disable the rpm helpers so this doesn't deadlock us.
3818          */
3819 #ifdef CONFIG_PM
3820         dev->dev->power.disable_depth++;
3821 #endif
3822         if (!amdgpu_device_has_dc_support(adev))
3823                 drm_helper_hpd_irq_event(dev);
3824         else
3825                 drm_kms_helper_hotplug_event(dev);
3826 #ifdef CONFIG_PM
3827         dev->dev->power.disable_depth--;
3828 #endif
3829         adev->in_suspend = false;
3830
3831         return 0;
3832 }
3833
3834 /**
3835  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3836  *
3837  * @adev: amdgpu_device pointer
3838  *
3839  * The list of all the hardware IPs that make up the asic is walked and
3840  * the check_soft_reset callbacks are run.  check_soft_reset determines
3841  * if the asic is still hung or not.
3842  * Returns true if any of the IPs are still in a hung state, false if not.
3843  */
3844 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3845 {
3846         int i;
3847         bool asic_hang = false;
3848
3849         if (amdgpu_sriov_vf(adev))
3850                 return true;
3851
3852         if (amdgpu_asic_need_full_reset(adev))
3853                 return true;
3854
3855         for (i = 0; i < adev->num_ip_blocks; i++) {
3856                 if (!adev->ip_blocks[i].status.valid)
3857                         continue;
3858                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3859                         adev->ip_blocks[i].status.hang =
3860                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3861                 if (adev->ip_blocks[i].status.hang) {
3862                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3863                         asic_hang = true;
3864                 }
3865         }
3866         return asic_hang;
3867 }
3868
3869 /**
3870  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3871  *
3872  * @adev: amdgpu_device pointer
3873  *
3874  * The list of all the hardware IPs that make up the asic is walked and the
3875  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3876  * handles any IP specific hardware or software state changes that are
3877  * necessary for a soft reset to succeed.
3878  * Returns 0 on success, negative error code on failure.
3879  */
3880 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3881 {
3882         int i, r = 0;
3883
3884         for (i = 0; i < adev->num_ip_blocks; i++) {
3885                 if (!adev->ip_blocks[i].status.valid)
3886                         continue;
3887                 if (adev->ip_blocks[i].status.hang &&
3888                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3889                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3890                         if (r)
3891                                 return r;
3892                 }
3893         }
3894
3895         return 0;
3896 }
3897
3898 /**
3899  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3900  *
3901  * @adev: amdgpu_device pointer
3902  *
3903  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3904  * reset is necessary to recover.
3905  * Returns true if a full asic reset is required, false if not.
3906  */
3907 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3908 {
3909         int i;
3910
3911         if (amdgpu_asic_need_full_reset(adev))
3912                 return true;
3913
3914         for (i = 0; i < adev->num_ip_blocks; i++) {
3915                 if (!adev->ip_blocks[i].status.valid)
3916                         continue;
3917                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3918                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3919                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3920                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3921                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3922                         if (adev->ip_blocks[i].status.hang) {
3923                                 dev_info(adev->dev, "Some block need full reset!\n");
3924                                 return true;
3925                         }
3926                 }
3927         }
3928         return false;
3929 }
3930
3931 /**
3932  * amdgpu_device_ip_soft_reset - do a soft reset
3933  *
3934  * @adev: amdgpu_device pointer
3935  *
3936  * The list of all the hardware IPs that make up the asic is walked and the
3937  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3938  * IP specific hardware or software state changes that are necessary to soft
3939  * reset the IP.
3940  * Returns 0 on success, negative error code on failure.
3941  */
3942 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3943 {
3944         int i, r = 0;
3945
3946         for (i = 0; i < adev->num_ip_blocks; i++) {
3947                 if (!adev->ip_blocks[i].status.valid)
3948                         continue;
3949                 if (adev->ip_blocks[i].status.hang &&
3950                     adev->ip_blocks[i].version->funcs->soft_reset) {
3951                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3952                         if (r)
3953                                 return r;
3954                 }
3955         }
3956
3957         return 0;
3958 }
3959
3960 /**
3961  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3962  *
3963  * @adev: amdgpu_device pointer
3964  *
3965  * The list of all the hardware IPs that make up the asic is walked and the
3966  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3967  * handles any IP specific hardware or software state changes that are
3968  * necessary after the IP has been soft reset.
3969  * Returns 0 on success, negative error code on failure.
3970  */
3971 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3972 {
3973         int i, r = 0;
3974
3975         for (i = 0; i < adev->num_ip_blocks; i++) {
3976                 if (!adev->ip_blocks[i].status.valid)
3977                         continue;
3978                 if (adev->ip_blocks[i].status.hang &&
3979                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3980                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3981                 if (r)
3982                         return r;
3983         }
3984
3985         return 0;
3986 }
3987
3988 /**
3989  * amdgpu_device_recover_vram - Recover some VRAM contents
3990  *
3991  * @adev: amdgpu_device pointer
3992  *
3993  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3994  * restore things like GPUVM page tables after a GPU reset where
3995  * the contents of VRAM might be lost.
3996  *
3997  * Returns:
3998  * 0 on success, negative error code on failure.
3999  */
4000 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4001 {
4002         struct dma_fence *fence = NULL, *next = NULL;
4003         struct amdgpu_bo *shadow;
4004         long r = 1, tmo;
4005
4006         if (amdgpu_sriov_runtime(adev))
4007                 tmo = msecs_to_jiffies(8000);
4008         else
4009                 tmo = msecs_to_jiffies(100);
4010
4011         dev_info(adev->dev, "recover vram bo from shadow start\n");
4012         mutex_lock(&adev->shadow_list_lock);
4013         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4014
4015                 /* No need to recover an evicted BO */
4016                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4017                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4018                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4019                         continue;
4020
4021                 r = amdgpu_bo_restore_shadow(shadow, &next);
4022                 if (r)
4023                         break;
4024
4025                 if (fence) {
4026                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4027                         dma_fence_put(fence);
4028                         fence = next;
4029                         if (tmo == 0) {
4030                                 r = -ETIMEDOUT;
4031                                 break;
4032                         } else if (tmo < 0) {
4033                                 r = tmo;
4034                                 break;
4035                         }
4036                 } else {
4037                         fence = next;
4038                 }
4039         }
4040         mutex_unlock(&adev->shadow_list_lock);
4041
4042         if (fence)
4043                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4044         dma_fence_put(fence);
4045
4046         if (r < 0 || tmo <= 0) {
4047                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4048                 return -EIO;
4049         }
4050
4051         dev_info(adev->dev, "recover vram bo from shadow done\n");
4052         return 0;
4053 }
4054
4055
4056 /**
4057  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4058  *
4059  * @adev: amdgpu_device pointer
4060  * @from_hypervisor: request from hypervisor
4061  *
4062  * do VF FLR and reinitialize Asic
4063  * return 0 means succeeded otherwise failed
4064  */
4065 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4066                                      bool from_hypervisor)
4067 {
4068         int r;
4069
4070         if (from_hypervisor)
4071                 r = amdgpu_virt_request_full_gpu(adev, true);
4072         else
4073                 r = amdgpu_virt_reset_gpu(adev);
4074         if (r)
4075                 return r;
4076
4077         amdgpu_amdkfd_pre_reset(adev);
4078
4079         /* Resume IP prior to SMC */
4080         r = amdgpu_device_ip_reinit_early_sriov(adev);
4081         if (r)
4082                 goto error;
4083
4084         amdgpu_virt_init_data_exchange(adev);
4085         /* we need recover gart prior to run SMC/CP/SDMA resume */
4086         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4087
4088         r = amdgpu_device_fw_loading(adev);
4089         if (r)
4090                 return r;
4091
4092         /* now we are okay to resume SMC/CP/SDMA */
4093         r = amdgpu_device_ip_reinit_late_sriov(adev);
4094         if (r)
4095                 goto error;
4096
4097         amdgpu_irq_gpu_reset_resume_helper(adev);
4098         r = amdgpu_ib_ring_tests(adev);
4099         amdgpu_amdkfd_post_reset(adev);
4100
4101 error:
4102         amdgpu_virt_release_full_gpu(adev, true);
4103         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4104                 amdgpu_inc_vram_lost(adev);
4105                 r = amdgpu_device_recover_vram(adev);
4106         }
4107
4108         return r;
4109 }
4110
4111 /**
4112  * amdgpu_device_has_job_running - check if there is any job in mirror list
4113  *
4114  * @adev: amdgpu_device pointer
4115  *
4116  * check if there is any job in mirror list
4117  */
4118 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4119 {
4120         int i;
4121         struct drm_sched_job *job;
4122
4123         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4124                 struct amdgpu_ring *ring = adev->rings[i];
4125
4126                 if (!ring || !ring->sched.thread)
4127                         continue;
4128
4129                 spin_lock(&ring->sched.job_list_lock);
4130                 job = list_first_entry_or_null(&ring->sched.pending_list,
4131                                                struct drm_sched_job, list);
4132                 spin_unlock(&ring->sched.job_list_lock);
4133                 if (job)
4134                         return true;
4135         }
4136         return false;
4137 }
4138
4139 /**
4140  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4141  *
4142  * @adev: amdgpu_device pointer
4143  *
4144  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4145  * a hung GPU.
4146  */
4147 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4148 {
4149         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4150                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4151                 return false;
4152         }
4153
4154         if (amdgpu_gpu_recovery == 0)
4155                 goto disabled;
4156
4157         if (amdgpu_sriov_vf(adev))
4158                 return true;
4159
4160         if (amdgpu_gpu_recovery == -1) {
4161                 switch (adev->asic_type) {
4162                 case CHIP_BONAIRE:
4163                 case CHIP_HAWAII:
4164                 case CHIP_TOPAZ:
4165                 case CHIP_TONGA:
4166                 case CHIP_FIJI:
4167                 case CHIP_POLARIS10:
4168                 case CHIP_POLARIS11:
4169                 case CHIP_POLARIS12:
4170                 case CHIP_VEGAM:
4171                 case CHIP_VEGA20:
4172                 case CHIP_VEGA10:
4173                 case CHIP_VEGA12:
4174                 case CHIP_RAVEN:
4175                 case CHIP_ARCTURUS:
4176                 case CHIP_RENOIR:
4177                 case CHIP_NAVI10:
4178                 case CHIP_NAVI14:
4179                 case CHIP_NAVI12:
4180                 case CHIP_SIENNA_CICHLID:
4181                         break;
4182                 default:
4183                         goto disabled;
4184                 }
4185         }
4186
4187         return true;
4188
4189 disabled:
4190                 dev_info(adev->dev, "GPU recovery disabled.\n");
4191                 return false;
4192 }
4193
4194
4195 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4196                                         struct amdgpu_job *job,
4197                                         bool *need_full_reset_arg)
4198 {
4199         int i, r = 0;
4200         bool need_full_reset  = *need_full_reset_arg;
4201
4202         amdgpu_debugfs_wait_dump(adev);
4203
4204         if (amdgpu_sriov_vf(adev)) {
4205                 /* stop the data exchange thread */
4206                 amdgpu_virt_fini_data_exchange(adev);
4207         }
4208
4209         /* block all schedulers and reset given job's ring */
4210         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4211                 struct amdgpu_ring *ring = adev->rings[i];
4212
4213                 if (!ring || !ring->sched.thread)
4214                         continue;
4215
4216                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4217                 amdgpu_fence_driver_force_completion(ring);
4218         }
4219
4220         if(job)
4221                 drm_sched_increase_karma(&job->base);
4222
4223         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4224         if (!amdgpu_sriov_vf(adev)) {
4225
4226                 if (!need_full_reset)
4227                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4228
4229                 if (!need_full_reset) {
4230                         amdgpu_device_ip_pre_soft_reset(adev);
4231                         r = amdgpu_device_ip_soft_reset(adev);
4232                         amdgpu_device_ip_post_soft_reset(adev);
4233                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4234                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4235                                 need_full_reset = true;
4236                         }
4237                 }
4238
4239                 if (need_full_reset)
4240                         r = amdgpu_device_ip_suspend(adev);
4241
4242                 *need_full_reset_arg = need_full_reset;
4243         }
4244
4245         return r;
4246 }
4247
4248 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4249                                struct list_head *device_list_handle,
4250                                bool *need_full_reset_arg,
4251                                bool skip_hw_reset)
4252 {
4253         struct amdgpu_device *tmp_adev = NULL;
4254         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4255         int r = 0;
4256
4257         /*
4258          * ASIC reset has to be done on all HGMI hive nodes ASAP
4259          * to allow proper links negotiation in FW (within 1 sec)
4260          */
4261         if (!skip_hw_reset && need_full_reset) {
4262                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4263                         /* For XGMI run all resets in parallel to speed up the process */
4264                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4265                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4266                                         r = -EALREADY;
4267                         } else
4268                                 r = amdgpu_asic_reset(tmp_adev);
4269
4270                         if (r) {
4271                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4272                                          r, adev_to_drm(tmp_adev)->unique);
4273                                 break;
4274                         }
4275                 }
4276
4277                 /* For XGMI wait for all resets to complete before proceed */
4278                 if (!r) {
4279                         list_for_each_entry(tmp_adev, device_list_handle,
4280                                             gmc.xgmi.head) {
4281                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4282                                         flush_work(&tmp_adev->xgmi_reset_work);
4283                                         r = tmp_adev->asic_reset_res;
4284                                         if (r)
4285                                                 break;
4286                                 }
4287                         }
4288                 }
4289         }
4290
4291         if (!r && amdgpu_ras_intr_triggered()) {
4292                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4293                         if (tmp_adev->mmhub.funcs &&
4294                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4295                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4296                 }
4297
4298                 amdgpu_ras_intr_cleared();
4299         }
4300
4301         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4302                 if (need_full_reset) {
4303                         /* post card */
4304                         if (amdgpu_device_asic_init(tmp_adev))
4305                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4306
4307                         if (!r) {
4308                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4309                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4310                                 if (r)
4311                                         goto out;
4312
4313                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4314                                 if (vram_lost) {
4315                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4316                                         amdgpu_inc_vram_lost(tmp_adev);
4317                                 }
4318
4319                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4320                                 if (r)
4321                                         goto out;
4322
4323                                 r = amdgpu_device_fw_loading(tmp_adev);
4324                                 if (r)
4325                                         return r;
4326
4327                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4328                                 if (r)
4329                                         goto out;
4330
4331                                 if (vram_lost)
4332                                         amdgpu_device_fill_reset_magic(tmp_adev);
4333
4334                                 /*
4335                                  * Add this ASIC as tracked as reset was already
4336                                  * complete successfully.
4337                                  */
4338                                 amdgpu_register_gpu_instance(tmp_adev);
4339
4340                                 r = amdgpu_device_ip_late_init(tmp_adev);
4341                                 if (r)
4342                                         goto out;
4343
4344                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4345
4346                                 /*
4347                                  * The GPU enters bad state once faulty pages
4348                                  * by ECC has reached the threshold, and ras
4349                                  * recovery is scheduled next. So add one check
4350                                  * here to break recovery if it indeed exceeds
4351                                  * bad page threshold, and remind user to
4352                                  * retire this GPU or setting one bigger
4353                                  * bad_page_threshold value to fix this once
4354                                  * probing driver again.
4355                                  */
4356                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4357                                         /* must succeed. */
4358                                         amdgpu_ras_resume(tmp_adev);
4359                                 } else {
4360                                         r = -EINVAL;
4361                                         goto out;
4362                                 }
4363
4364                                 /* Update PSP FW topology after reset */
4365                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4366                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4367                         }
4368                 }
4369
4370 out:
4371                 if (!r) {
4372                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4373                         r = amdgpu_ib_ring_tests(tmp_adev);
4374                         if (r) {
4375                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4376                                 r = amdgpu_device_ip_suspend(tmp_adev);
4377                                 need_full_reset = true;
4378                                 r = -EAGAIN;
4379                                 goto end;
4380                         }
4381                 }
4382
4383                 if (!r)
4384                         r = amdgpu_device_recover_vram(tmp_adev);
4385                 else
4386                         tmp_adev->asic_reset_res = r;
4387         }
4388
4389 end:
4390         *need_full_reset_arg = need_full_reset;
4391         return r;
4392 }
4393
4394 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4395                                 struct amdgpu_hive_info *hive)
4396 {
4397         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4398                 return false;
4399
4400         if (hive) {
4401                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4402         } else {
4403                 down_write(&adev->reset_sem);
4404         }
4405
4406         atomic_inc(&adev->gpu_reset_counter);
4407         switch (amdgpu_asic_reset_method(adev)) {
4408         case AMD_RESET_METHOD_MODE1:
4409                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4410                 break;
4411         case AMD_RESET_METHOD_MODE2:
4412                 adev->mp1_state = PP_MP1_STATE_RESET;
4413                 break;
4414         default:
4415                 adev->mp1_state = PP_MP1_STATE_NONE;
4416                 break;
4417         }
4418
4419         return true;
4420 }
4421
4422 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4423 {
4424         amdgpu_vf_error_trans_all(adev);
4425         adev->mp1_state = PP_MP1_STATE_NONE;
4426         atomic_set(&adev->in_gpu_reset, 0);
4427         up_write(&adev->reset_sem);
4428 }
4429
4430 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4431 {
4432         struct pci_dev *p = NULL;
4433
4434         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4435                         adev->pdev->bus->number, 1);
4436         if (p) {
4437                 pm_runtime_enable(&(p->dev));
4438                 pm_runtime_resume(&(p->dev));
4439         }
4440 }
4441
4442 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4443 {
4444         enum amd_reset_method reset_method;
4445         struct pci_dev *p = NULL;
4446         u64 expires;
4447
4448         /*
4449          * For now, only BACO and mode1 reset are confirmed
4450          * to suffer the audio issue without proper suspended.
4451          */
4452         reset_method = amdgpu_asic_reset_method(adev);
4453         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4454              (reset_method != AMD_RESET_METHOD_MODE1))
4455                 return -EINVAL;
4456
4457         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4458                         adev->pdev->bus->number, 1);
4459         if (!p)
4460                 return -ENODEV;
4461
4462         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4463         if (!expires)
4464                 /*
4465                  * If we cannot get the audio device autosuspend delay,
4466                  * a fixed 4S interval will be used. Considering 3S is
4467                  * the audio controller default autosuspend delay setting.
4468                  * 4S used here is guaranteed to cover that.
4469                  */
4470                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4471
4472         while (!pm_runtime_status_suspended(&(p->dev))) {
4473                 if (!pm_runtime_suspend(&(p->dev)))
4474                         break;
4475
4476                 if (expires < ktime_get_mono_fast_ns()) {
4477                         dev_warn(adev->dev, "failed to suspend display audio\n");
4478                         /* TODO: abort the succeeding gpu reset? */
4479                         return -ETIMEDOUT;
4480                 }
4481         }
4482
4483         pm_runtime_disable(&(p->dev));
4484
4485         return 0;
4486 }
4487
4488 /**
4489  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4490  *
4491  * @adev: amdgpu_device pointer
4492  * @job: which job trigger hang
4493  *
4494  * Attempt to reset the GPU if it has hung (all asics).
4495  * Attempt to do soft-reset or full-reset and reinitialize Asic
4496  * Returns 0 for success or an error on failure.
4497  */
4498
4499 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4500                               struct amdgpu_job *job)
4501 {
4502         struct list_head device_list, *device_list_handle =  NULL;
4503         bool need_full_reset = false;
4504         bool job_signaled = false;
4505         struct amdgpu_hive_info *hive = NULL;
4506         struct amdgpu_device *tmp_adev = NULL;
4507         int i, r = 0;
4508         bool need_emergency_restart = false;
4509         bool audio_suspended = false;
4510
4511         /*
4512          * Special case: RAS triggered and full reset isn't supported
4513          */
4514         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4515
4516         /*
4517          * Flush RAM to disk so that after reboot
4518          * the user can read log and see why the system rebooted.
4519          */
4520         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4521                 DRM_WARN("Emergency reboot.");
4522
4523                 ksys_sync_helper();
4524                 emergency_restart();
4525         }
4526
4527         dev_info(adev->dev, "GPU %s begin!\n",
4528                 need_emergency_restart ? "jobs stop":"reset");
4529
4530         /*
4531          * Here we trylock to avoid chain of resets executing from
4532          * either trigger by jobs on different adevs in XGMI hive or jobs on
4533          * different schedulers for same device while this TO handler is running.
4534          * We always reset all schedulers for device and all devices for XGMI
4535          * hive so that should take care of them too.
4536          */
4537         hive = amdgpu_get_xgmi_hive(adev);
4538         if (hive) {
4539                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4540                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4541                                 job ? job->base.id : -1, hive->hive_id);
4542                         amdgpu_put_xgmi_hive(hive);
4543                         return 0;
4544                 }
4545                 mutex_lock(&hive->hive_lock);
4546         }
4547
4548         /*
4549          * Build list of devices to reset.
4550          * In case we are in XGMI hive mode, resort the device list
4551          * to put adev in the 1st position.
4552          */
4553         INIT_LIST_HEAD(&device_list);
4554         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4555                 if (!hive)
4556                         return -ENODEV;
4557                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4558                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4559                 device_list_handle = &hive->device_list;
4560         } else {
4561                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4562                 device_list_handle = &device_list;
4563         }
4564
4565         /* block all schedulers and reset given job's ring */
4566         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4567                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4568                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4569                                   job ? job->base.id : -1);
4570                         r = 0;
4571                         goto skip_recovery;
4572                 }
4573
4574                 /*
4575                  * Try to put the audio codec into suspend state
4576                  * before gpu reset started.
4577                  *
4578                  * Due to the power domain of the graphics device
4579                  * is shared with AZ power domain. Without this,
4580                  * we may change the audio hardware from behind
4581                  * the audio driver's back. That will trigger
4582                  * some audio codec errors.
4583                  */
4584                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4585                         audio_suspended = true;
4586
4587                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4588
4589                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4590
4591                 if (!amdgpu_sriov_vf(tmp_adev))
4592                         amdgpu_amdkfd_pre_reset(tmp_adev);
4593
4594                 /*
4595                  * Mark these ASICs to be reseted as untracked first
4596                  * And add them back after reset completed
4597                  */
4598                 amdgpu_unregister_gpu_instance(tmp_adev);
4599
4600                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4601
4602                 /* disable ras on ALL IPs */
4603                 if (!need_emergency_restart &&
4604                       amdgpu_device_ip_need_full_reset(tmp_adev))
4605                         amdgpu_ras_suspend(tmp_adev);
4606
4607                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4608                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4609
4610                         if (!ring || !ring->sched.thread)
4611                                 continue;
4612
4613                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4614
4615                         if (need_emergency_restart)
4616                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4617                 }
4618         }
4619
4620         if (need_emergency_restart)
4621                 goto skip_sched_resume;
4622
4623         /*
4624          * Must check guilty signal here since after this point all old
4625          * HW fences are force signaled.
4626          *
4627          * job->base holds a reference to parent fence
4628          */
4629         if (job && job->base.s_fence->parent &&
4630             dma_fence_is_signaled(job->base.s_fence->parent)) {
4631                 job_signaled = true;
4632                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4633                 goto skip_hw_reset;
4634         }
4635
4636 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4637         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4638                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4639                                                  (tmp_adev == adev) ? job : NULL,
4640                                                  &need_full_reset);
4641                 /*TODO Should we stop ?*/
4642                 if (r) {
4643                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4644                                   r, adev_to_drm(tmp_adev)->unique);
4645                         tmp_adev->asic_reset_res = r;
4646                 }
4647         }
4648
4649         /* Actual ASIC resets if needed.*/
4650         /* TODO Implement XGMI hive reset logic for SRIOV */
4651         if (amdgpu_sriov_vf(adev)) {
4652                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4653                 if (r)
4654                         adev->asic_reset_res = r;
4655         } else {
4656                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4657                 if (r && r == -EAGAIN)
4658                         goto retry;
4659         }
4660
4661 skip_hw_reset:
4662
4663         /* Post ASIC reset for all devs .*/
4664         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4665
4666                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4667                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4668
4669                         if (!ring || !ring->sched.thread)
4670                                 continue;
4671
4672                         /* No point to resubmit jobs if we didn't HW reset*/
4673                         if (!tmp_adev->asic_reset_res && !job_signaled)
4674                                 drm_sched_resubmit_jobs(&ring->sched);
4675
4676                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4677                 }
4678
4679                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4680                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4681                 }
4682
4683                 tmp_adev->asic_reset_res = 0;
4684
4685                 if (r) {
4686                         /* bad news, how to tell it to userspace ? */
4687                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4688                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4689                 } else {
4690                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4691                 }
4692         }
4693
4694 skip_sched_resume:
4695         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4696                 /*unlock kfd: SRIOV would do it separately */
4697                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4698                         amdgpu_amdkfd_post_reset(tmp_adev);
4699                 if (audio_suspended)
4700                         amdgpu_device_resume_display_audio(tmp_adev);
4701                 amdgpu_device_unlock_adev(tmp_adev);
4702         }
4703
4704 skip_recovery:
4705         if (hive) {
4706                 atomic_set(&hive->in_reset, 0);
4707                 mutex_unlock(&hive->hive_lock);
4708                 amdgpu_put_xgmi_hive(hive);
4709         }
4710
4711         if (r)
4712                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4713         return r;
4714 }
4715
4716 /**
4717  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4718  *
4719  * @adev: amdgpu_device pointer
4720  *
4721  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4722  * and lanes) of the slot the device is in. Handles APUs and
4723  * virtualized environments where PCIE config space may not be available.
4724  */
4725 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4726 {
4727         struct pci_dev *pdev;
4728         enum pci_bus_speed speed_cap, platform_speed_cap;
4729         enum pcie_link_width platform_link_width;
4730
4731         if (amdgpu_pcie_gen_cap)
4732                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4733
4734         if (amdgpu_pcie_lane_cap)
4735                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4736
4737         /* covers APUs as well */
4738         if (pci_is_root_bus(adev->pdev->bus)) {
4739                 if (adev->pm.pcie_gen_mask == 0)
4740                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4741                 if (adev->pm.pcie_mlw_mask == 0)
4742                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4743                 return;
4744         }
4745
4746         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4747                 return;
4748
4749         pcie_bandwidth_available(adev->pdev, NULL,
4750                                  &platform_speed_cap, &platform_link_width);
4751
4752         if (adev->pm.pcie_gen_mask == 0) {
4753                 /* asic caps */
4754                 pdev = adev->pdev;
4755                 speed_cap = pcie_get_speed_cap(pdev);
4756                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4757                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4758                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4759                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4760                 } else {
4761                         if (speed_cap == PCIE_SPEED_16_0GT)
4762                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4763                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4764                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4765                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4766                         else if (speed_cap == PCIE_SPEED_8_0GT)
4767                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4768                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4769                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4770                         else if (speed_cap == PCIE_SPEED_5_0GT)
4771                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4772                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4773                         else
4774                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4775                 }
4776                 /* platform caps */
4777                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4778                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4779                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4780                 } else {
4781                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4782                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4783                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4784                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4785                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4786                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4787                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4788                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4789                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4790                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4791                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4792                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4793                         else
4794                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4795
4796                 }
4797         }
4798         if (adev->pm.pcie_mlw_mask == 0) {
4799                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4800                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4801                 } else {
4802                         switch (platform_link_width) {
4803                         case PCIE_LNK_X32:
4804                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4805                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4806                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4807                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4808                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4809                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4810                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4811                                 break;
4812                         case PCIE_LNK_X16:
4813                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4814                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4815                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4816                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4817                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4818                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4819                                 break;
4820                         case PCIE_LNK_X12:
4821                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4822                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4823                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4824                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4825                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4826                                 break;
4827                         case PCIE_LNK_X8:
4828                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4829                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4830                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4831                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4832                                 break;
4833                         case PCIE_LNK_X4:
4834                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4835                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4836                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4837                                 break;
4838                         case PCIE_LNK_X2:
4839                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4840                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4841                                 break;
4842                         case PCIE_LNK_X1:
4843                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4844                                 break;
4845                         default:
4846                                 break;
4847                         }
4848                 }
4849         }
4850 }
4851
4852 int amdgpu_device_baco_enter(struct drm_device *dev)
4853 {
4854         struct amdgpu_device *adev = drm_to_adev(dev);
4855         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4856
4857         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4858                 return -ENOTSUPP;
4859
4860         if (ras && ras->supported)
4861                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4862
4863         return amdgpu_dpm_baco_enter(adev);
4864 }
4865
4866 int amdgpu_device_baco_exit(struct drm_device *dev)
4867 {
4868         struct amdgpu_device *adev = drm_to_adev(dev);
4869         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4870         int ret = 0;
4871
4872         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4873                 return -ENOTSUPP;
4874
4875         ret = amdgpu_dpm_baco_exit(adev);
4876         if (ret)
4877                 return ret;
4878
4879         if (ras && ras->supported)
4880                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4881
4882         return 0;
4883 }
4884
4885 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4886 {
4887         int i;
4888
4889         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4890                 struct amdgpu_ring *ring = adev->rings[i];
4891
4892                 if (!ring || !ring->sched.thread)
4893                         continue;
4894
4895                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4896         }
4897 }
4898
4899 /**
4900  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4901  * @pdev: PCI device struct
4902  * @state: PCI channel state
4903  *
4904  * Description: Called when a PCI error is detected.
4905  *
4906  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4907  */
4908 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4909 {
4910         struct drm_device *dev = pci_get_drvdata(pdev);
4911         struct amdgpu_device *adev = drm_to_adev(dev);
4912         int i;
4913
4914         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4915
4916         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4917                 DRM_WARN("No support for XGMI hive yet...");
4918                 return PCI_ERS_RESULT_DISCONNECT;
4919         }
4920
4921         switch (state) {
4922         case pci_channel_io_normal:
4923                 return PCI_ERS_RESULT_CAN_RECOVER;
4924         /* Fatal error, prepare for slot reset */
4925         case pci_channel_io_frozen:
4926                 /*
4927                  * Cancel and wait for all TDRs in progress if failing to
4928                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4929                  *
4930                  * Locking adev->reset_sem will prevent any external access
4931                  * to GPU during PCI error recovery
4932                  */
4933                 while (!amdgpu_device_lock_adev(adev, NULL))
4934                         amdgpu_cancel_all_tdr(adev);
4935
4936                 /*
4937                  * Block any work scheduling as we do for regular GPU reset
4938                  * for the duration of the recovery
4939                  */
4940                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4941                         struct amdgpu_ring *ring = adev->rings[i];
4942
4943                         if (!ring || !ring->sched.thread)
4944                                 continue;
4945
4946                         drm_sched_stop(&ring->sched, NULL);
4947                 }
4948                 return PCI_ERS_RESULT_NEED_RESET;
4949         case pci_channel_io_perm_failure:
4950                 /* Permanent error, prepare for device removal */
4951                 return PCI_ERS_RESULT_DISCONNECT;
4952         }
4953
4954         return PCI_ERS_RESULT_NEED_RESET;
4955 }
4956
4957 /**
4958  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4959  * @pdev: pointer to PCI device
4960  */
4961 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4962 {
4963
4964         DRM_INFO("PCI error: mmio enabled callback!!\n");
4965
4966         /* TODO - dump whatever for debugging purposes */
4967
4968         /* This called only if amdgpu_pci_error_detected returns
4969          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4970          * works, no need to reset slot.
4971          */
4972
4973         return PCI_ERS_RESULT_RECOVERED;
4974 }
4975
4976 /**
4977  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4978  * @pdev: PCI device struct
4979  *
4980  * Description: This routine is called by the pci error recovery
4981  * code after the PCI slot has been reset, just before we
4982  * should resume normal operations.
4983  */
4984 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4985 {
4986         struct drm_device *dev = pci_get_drvdata(pdev);
4987         struct amdgpu_device *adev = drm_to_adev(dev);
4988         int r, i;
4989         bool need_full_reset = true;
4990         u32 memsize;
4991         struct list_head device_list;
4992
4993         DRM_INFO("PCI error: slot reset callback!!\n");
4994
4995         INIT_LIST_HEAD(&device_list);
4996         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4997
4998         /* wait for asic to come out of reset */
4999         msleep(500);
5000
5001         /* Restore PCI confspace */
5002         amdgpu_device_load_pci_state(pdev);
5003
5004         /* confirm  ASIC came out of reset */
5005         for (i = 0; i < adev->usec_timeout; i++) {
5006                 memsize = amdgpu_asic_get_config_memsize(adev);
5007
5008                 if (memsize != 0xffffffff)
5009                         break;
5010                 udelay(1);
5011         }
5012         if (memsize == 0xffffffff) {
5013                 r = -ETIME;
5014                 goto out;
5015         }
5016
5017         adev->in_pci_err_recovery = true;
5018         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5019         adev->in_pci_err_recovery = false;
5020         if (r)
5021                 goto out;
5022
5023         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5024
5025 out:
5026         if (!r) {
5027                 if (amdgpu_device_cache_pci_state(adev->pdev))
5028                         pci_restore_state(adev->pdev);
5029
5030                 DRM_INFO("PCIe error recovery succeeded\n");
5031         } else {
5032                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5033                 amdgpu_device_unlock_adev(adev);
5034         }
5035
5036         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5037 }
5038
5039 /**
5040  * amdgpu_pci_resume() - resume normal ops after PCI reset
5041  * @pdev: pointer to PCI device
5042  *
5043  * Called when the error recovery driver tells us that its
5044  * OK to resume normal operation. Use completion to allow
5045  * halted scsi ops to resume.
5046  */
5047 void amdgpu_pci_resume(struct pci_dev *pdev)
5048 {
5049         struct drm_device *dev = pci_get_drvdata(pdev);
5050         struct amdgpu_device *adev = drm_to_adev(dev);
5051         int i;
5052
5053
5054         DRM_INFO("PCI error: resume callback!!\n");
5055
5056         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5057                 struct amdgpu_ring *ring = adev->rings[i];
5058
5059                 if (!ring || !ring->sched.thread)
5060                         continue;
5061
5062
5063                 drm_sched_resubmit_jobs(&ring->sched);
5064                 drm_sched_start(&ring->sched, true);
5065         }
5066
5067         amdgpu_device_unlock_adev(adev);
5068 }
5069
5070 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5071 {
5072         struct drm_device *dev = pci_get_drvdata(pdev);
5073         struct amdgpu_device *adev = drm_to_adev(dev);
5074         int r;
5075
5076         r = pci_save_state(pdev);
5077         if (!r) {
5078                 kfree(adev->pci_state);
5079
5080                 adev->pci_state = pci_store_saved_state(pdev);
5081
5082                 if (!adev->pci_state) {
5083                         DRM_ERROR("Failed to store PCI saved state");
5084                         return false;
5085                 }
5086         } else {
5087                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5088                 return false;
5089         }
5090
5091         return true;
5092 }
5093
5094 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5095 {
5096         struct drm_device *dev = pci_get_drvdata(pdev);
5097         struct amdgpu_device *adev = drm_to_adev(dev);
5098         int r;
5099
5100         if (!adev->pci_state)
5101                 return false;
5102
5103         r = pci_load_saved_state(pdev, adev->pci_state);
5104
5105         if (!r) {
5106                 pci_restore_state(pdev);
5107         } else {
5108                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5109                 return false;
5110         }
5111
5112         return true;
5113 }
5114
5115