drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
/*
 * gpu_info firmware images, one per listed ASIC, announced through
 * MODULE_FIRMWARE().
 * NOTE(review): the code that actually requests these files is not part of
 * this section.
 */
   73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
   74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
   75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
   76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
   77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
   78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
   79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
   80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
   81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
   82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
   83
/*
 * NOTE(review): presumably a resume-related interval in milliseconds; its
 * user is outside this section — confirm before changing the value.
 */
   84 #define AMDGPU_RESUME_MS                2000
  85
  86 const char *amdgpu_asic_name[] = {
  87         "TAHITI",
  88         "PITCAIRN",
  89         "VERDE",
  90         "OLAND",
  91         "HAINAN",
  92         "BONAIRE",
  93         "KAVERI",
  94         "KABINI",
  95         "HAWAII",
  96         "MULLINS",
  97         "TOPAZ",
  98         "TONGA",
  99         "FIJI",
 100         "CARRIZO",
 101         "STONEY",
 102         "POLARIS10",
 103         "POLARIS11",
 104         "POLARIS12",
 105         "VEGAM",
 106         "VEGA10",
 107         "VEGA12",
 108         "VEGA20",
 109         "RAVEN",
 110         "ARCTURUS",
 111         "RENOIR",
 112         "NAVI10",
 113         "NAVI14",
 114         "NAVI12",
 115         "SIENNA_CICHLID",
 116         "NAVY_FLOUNDER",
 117         "LAST",
 118 };
 119
 120 /**
 121  * DOC: pcie_replay_count
 122  *
 123  * The amdgpu driver provides a sysfs API for reporting the total number
 124  * of PCIe replays (NAKs)
 125  * The file pcie_replay_count is used for this and returns the total
 126  * number of replays as a sum of the NAKs generated and NAKs received
 127  */
 128
 129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 130                 struct device_attribute *attr, char *buf)
 131 {
 132         struct drm_device *ddev = dev_get_drvdata(dev);
 133         struct amdgpu_device *adev = drm_to_adev(ddev);
 134         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 135
 136         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 137 }
 138
 139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 140                 amdgpu_device_get_pcie_replay_count, NULL);
 141
 142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 143
 144 /**
 145  * DOC: product_name
 146  *
 147  * The amdgpu driver provides a sysfs API for reporting the product name
 148  * for the device
 149  * The file serial_number is used for this and returns the product name
 150  * as returned from the FRU.
 151  * NOTE: This is only available for certain server cards
 152  */
 153
 154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 155                 struct device_attribute *attr, char *buf)
 156 {
 157         struct drm_device *ddev = dev_get_drvdata(dev);
 158         struct amdgpu_device *adev = drm_to_adev(ddev);
 159
 160         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 161 }
 162
 163 static DEVICE_ATTR(product_name, S_IRUGO,
 164                 amdgpu_device_get_product_name, NULL);
 165
 166 /**
 167  * DOC: product_number
 168  *
 169  * The amdgpu driver provides a sysfs API for reporting the part number
 170  * for the device
 171  * The file serial_number is used for this and returns the part number
 172  * as returned from the FRU.
 173  * NOTE: This is only available for certain server cards
 174  */
 175
 176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 177                 struct device_attribute *attr, char *buf)
 178 {
 179         struct drm_device *ddev = dev_get_drvdata(dev);
 180         struct amdgpu_device *adev = drm_to_adev(ddev);
 181
 182         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 183 }
 184
 185 static DEVICE_ATTR(product_number, S_IRUGO,
 186                 amdgpu_device_get_product_number, NULL);
 187
 188 /**
 189  * DOC: serial_number
 190  *
 191  * The amdgpu driver provides a sysfs API for reporting the serial number
 192  * for the device
 193  * The file serial_number is used for this and returns the serial number
 194  * as returned from the FRU.
 195  * NOTE: This is only available for certain server cards
 196  */
 197
 198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 199                 struct device_attribute *attr, char *buf)
 200 {
 201         struct drm_device *ddev = dev_get_drvdata(dev);
 202         struct amdgpu_device *adev = drm_to_adev(ddev);
 203
 204         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 205 }
 206
 207 static DEVICE_ATTR(serial_number, S_IRUGO,
 208                 amdgpu_device_get_serial_number, NULL);
 209
 210 /**
 211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 212  *
 213  * @dev: drm_device pointer
 214  *
 215  * Returns true if the device is a dGPU with HG/PX power control,
 216  * otherwise return false.
 217  */
 218 bool amdgpu_device_supports_boco(struct drm_device *dev)
 219 {
 220         struct amdgpu_device *adev = drm_to_adev(dev);
 221
 222         if (adev->flags & AMD_IS_PX)
 223                 return true;
 224         return false;
 225 }
 226
 227 /**
 228  * amdgpu_device_supports_baco - Does the device support BACO
 229  *
 230  * @dev: drm_device pointer
 231  *
 232  * Returns true if the device supporte BACO,
 233  * otherwise return false.
 234  */
 235 bool amdgpu_device_supports_baco(struct drm_device *dev)
 236 {
 237         struct amdgpu_device *adev = drm_to_adev(dev);
 238
 239         return amdgpu_asic_supports_baco(adev);
 240 }
 241
 242 /**
 243  * VRAM access helper functions.
 244  *
 245  * amdgpu_device_vram_access - read/write a buffer in vram
 246  *
 247  * @adev: amdgpu_device pointer
 248  * @pos: offset of the buffer in vram
 249  * @buf: virtual address of the buffer in system memory
 250  * @size: read/write size, sizeof(@buf) must > @size
 251  * @write: true - write to vram, otherwise - read from vram
 252  */
 253 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 254                                uint32_t *buf, size_t size, bool write)
 255 {
 256         unsigned long flags;
 257         uint32_t hi = ~0;
 258         uint64_t last;
 259
 260
 261 #ifdef CONFIG_64BIT
 262         last = min(pos + size, adev->gmc.visible_vram_size);
 263         if (last > pos) {
 264                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 265                 size_t count = last - pos;
 266
 267                 if (write) {
 268                         memcpy_toio(addr, buf, count);
 269                         mb();
 270                         amdgpu_asic_flush_hdp(adev, NULL);
 271                 } else {
 272                         amdgpu_asic_invalidate_hdp(adev, NULL);
 273                         mb();
 274                         memcpy_fromio(buf, addr, count);
 275                 }
 276
 277                 if (count == size)
 278                         return;
 279
 280                 pos += count;
 281                 buf += count / 4;
 282                 size -= count;
 283         }
 284 #endif
 285
 286         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 287         for (last = pos + size; pos < last; pos += 4) {
 288                 uint32_t tmp = pos >> 31;
 289
 290                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 291                 if (tmp != hi) {
 292                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 293                         hi = tmp;
 294                 }
 295                 if (write)
 296                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 297                 else
 298                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 299         }
 300         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 301 }
 302
 303 /*
 304  * register access helper functions.
 305  */
 306 /**
 307  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 308  *
 309  * @adev: amdgpu_device pointer
 310  * @reg: dword aligned register offset
 311  * @acc_flags: access flags which require special behavior
 312  *
 313  * Returns the 32 bit value from the offset specified.
 314  */
 315 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 316                             uint32_t reg, uint32_t acc_flags)
 317 {
 318         uint32_t ret;
 319
 320         if (adev->in_pci_err_recovery)
 321                 return 0;
 322
 323         if ((reg * 4) < adev->rmmio_size) {
 324                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 325                     amdgpu_sriov_runtime(adev) &&
 326                     down_read_trylock(&adev->reset_sem)) {
 327                         ret = amdgpu_kiq_rreg(adev, reg);
 328                         up_read(&adev->reset_sem);
 329                 } else {
 330                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 331                 }
 332         } else {
 333                 ret = adev->pcie_rreg(adev, reg * 4);
 334         }
 335
 336         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 337
 338         return ret;
 339 }
 340
 341 /*
 342  * MMIO register read with bytes helper functions
 343  * @offset:bytes offset from MMIO start
 344  *
 345 */
 346
 347 /**
 348  * amdgpu_mm_rreg8 - read a memory mapped IO register
 349  *
 350  * @adev: amdgpu_device pointer
 351  * @offset: byte aligned register offset
 352  *
 353  * Returns the 8 bit value from the offset specified.
 354  */
 355 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 356 {
 357         if (adev->in_pci_err_recovery)
 358                 return 0;
 359
 360         if (offset < adev->rmmio_size)
 361                 return (readb(adev->rmmio + offset));
 362         BUG();
 363 }
 364
 365 /*
 366  * MMIO register write with bytes helper functions
 367  * @offset:bytes offset from MMIO start
 368  * @value: the value want to be written to the register
 369  *
 370 */
 371 /**
 372  * amdgpu_mm_wreg8 - read a memory mapped IO register
 373  *
 374  * @adev: amdgpu_device pointer
 375  * @offset: byte aligned register offset
 376  * @value: 8 bit value to write
 377  *
 378  * Writes the value specified to the offset specified.
 379  */
 380 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 381 {
 382         if (adev->in_pci_err_recovery)
 383                 return;
 384
 385         if (offset < adev->rmmio_size)
 386                 writeb(value, adev->rmmio + offset);
 387         else
 388                 BUG();
 389 }
 390
 391 /**
 392  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 393  *
 394  * @adev: amdgpu_device pointer
 395  * @reg: dword aligned register offset
 396  * @v: 32 bit value to write to the register
 397  * @acc_flags: access flags which require special behavior
 398  *
 399  * Writes the value specified to the offset specified.
 400  */
 401 void amdgpu_device_wreg(struct amdgpu_device *adev,
 402                         uint32_t reg, uint32_t v,
 403                         uint32_t acc_flags)
 404 {
 405         if (adev->in_pci_err_recovery)
 406                 return;
 407
 408         if ((reg * 4) < adev->rmmio_size) {
 409                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 410                     amdgpu_sriov_runtime(adev) &&
 411                     down_read_trylock(&adev->reset_sem)) {
 412                         amdgpu_kiq_wreg(adev, reg, v);
 413                         up_read(&adev->reset_sem);
 414                 } else {
 415                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 416                 }
 417         } else {
 418                 adev->pcie_wreg(adev, reg * 4, v);
 419         }
 420
 421         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 422 }
 423
 424 /*
 425  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 426  *
 427  * this function is invoked only the debugfs register access
 428  * */
 429 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 430                              uint32_t reg, uint32_t v)
 431 {
 432         if (adev->in_pci_err_recovery)
 433                 return;
 434
 435         if (amdgpu_sriov_fullaccess(adev) &&
 436             adev->gfx.rlc.funcs &&
 437             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 438                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 439                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 440         } else {
 441                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 442         }
 443 }
 444
 445 /**
 446  * amdgpu_io_rreg - read an IO register
 447  *
 448  * @adev: amdgpu_device pointer
 449  * @reg: dword aligned register offset
 450  *
 451  * Returns the 32 bit value from the offset specified.
 452  */
 453 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 454 {
 455         if (adev->in_pci_err_recovery)
 456                 return 0;
 457
 458         if ((reg * 4) < adev->rio_mem_size)
 459                 return ioread32(adev->rio_mem + (reg * 4));
 460         else {
 461                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 462                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 463         }
 464 }
 465
 466 /**
 467  * amdgpu_io_wreg - write to an IO register
 468  *
 469  * @adev: amdgpu_device pointer
 470  * @reg: dword aligned register offset
 471  * @v: 32 bit value to write to the register
 472  *
 473  * Writes the value specified to the offset specified.
 474  */
 475 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 476 {
 477         if (adev->in_pci_err_recovery)
 478                 return;
 479
 480         if ((reg * 4) < adev->rio_mem_size)
 481                 iowrite32(v, adev->rio_mem + (reg * 4));
 482         else {
 483                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 484                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 485         }
 486 }
 487
 488 /**
 489  * amdgpu_mm_rdoorbell - read a doorbell dword
 490  *
 491  * @adev: amdgpu_device pointer
 492  * @index: doorbell index
 493  *
 494  * Returns the value in the doorbell aperture at the
 495  * requested doorbell index (CIK).
 496  */
 497 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 498 {
 499         if (adev->in_pci_err_recovery)
 500                 return 0;
 501
 502         if (index < adev->doorbell.num_doorbells) {
 503                 return readl(adev->doorbell.ptr + index);
 504         } else {
 505                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 506                 return 0;
 507         }
 508 }
 509
 510 /**
 511  * amdgpu_mm_wdoorbell - write a doorbell dword
 512  *
 513  * @adev: amdgpu_device pointer
 514  * @index: doorbell index
 515  * @v: value to write
 516  *
 517  * Writes @v to the doorbell aperture at the
 518  * requested doorbell index (CIK).
 519  */
 520 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 521 {
 522         if (adev->in_pci_err_recovery)
 523                 return;
 524
 525         if (index < adev->doorbell.num_doorbells) {
 526                 writel(v, adev->doorbell.ptr + index);
 527         } else {
 528                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 529         }
 530 }
 531
 532 /**
 533  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 534  *
 535  * @adev: amdgpu_device pointer
 536  * @index: doorbell index
 537  *
 538  * Returns the value in the doorbell aperture at the
 539  * requested doorbell index (VEGA10+).
 540  */
 541 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 542 {
 543         if (adev->in_pci_err_recovery)
 544                 return 0;
 545
 546         if (index < adev->doorbell.num_doorbells) {
 547                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 548         } else {
 549                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 550                 return 0;
 551         }
 552 }
 553
 554 /**
 555  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 556  *
 557  * @adev: amdgpu_device pointer
 558  * @index: doorbell index
 559  * @v: value to write
 560  *
 561  * Writes @v to the doorbell aperture at the
 562  * requested doorbell index (VEGA10+).
 563  */
 564 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 565 {
 566         if (adev->in_pci_err_recovery)
 567                 return;
 568
 569         if (index < adev->doorbell.num_doorbells) {
 570                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 571         } else {
 572                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 573         }
 574 }
 575
 576 /**
 577  * amdgpu_device_indirect_rreg - read an indirect register
 578  *
 579  * @adev: amdgpu_device pointer
 580  * @pcie_index: mmio register offset
 581  * @pcie_data: mmio register offset
 582  *
 583  * Returns the value of indirect register @reg_addr
 584  */
 585 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 586                                 u32 pcie_index, u32 pcie_data,
 587                                 u32 reg_addr)
 588 {
 589         unsigned long flags;
 590         u32 r;
 591         void __iomem *pcie_index_offset;
 592         void __iomem *pcie_data_offset;
 593
 594         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 595         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 596         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 597
 598         writel(reg_addr, pcie_index_offset);
 599         readl(pcie_index_offset);
 600         r = readl(pcie_data_offset);
 601         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 602
 603         return r;
 604 }
 605
 606 /**
 607  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 608  *
 609  * @adev: amdgpu_device pointer
 610  * @pcie_index: mmio register offset
 611  * @pcie_data: mmio register offset
 612  *
 613  * Returns the value of indirect register @reg_addr
 614  */
 615 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 616                                   u32 pcie_index, u32 pcie_data,
 617                                   u32 reg_addr)
 618 {
 619         unsigned long flags;
 620         u64 r;
 621         void __iomem *pcie_index_offset;
 622         void __iomem *pcie_data_offset;
 623
 624         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 625         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 626         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 627
 628         /* read low 32 bits */
 629         writel(reg_addr, pcie_index_offset);
 630         readl(pcie_index_offset);
 631         r = readl(pcie_data_offset);
 632         /* read high 32 bits */
 633         writel(reg_addr + 4, pcie_index_offset);
 634         readl(pcie_index_offset);
 635         r |= ((u64)readl(pcie_data_offset) << 32);
 636         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 637
 638         return r;
 639 }
 640
 641 /**
 642  * amdgpu_device_indirect_wreg - write an indirect register address
 643  *
 644  * @adev: amdgpu_device pointer
 645  * @pcie_index: mmio register offset
 646  * @pcie_data: mmio register offset
 647  * @reg_addr: indirect register offset
 648  * @reg_data: indirect register data
 649  *
 650  */
 651 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 652                                  u32 pcie_index, u32 pcie_data,
 653                                  u32 reg_addr, u32 reg_data)
 654 {
 655         unsigned long flags;
 656         void __iomem *pcie_index_offset;
 657         void __iomem *pcie_data_offset;
 658
 659         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 660         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 661         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 662
 663         writel(reg_addr, pcie_index_offset);
 664         readl(pcie_index_offset);
 665         writel(reg_data, pcie_data_offset);
 666         readl(pcie_data_offset);
 667         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 668 }
 669
 670 /**
 671  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 672  *
 673  * @adev: amdgpu_device pointer
 674  * @pcie_index: mmio register offset
 675  * @pcie_data: mmio register offset
 676  * @reg_addr: indirect register offset
 677  * @reg_data: indirect register data
 678  *
 679  */
 680 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 681                                    u32 pcie_index, u32 pcie_data,
 682                                    u32 reg_addr, u64 reg_data)
 683 {
 684         unsigned long flags;
 685         void __iomem *pcie_index_offset;
 686         void __iomem *pcie_data_offset;
 687
 688         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 689         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 690         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 691
 692         /* write low 32 bits */
 693         writel(reg_addr, pcie_index_offset);
 694         readl(pcie_index_offset);
 695         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 696         readl(pcie_data_offset);
 697         /* write high 32 bits */
 698         writel(reg_addr + 4, pcie_index_offset);
 699         readl(pcie_index_offset);
 700         writel((u32)(reg_data >> 32), pcie_data_offset);
 701         readl(pcie_data_offset);
 702         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 703 }
 704
 705 /**
 706  * amdgpu_invalid_rreg - dummy reg read function
 707  *
 708  * @adev: amdgpu device pointer
 709  * @reg: offset of register
 710  *
 711  * Dummy register read function.  Used for register blocks
 712  * that certain asics don't have (all asics).
 713  * Returns the value in the register.
 714  */
 715 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 716 {
 717         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 718         BUG();
 719         return 0;
 720 }
 721
 722 /**
 723  * amdgpu_invalid_wreg - dummy reg write function
 724  *
 725  * @adev: amdgpu device pointer
 726  * @reg: offset of register
 727  * @v: value to write to the register
 728  *
 729  * Dummy register read function.  Used for register blocks
 730  * that certain asics don't have (all asics).
 731  */
 732 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 733 {
 734         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 735                   reg, v);
 736         BUG();
 737 }
 738
 739 /**
 740  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 741  *
 742  * @adev: amdgpu device pointer
 743  * @reg: offset of register
 744  *
 745  * Dummy register read function.  Used for register blocks
 746  * that certain asics don't have (all asics).
 747  * Returns the value in the register.
 748  */
 749 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 750 {
 751         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 752         BUG();
 753         return 0;
 754 }
 755
 756 /**
 757  * amdgpu_invalid_wreg64 - dummy reg write function
 758  *
 759  * @adev: amdgpu device pointer
 760  * @reg: offset of register
 761  * @v: value to write to the register
 762  *
 763  * Dummy register read function.  Used for register blocks
 764  * that certain asics don't have (all asics).
 765  */
 766 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 767 {
 768         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 769                   reg, v);
 770         BUG();
 771 }
 772
 773 /**
 774  * amdgpu_block_invalid_rreg - dummy reg read function
 775  *
 776  * @adev: amdgpu device pointer
 777  * @block: offset of instance
 778  * @reg: offset of register
 779  *
 780  * Dummy register read function.  Used for register blocks
 781  * that certain asics don't have (all asics).
 782  * Returns the value in the register.
 783  */
 784 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 785                                           uint32_t block, uint32_t reg)
 786 {
 787         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 788                   reg, block);
 789         BUG();
 790         return 0;
 791 }
 792
 793 /**
 794  * amdgpu_block_invalid_wreg - dummy reg write function
 795  *
 796  * @adev: amdgpu device pointer
 797  * @block: offset of instance
 798  * @reg: offset of register
 799  * @v: value to write to the register
 800  *
 801  * Dummy register read function.  Used for register blocks
 802  * that certain asics don't have (all asics).
 803  */
 804 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 805                                       uint32_t block,
 806                                       uint32_t reg, uint32_t v)
 807 {
 808         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 809                   reg, block, v);
 810         BUG();
 811 }
 812
 813 /**
 814  * amdgpu_device_asic_init - Wrapper for atom asic_init
 815  *
 816  * @dev: drm_device pointer
 817  *
 818  * Does any asic specific work and then calls atom asic init.
 819  */
 820 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 821 {
 822         amdgpu_asic_pre_asic_init(adev);
 823
 824         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 825 }
 826
 827 /**
 828  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 829  *
 830  * @adev: amdgpu device pointer
 831  *
 832  * Allocates a scratch page of VRAM for use by various things in the
 833  * driver.
 834  */
 835 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 836 {
 837         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 838                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 839                                        &adev->vram_scratch.robj,
 840                                        &adev->vram_scratch.gpu_addr,
 841                                        (void **)&adev->vram_scratch.ptr);
 842 }
 843
 844 /**
 845  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 846  *
 847  * @adev: amdgpu device pointer
 848  *
 849  * Frees the VRAM scratch page.
 850  */
 851 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 852 {
 853         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 854 }
 855
 856 /**
 857  * amdgpu_device_program_register_sequence - program an array of registers.
 858  *
 859  * @adev: amdgpu_device pointer
 860  * @registers: pointer to the register array
 861  * @array_size: size of the register array
 862  *
 863  * Programs an array or registers with and and or masks.
 864  * This is a helper for setting golden registers.
 865  */
 866 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 867                                              const u32 *registers,
 868                                              const u32 array_size)
 869 {
 870         u32 tmp, reg, and_mask, or_mask;
 871         int i;
 872
 873         if (array_size % 3)
 874                 return;
 875
 876         for (i = 0; i < array_size; i +=3) {
 877                 reg = registers[i + 0];
 878                 and_mask = registers[i + 1];
 879                 or_mask = registers[i + 2];
 880
 881                 if (and_mask == 0xffffffff) {
 882                         tmp = or_mask;
 883                 } else {
 884                         tmp = RREG32(reg);
 885                         tmp &= ~and_mask;
 886                         if (adev->family >= AMDGPU_FAMILY_AI)
 887                                 tmp |= (or_mask & and_mask);
 888                         else
 889                                 tmp |= or_mask;
 890                 }
 891                 WREG32(reg, tmp);
 892         }
 893 }
 894
 895 /**
 896  * amdgpu_device_pci_config_reset - reset the GPU
 897  *
 898  * @adev: amdgpu_device pointer
 899  *
 900  * Resets the GPU using the pci config reset sequence.
 901  * Only applicable to asics prior to vega10.
 902  */
 903 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 904 {
 905         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 906 }
 907
 908 /*
 909  * GPU doorbell aperture helpers function.
 910  */
 911 /**
 912  * amdgpu_device_doorbell_init - Init doorbell driver information.
 913  *
 914  * @adev: amdgpu_device pointer
 915  *
 916  * Init doorbell driver information (CIK)
 917  * Returns 0 on success, error on failure.
 918  */
 919 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 920 {
 921
 922         /* No doorbell on SI hardware generation */
 923         if (adev->asic_type < CHIP_BONAIRE) {
 924                 adev->doorbell.base = 0;
 925                 adev->doorbell.size = 0;
 926                 adev->doorbell.num_doorbells = 0;
 927                 adev->doorbell.ptr = NULL;
 928                 return 0;
 929         }
 930
 931         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 932                 return -EINVAL;
 933
 934         amdgpu_asic_init_doorbell_index(adev);
 935
 936         /* doorbell bar mapping */
 937         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 938         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 939
 940         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 941                                              adev->doorbell_index.max_assignment+1);
 942         if (adev->doorbell.num_doorbells == 0)
 943                 return -EINVAL;
 944
 945         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 946          * paging queue doorbell use the second page. The
 947          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 948          * doorbells are in the first page. So with paging queue enabled,
 949          * the max num_doorbells should + 1 page (0x400 in dword)
 950          */
 951         if (adev->asic_type >= CHIP_VEGA10)
 952                 adev->doorbell.num_doorbells += 0x400;
 953
 954         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 955                                      adev->doorbell.num_doorbells *
 956                                      sizeof(u32));
 957         if (adev->doorbell.ptr == NULL)
 958                 return -ENOMEM;
 959
 960         return 0;
 961 }
 962
 963 /**
 964  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 965  *
 966  * @adev: amdgpu_device pointer
 967  *
 968  * Tear down doorbell driver information (CIK)
 969  */
 970 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 971 {
 972         iounmap(adev->doorbell.ptr);
 973         adev->doorbell.ptr = NULL;
 974 }
 975
 976
 977
 978 /*
 979  * amdgpu_device_wb_*()
 980  * Writeback is the method by which the GPU updates special pages in memory
 981  * with the status of certain GPU events (fences, ring pointers,etc.).
 982  */
 983
 984 /**
 985  * amdgpu_device_wb_fini - Disable Writeback and free memory
 986  *
 987  * @adev: amdgpu_device pointer
 988  *
 989  * Disables Writeback and frees the Writeback memory (all asics).
 990  * Used at driver shutdown.
 991  */
 992 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 993 {
 994         if (adev->wb.wb_obj) {
 995                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
 996                                       &adev->wb.gpu_addr,
 997                                       (void **)&adev->wb.wb);
 998                 adev->wb.wb_obj = NULL;
 999         }
1000 }
1001
1002 /**
1003  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1004  *
1005  * @adev: amdgpu_device pointer
1006  *
1007  * Initializes writeback and allocates writeback memory (all asics).
1008  * Used at driver startup.
1009  * Returns 0 on success or an -error on failure.
1010  */
1011 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1012 {
1013         int r;
1014
1015         if (adev->wb.wb_obj == NULL) {
1016                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1017                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1018                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1019                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1020                                             (void **)&adev->wb.wb);
1021                 if (r) {
1022                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1023                         return r;
1024                 }
1025
1026                 adev->wb.num_wb = AMDGPU_MAX_WB;
1027                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1028
1029                 /* clear wb memory */
1030                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1031         }
1032
1033         return 0;
1034 }
1035
1036 /**
1037  * amdgpu_device_wb_get - Allocate a wb entry
1038  *
1039  * @adev: amdgpu_device pointer
1040  * @wb: wb index
1041  *
1042  * Allocate a wb slot for use by the driver (all asics).
1043  * Returns 0 on success or -EINVAL on failure.
1044  */
1045 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1046 {
1047         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1048
1049         if (offset < adev->wb.num_wb) {
1050                 __set_bit(offset, adev->wb.used);
1051                 *wb = offset << 3; /* convert to dw offset */
1052                 return 0;
1053         } else {
1054                 return -EINVAL;
1055         }
1056 }
1057
1058 /**
1059  * amdgpu_device_wb_free - Free a wb entry
1060  *
1061  * @adev: amdgpu_device pointer
1062  * @wb: wb index
1063  *
1064  * Free a wb slot allocated for use by the driver (all asics)
1065  */
1066 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1067 {
1068         wb >>= 3;
1069         if (wb < adev->wb.num_wb)
1070                 __clear_bit(wb, adev->wb.used);
1071 }
1072
1073 /**
1074  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1075  *
1076  * @adev: amdgpu_device pointer
1077  *
1078  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1079  * to fail, but if any of the BARs is not accessible after the size we abort
1080  * driver loading by returning -ENODEV.
1081  */
1082 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1083 {
1084         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
1085         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
1086         struct pci_bus *root;
1087         struct resource *res;
1088         unsigned i;
1089         u16 cmd;
1090         int r;
1091
1092         /* Bypass for VF */
1093         if (amdgpu_sriov_vf(adev))
1094                 return 0;
1095
1096         /* skip if the bios has already enabled large BAR */
1097         if (adev->gmc.real_vram_size &&
1098             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1099                 return 0;
1100
1101         /* Check if the root BUS has 64bit memory resources */
1102         root = adev->pdev->bus;
1103         while (root->parent)
1104                 root = root->parent;
1105
1106         pci_bus_for_each_resource(root, res, i) {
1107                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1108                     res->start > 0x100000000ull)
1109                         break;
1110         }
1111
1112         /* Trying to resize is pointless without a root hub window above 4GB */
1113         if (!res)
1114                 return 0;
1115
1116         /* Disable memory decoding while we change the BAR addresses and size */
1117         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1118         pci_write_config_word(adev->pdev, PCI_COMMAND,
1119                               cmd & ~PCI_COMMAND_MEMORY);
1120
1121         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1122         amdgpu_device_doorbell_fini(adev);
1123         if (adev->asic_type >= CHIP_BONAIRE)
1124                 pci_release_resource(adev->pdev, 2);
1125
1126         pci_release_resource(adev->pdev, 0);
1127
1128         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1129         if (r == -ENOSPC)
1130                 DRM_INFO("Not enough PCI address space for a large BAR.");
1131         else if (r && r != -ENOTSUPP)
1132                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1133
1134         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1135
1136         /* When the doorbell or fb BAR isn't available we have no chance of
1137          * using the device.
1138          */
1139         r = amdgpu_device_doorbell_init(adev);
1140         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1141                 return -ENODEV;
1142
1143         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1144
1145         return 0;
1146 }
1147
/*
 * GPU helper functions.
 */
1151 /**
1152  * amdgpu_device_need_post - check if the hw need post or not
1153  *
1154  * @adev: amdgpu_device pointer
1155  *
1156  * Check if the asic has been initialized (all asics) at driver startup
1157  * or post is needed if  hw reset is performed.
1158  * Returns true if need or false if not.
1159  */
1160 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1161 {
1162         uint32_t reg;
1163
1164         if (amdgpu_sriov_vf(adev))
1165                 return false;
1166
1167         if (amdgpu_passthrough(adev)) {
1168                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1169                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1170                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1171                  * vpost executed for smc version below 22.15
1172                  */
1173                 if (adev->asic_type == CHIP_FIJI) {
1174                         int err;
1175                         uint32_t fw_ver;
1176                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1177                         /* force vPost if error occured */
1178                         if (err)
1179                                 return true;
1180
1181                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1182                         if (fw_ver < 0x00160e00)
1183                                 return true;
1184                 }
1185         }
1186
1187         if (adev->has_hw_reset) {
1188                 adev->has_hw_reset = false;
1189                 return true;
1190         }
1191
1192         /* bios scratch used on CIK+ */
1193         if (adev->asic_type >= CHIP_BONAIRE)
1194                 return amdgpu_atombios_scratch_need_asic_init(adev);
1195
1196         /* check MEM_SIZE for older asics */
1197         reg = amdgpu_asic_get_config_memsize(adev);
1198
1199         if ((reg != 0) && (reg != 0xffffffff))
1200                 return false;
1201
1202         return true;
1203 }
1204
1205 /* if we get transitioned to only one device, take VGA back */
1206 /**
1207  * amdgpu_device_vga_set_decode - enable/disable vga decode
1208  *
1209  * @cookie: amdgpu_device pointer
1210  * @state: enable/disable vga decode
1211  *
1212  * Enable/disable vga decode (all asics).
1213  * Returns VGA resource flags.
1214  */
1215 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1216 {
1217         struct amdgpu_device *adev = cookie;
1218         amdgpu_asic_set_vga_state(adev, state);
1219         if (state)
1220                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1221                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1222         else
1223                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1224 }
1225
1226 /**
1227  * amdgpu_device_check_block_size - validate the vm block size
1228  *
1229  * @adev: amdgpu_device pointer
1230  *
1231  * Validates the vm block size specified via module parameter.
1232  * The vm block size defines number of bits in page table versus page directory,
1233  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1234  * page table and the remaining bits are in the page directory.
1235  */
1236 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1237 {
1238         /* defines number of bits in page table versus page directory,
1239          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1240          * page table and the remaining bits are in the page directory */
1241         if (amdgpu_vm_block_size == -1)
1242                 return;
1243
1244         if (amdgpu_vm_block_size < 9) {
1245                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1246                          amdgpu_vm_block_size);
1247                 amdgpu_vm_block_size = -1;
1248         }
1249 }
1250
1251 /**
1252  * amdgpu_device_check_vm_size - validate the vm size
1253  *
1254  * @adev: amdgpu_device pointer
1255  *
1256  * Validates the vm size in GB specified via module parameter.
1257  * The VM size is the size of the GPU virtual memory space in GB.
1258  */
1259 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1260 {
1261         /* no need to check the default value */
1262         if (amdgpu_vm_size == -1)
1263                 return;
1264
1265         if (amdgpu_vm_size < 1) {
1266                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1267                          amdgpu_vm_size);
1268                 amdgpu_vm_size = -1;
1269         }
1270 }
1271
1272 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1273 {
1274         struct sysinfo si;
1275         bool is_os_64 = (sizeof(void *) == 8);
1276         uint64_t total_memory;
1277         uint64_t dram_size_seven_GB = 0x1B8000000;
1278         uint64_t dram_size_three_GB = 0xB8000000;
1279
1280         if (amdgpu_smu_memory_pool_size == 0)
1281                 return;
1282
1283         if (!is_os_64) {
1284                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1285                 goto def_value;
1286         }
1287         si_meminfo(&si);
1288         total_memory = (uint64_t)si.totalram * si.mem_unit;
1289
1290         if ((amdgpu_smu_memory_pool_size == 1) ||
1291                 (amdgpu_smu_memory_pool_size == 2)) {
1292                 if (total_memory < dram_size_three_GB)
1293                         goto def_value1;
1294         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1295                 (amdgpu_smu_memory_pool_size == 8)) {
1296                 if (total_memory < dram_size_seven_GB)
1297                         goto def_value1;
1298         } else {
1299                 DRM_WARN("Smu memory pool size not supported\n");
1300                 goto def_value;
1301         }
1302         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1303
1304         return;
1305
1306 def_value1:
1307         DRM_WARN("No enough system memory\n");
1308 def_value:
1309         adev->pm.smu_prv_buffer_size = 0;
1310 }
1311
1312 /**
1313  * amdgpu_device_check_arguments - validate module params
1314  *
1315  * @adev: amdgpu_device pointer
1316  *
1317  * Validates certain module parameters and updates
1318  * the associated values used by the driver (all asics).
1319  */
1320 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1321 {
1322         if (amdgpu_sched_jobs < 4) {
1323                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1324                          amdgpu_sched_jobs);
1325                 amdgpu_sched_jobs = 4;
1326         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1327                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1328                          amdgpu_sched_jobs);
1329                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1330         }
1331
1332         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1333                 /* gart size must be greater or equal to 32M */
1334                 dev_warn(adev->dev, "gart size (%d) too small\n",
1335                          amdgpu_gart_size);
1336                 amdgpu_gart_size = -1;
1337         }
1338
1339         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1340                 /* gtt size must be greater or equal to 32M */
1341                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1342                                  amdgpu_gtt_size);
1343                 amdgpu_gtt_size = -1;
1344         }
1345
1346         /* valid range is between 4 and 9 inclusive */
1347         if (amdgpu_vm_fragment_size != -1 &&
1348             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1349                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1350                 amdgpu_vm_fragment_size = -1;
1351         }
1352
1353         if (amdgpu_sched_hw_submission < 2) {
1354                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1355                          amdgpu_sched_hw_submission);
1356                 amdgpu_sched_hw_submission = 2;
1357         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1358                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1359                          amdgpu_sched_hw_submission);
1360                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1361         }
1362
1363         amdgpu_device_check_smu_prv_buffer_size(adev);
1364
1365         amdgpu_device_check_vm_size(adev);
1366
1367         amdgpu_device_check_block_size(adev);
1368
1369         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1370
1371         amdgpu_gmc_tmz_set(adev);
1372
1373         if (amdgpu_num_kcq == -1) {
1374                 amdgpu_num_kcq = 8;
1375         } else if (amdgpu_num_kcq > 8 || amdgpu_num_kcq < 0) {
1376                 amdgpu_num_kcq = 8;
1377                 dev_warn(adev->dev, "set kernel compute queue number to 8 due to invalid parameter provided by user\n");
1378         }
1379
1380         amdgpu_gmc_noretry_set(adev);
1381
1382         return 0;
1383 }
1384
1385 /**
1386  * amdgpu_switcheroo_set_state - set switcheroo state
1387  *
1388  * @pdev: pci dev pointer
1389  * @state: vga_switcheroo state
1390  *
1391  * Callback for the switcheroo driver.  Suspends or resumes the
1392  * the asics before or after it is powered up using ACPI methods.
1393  */
1394 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1395                                         enum vga_switcheroo_state state)
1396 {
1397         struct drm_device *dev = pci_get_drvdata(pdev);
1398         int r;
1399
1400         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1401                 return;
1402
1403         if (state == VGA_SWITCHEROO_ON) {
1404                 pr_info("switched on\n");
1405                 /* don't suspend or resume card normally */
1406                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1407
1408                 pci_set_power_state(dev->pdev, PCI_D0);
1409                 amdgpu_device_load_pci_state(dev->pdev);
1410                 r = pci_enable_device(dev->pdev);
1411                 if (r)
1412                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1413                 amdgpu_device_resume(dev, true);
1414
1415                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1416                 drm_kms_helper_poll_enable(dev);
1417         } else {
1418                 pr_info("switched off\n");
1419                 drm_kms_helper_poll_disable(dev);
1420                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1421                 amdgpu_device_suspend(dev, true);
1422                 amdgpu_device_cache_pci_state(dev->pdev);
1423                 /* Shut down the device */
1424                 pci_disable_device(dev->pdev);
1425                 pci_set_power_state(dev->pdev, PCI_D3cold);
1426                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1427         }
1428 }
1429
1430 /**
1431  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1432  *
1433  * @pdev: pci dev pointer
1434  *
1435  * Callback for the switcheroo driver.  Check of the switcheroo
1436  * state can be changed.
1437  * Returns true if the state can be changed, false if not.
1438  */
1439 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1440 {
1441         struct drm_device *dev = pci_get_drvdata(pdev);
1442
1443         /*
1444         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1445         * locking inversion with the driver load path. And the access here is
1446         * completely racy anyway. So don't bother with locking for now.
1447         */
1448         return atomic_read(&dev->open_count) == 0;
1449 }
1450
1451 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1452         .set_gpu_state = amdgpu_switcheroo_set_state,
1453         .reprobe = NULL,
1454         .can_switch = amdgpu_switcheroo_can_switch,
1455 };
1456
1457 /**
1458  * amdgpu_device_ip_set_clockgating_state - set the CG state
1459  *
1460  * @dev: amdgpu_device pointer
1461  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1462  * @state: clockgating state (gate or ungate)
1463  *
1464  * Sets the requested clockgating state for all instances of
1465  * the hardware IP specified.
1466  * Returns the error code from the last instance.
1467  */
1468 int amdgpu_device_ip_set_clockgating_state(void *dev,
1469                                            enum amd_ip_block_type block_type,
1470                                            enum amd_clockgating_state state)
1471 {
1472         struct amdgpu_device *adev = dev;
1473         int i, r = 0;
1474
1475         for (i = 0; i < adev->num_ip_blocks; i++) {
1476                 if (!adev->ip_blocks[i].status.valid)
1477                         continue;
1478                 if (adev->ip_blocks[i].version->type != block_type)
1479                         continue;
1480                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1481                         continue;
1482                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1483                         (void *)adev, state);
1484                 if (r)
1485                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1486                                   adev->ip_blocks[i].version->funcs->name, r);
1487         }
1488         return r;
1489 }
1490
1491 /**
1492  * amdgpu_device_ip_set_powergating_state - set the PG state
1493  *
1494  * @dev: amdgpu_device pointer
1495  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1496  * @state: powergating state (gate or ungate)
1497  *
1498  * Sets the requested powergating state for all instances of
1499  * the hardware IP specified.
1500  * Returns the error code from the last instance.
1501  */
1502 int amdgpu_device_ip_set_powergating_state(void *dev,
1503                                            enum amd_ip_block_type block_type,
1504                                            enum amd_powergating_state state)
1505 {
1506         struct amdgpu_device *adev = dev;
1507         int i, r = 0;
1508
1509         for (i = 0; i < adev->num_ip_blocks; i++) {
1510                 if (!adev->ip_blocks[i].status.valid)
1511                         continue;
1512                 if (adev->ip_blocks[i].version->type != block_type)
1513                         continue;
1514                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1515                         continue;
1516                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1517                         (void *)adev, state);
1518                 if (r)
1519                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1520                                   adev->ip_blocks[i].version->funcs->name, r);
1521         }
1522         return r;
1523 }
1524
1525 /**
1526  * amdgpu_device_ip_get_clockgating_state - get the CG state
1527  *
1528  * @adev: amdgpu_device pointer
1529  * @flags: clockgating feature flags
1530  *
1531  * Walks the list of IPs on the device and updates the clockgating
1532  * flags for each IP.
1533  * Updates @flags with the feature flags for each hardware IP where
1534  * clockgating is enabled.
1535  */
1536 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1537                                             u32 *flags)
1538 {
1539         int i;
1540
1541         for (i = 0; i < adev->num_ip_blocks; i++) {
1542                 if (!adev->ip_blocks[i].status.valid)
1543                         continue;
1544                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1545                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1546         }
1547 }
1548
1549 /**
1550  * amdgpu_device_ip_wait_for_idle - wait for idle
1551  *
1552  * @adev: amdgpu_device pointer
1553  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1554  *
1555  * Waits for the request hardware IP to be idle.
1556  * Returns 0 for success or a negative error code on failure.
1557  */
1558 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1559                                    enum amd_ip_block_type block_type)
1560 {
1561         int i, r;
1562
1563         for (i = 0; i < adev->num_ip_blocks; i++) {
1564                 if (!adev->ip_blocks[i].status.valid)
1565                         continue;
1566                 if (adev->ip_blocks[i].version->type == block_type) {
1567                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1568                         if (r)
1569                                 return r;
1570                         break;
1571                 }
1572         }
1573         return 0;
1574
1575 }
1576
1577 /**
1578  * amdgpu_device_ip_is_idle - is the hardware IP idle
1579  *
1580  * @adev: amdgpu_device pointer
1581  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1582  *
1583  * Check if the hardware IP is idle or not.
1584  * Returns true if it the IP is idle, false if not.
1585  */
1586 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1587                               enum amd_ip_block_type block_type)
1588 {
1589         int i;
1590
1591         for (i = 0; i < adev->num_ip_blocks; i++) {
1592                 if (!adev->ip_blocks[i].status.valid)
1593                         continue;
1594                 if (adev->ip_blocks[i].version->type == block_type)
1595                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1596         }
1597         return true;
1598
1599 }
1600
1601 /**
1602  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1603  *
1604  * @adev: amdgpu_device pointer
1605  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1606  *
1607  * Returns a pointer to the hardware IP block structure
1608  * if it exists for the asic, otherwise NULL.
1609  */
1610 struct amdgpu_ip_block *
1611 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1612                               enum amd_ip_block_type type)
1613 {
1614         int i;
1615
1616         for (i = 0; i < adev->num_ip_blocks; i++)
1617                 if (adev->ip_blocks[i].version->type == type)
1618                         return &adev->ip_blocks[i];
1619
1620         return NULL;
1621 }
1622
1623 /**
1624  * amdgpu_device_ip_block_version_cmp
1625  *
1626  * @adev: amdgpu_device pointer
1627  * @type: enum amd_ip_block_type
1628  * @major: major version
1629  * @minor: minor version
1630  *
1631  * return 0 if equal or greater
1632  * return 1 if smaller or the ip_block doesn't exist
1633  */
1634 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1635                                        enum amd_ip_block_type type,
1636                                        u32 major, u32 minor)
1637 {
1638         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1639
1640         if (ip_block && ((ip_block->version->major > major) ||
1641                         ((ip_block->version->major == major) &&
1642                         (ip_block->version->minor >= minor))))
1643                 return 0;
1644
1645         return 1;
1646 }
1647
1648 /**
1649  * amdgpu_device_ip_block_add
1650  *
1651  * @adev: amdgpu_device pointer
1652  * @ip_block_version: pointer to the IP to add
1653  *
1654  * Adds the IP block driver information to the collection of IPs
1655  * on the asic.
1656  */
1657 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1658                                const struct amdgpu_ip_block_version *ip_block_version)
1659 {
1660         if (!ip_block_version)
1661                 return -EINVAL;
1662
1663         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1664                   ip_block_version->funcs->name);
1665
1666         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1667
1668         return 0;
1669 }
1670
1671 /**
1672  * amdgpu_device_enable_virtual_display - enable virtual display feature
1673  *
1674  * @adev: amdgpu_device pointer
1675  *
1676  * Enabled the virtual display feature if the user has enabled it via
1677  * the module parameter virtual_display.  This feature provides a virtual
1678  * display hardware on headless boards or in virtualized environments.
1679  * This function parses and validates the configuration string specified by
1680  * the user and configues the virtual display configuration (number of
1681  * virtual connectors, crtcs, etc.) specified.
1682  */
1683 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1684 {
1685         adev->enable_virtual_display = false;
1686
1687         if (amdgpu_virtual_display) {
1688                 struct drm_device *ddev = adev_to_drm(adev);
1689                 const char *pci_address_name = pci_name(ddev->pdev);
1690                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1691
1692                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1693                 pciaddstr_tmp = pciaddstr;
1694                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1695                         pciaddname = strsep(&pciaddname_tmp, ",");
1696                         if (!strcmp("all", pciaddname)
1697                             || !strcmp(pci_address_name, pciaddname)) {
1698                                 long num_crtc;
1699                                 int res = -1;
1700
1701                                 adev->enable_virtual_display = true;
1702
1703                                 if (pciaddname_tmp)
1704                                         res = kstrtol(pciaddname_tmp, 10,
1705                                                       &num_crtc);
1706
1707                                 if (!res) {
1708                                         if (num_crtc < 1)
1709                                                 num_crtc = 1;
1710                                         if (num_crtc > 6)
1711                                                 num_crtc = 6;
1712                                         adev->mode_info.num_crtc = num_crtc;
1713                                 } else {
1714                                         adev->mode_info.num_crtc = 1;
1715                                 }
1716                                 break;
1717                         }
1718                 }
1719
1720                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1721                          amdgpu_virtual_display, pci_address_name,
1722                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1723
1724                 kfree(pciaddstr);
1725         }
1726 }
1727
1728 /**
1729  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1730  *
1731  * @adev: amdgpu_device pointer
1732  *
1733  * Parses the asic configuration parameters specified in the gpu info
1734  * firmware and makes them availale to the driver for use in configuring
1735  * the asic.
1736  * Returns 0 on success, -EINVAL on failure.
1737  */
1738 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1739 {
1740         const char *chip_name;
1741         char fw_name[40];
1742         int err;
1743         const struct gpu_info_firmware_header_v1_0 *hdr;
1744
1745         adev->firmware.gpu_info_fw = NULL;
1746
1747         if (adev->mman.discovery_bin) {
1748                 amdgpu_discovery_get_gfx_info(adev);
1749
1750                 /*
1751                  * FIXME: The bounding box is still needed by Navi12, so
1752                  * temporarily read it from gpu_info firmware. Should be droped
1753                  * when DAL no longer needs it.
1754                  */
1755                 if (adev->asic_type != CHIP_NAVI12)
1756                         return 0;
1757         }
1758
1759         switch (adev->asic_type) {
1760 #ifdef CONFIG_DRM_AMDGPU_SI
1761         case CHIP_VERDE:
1762         case CHIP_TAHITI:
1763         case CHIP_PITCAIRN:
1764         case CHIP_OLAND:
1765         case CHIP_HAINAN:
1766 #endif
1767 #ifdef CONFIG_DRM_AMDGPU_CIK
1768         case CHIP_BONAIRE:
1769         case CHIP_HAWAII:
1770         case CHIP_KAVERI:
1771         case CHIP_KABINI:
1772         case CHIP_MULLINS:
1773 #endif
1774         case CHIP_TOPAZ:
1775         case CHIP_TONGA:
1776         case CHIP_FIJI:
1777         case CHIP_POLARIS10:
1778         case CHIP_POLARIS11:
1779         case CHIP_POLARIS12:
1780         case CHIP_VEGAM:
1781         case CHIP_CARRIZO:
1782         case CHIP_STONEY:
1783         case CHIP_VEGA20:
1784         case CHIP_SIENNA_CICHLID:
1785         case CHIP_NAVY_FLOUNDER:
1786         default:
1787                 return 0;
1788         case CHIP_VEGA10:
1789                 chip_name = "vega10";
1790                 break;
1791         case CHIP_VEGA12:
1792                 chip_name = "vega12";
1793                 break;
1794         case CHIP_RAVEN:
1795                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1796                         chip_name = "raven2";
1797                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1798                         chip_name = "picasso";
1799                 else
1800                         chip_name = "raven";
1801                 break;
1802         case CHIP_ARCTURUS:
1803                 chip_name = "arcturus";
1804                 break;
1805         case CHIP_RENOIR:
1806                 chip_name = "renoir";
1807                 break;
1808         case CHIP_NAVI10:
1809                 chip_name = "navi10";
1810                 break;
1811         case CHIP_NAVI14:
1812                 chip_name = "navi14";
1813                 break;
1814         case CHIP_NAVI12:
1815                 chip_name = "navi12";
1816                 break;
1817         }
1818
1819         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1820         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1821         if (err) {
1822                 dev_err(adev->dev,
1823                         "Failed to load gpu_info firmware \"%s\"\n",
1824                         fw_name);
1825                 goto out;
1826         }
1827         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1828         if (err) {
1829                 dev_err(adev->dev,
1830                         "Failed to validate gpu_info firmware \"%s\"\n",
1831                         fw_name);
1832                 goto out;
1833         }
1834
1835         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1836         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1837
1838         switch (hdr->version_major) {
1839         case 1:
1840         {
1841                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1842                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1843                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1844
1845                 /*
1846                  * Should be droped when DAL no longer needs it.
1847                  */
1848                 if (adev->asic_type == CHIP_NAVI12)
1849                         goto parse_soc_bounding_box;
1850
1851                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1852                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1853                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1854                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1855                 adev->gfx.config.max_texture_channel_caches =
1856                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1857                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1858                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1859                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1860                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1861                 adev->gfx.config.double_offchip_lds_buf =
1862                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1863                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1864                 adev->gfx.cu_info.max_waves_per_simd =
1865                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1866                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1867                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1868                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1869                 if (hdr->version_minor >= 1) {
1870                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1871                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1872                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1873                         adev->gfx.config.num_sc_per_sh =
1874                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1875                         adev->gfx.config.num_packer_per_sc =
1876                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1877                 }
1878
1879 parse_soc_bounding_box:
1880                 /*
1881                  * soc bounding box info is not integrated in disocovery table,
1882                  * we always need to parse it from gpu info firmware if needed.
1883                  */
1884                 if (hdr->version_minor == 2) {
1885                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1886                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1887                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1888                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1889                 }
1890                 break;
1891         }
1892         default:
1893                 dev_err(adev->dev,
1894                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1895                 err = -EINVAL;
1896                 goto out;
1897         }
1898 out:
1899         return err;
1900 }
1901
1902 /**
1903  * amdgpu_device_ip_early_init - run early init for hardware IPs
1904  *
1905  * @adev: amdgpu_device pointer
1906  *
1907  * Early initialization pass for hardware IPs.  The hardware IPs that make
1908  * up each asic are discovered each IP's early_init callback is run.  This
1909  * is the first stage in initializing the asic.
1910  * Returns 0 on success, negative error code on failure.
1911  */
1912 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1913 {
1914         int i, r;
1915
1916         amdgpu_device_enable_virtual_display(adev);
1917
1918         if (amdgpu_sriov_vf(adev)) {
1919                 r = amdgpu_virt_request_full_gpu(adev, true);
1920                 if (r)
1921                         return r;
1922         }
1923
1924         switch (adev->asic_type) {
1925 #ifdef CONFIG_DRM_AMDGPU_SI
1926         case CHIP_VERDE:
1927         case CHIP_TAHITI:
1928         case CHIP_PITCAIRN:
1929         case CHIP_OLAND:
1930         case CHIP_HAINAN:
1931                 adev->family = AMDGPU_FAMILY_SI;
1932                 r = si_set_ip_blocks(adev);
1933                 if (r)
1934                         return r;
1935                 break;
1936 #endif
1937 #ifdef CONFIG_DRM_AMDGPU_CIK
1938         case CHIP_BONAIRE:
1939         case CHIP_HAWAII:
1940         case CHIP_KAVERI:
1941         case CHIP_KABINI:
1942         case CHIP_MULLINS:
1943                 if (adev->flags & AMD_IS_APU)
1944                         adev->family = AMDGPU_FAMILY_KV;
1945                 else
1946                         adev->family = AMDGPU_FAMILY_CI;
1947
1948                 r = cik_set_ip_blocks(adev);
1949                 if (r)
1950                         return r;
1951                 break;
1952 #endif
1953         case CHIP_TOPAZ:
1954         case CHIP_TONGA:
1955         case CHIP_FIJI:
1956         case CHIP_POLARIS10:
1957         case CHIP_POLARIS11:
1958         case CHIP_POLARIS12:
1959         case CHIP_VEGAM:
1960         case CHIP_CARRIZO:
1961         case CHIP_STONEY:
1962                 if (adev->flags & AMD_IS_APU)
1963                         adev->family = AMDGPU_FAMILY_CZ;
1964                 else
1965                         adev->family = AMDGPU_FAMILY_VI;
1966
1967                 r = vi_set_ip_blocks(adev);
1968                 if (r)
1969                         return r;
1970                 break;
1971         case CHIP_VEGA10:
1972         case CHIP_VEGA12:
1973         case CHIP_VEGA20:
1974         case CHIP_RAVEN:
1975         case CHIP_ARCTURUS:
1976         case CHIP_RENOIR:
1977                 if (adev->flags & AMD_IS_APU)
1978                         adev->family = AMDGPU_FAMILY_RV;
1979                 else
1980                         adev->family = AMDGPU_FAMILY_AI;
1981
1982                 r = soc15_set_ip_blocks(adev);
1983                 if (r)
1984                         return r;
1985                 break;
1986         case  CHIP_NAVI10:
1987         case  CHIP_NAVI14:
1988         case  CHIP_NAVI12:
1989         case  CHIP_SIENNA_CICHLID:
1990         case  CHIP_NAVY_FLOUNDER:
1991                 adev->family = AMDGPU_FAMILY_NV;
1992
1993                 r = nv_set_ip_blocks(adev);
1994                 if (r)
1995                         return r;
1996                 break;
1997         default:
1998                 /* FIXME: not supported yet */
1999                 return -EINVAL;
2000         }
2001
2002         amdgpu_amdkfd_device_probe(adev);
2003
2004         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2005         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2006                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2007
2008         for (i = 0; i < adev->num_ip_blocks; i++) {
2009                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2010                         DRM_ERROR("disabled ip block: %d <%s>\n",
2011                                   i, adev->ip_blocks[i].version->funcs->name);
2012                         adev->ip_blocks[i].status.valid = false;
2013                 } else {
2014                         if (adev->ip_blocks[i].version->funcs->early_init) {
2015                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2016                                 if (r == -ENOENT) {
2017                                         adev->ip_blocks[i].status.valid = false;
2018                                 } else if (r) {
2019                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2020                                                   adev->ip_blocks[i].version->funcs->name, r);
2021                                         return r;
2022                                 } else {
2023                                         adev->ip_blocks[i].status.valid = true;
2024                                 }
2025                         } else {
2026                                 adev->ip_blocks[i].status.valid = true;
2027                         }
2028                 }
2029                 /* get the vbios after the asic_funcs are set up */
2030                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2031                         r = amdgpu_device_parse_gpu_info_fw(adev);
2032                         if (r)
2033                                 return r;
2034
2035                         /* Read BIOS */
2036                         if (!amdgpu_get_bios(adev))
2037                                 return -EINVAL;
2038
2039                         r = amdgpu_atombios_init(adev);
2040                         if (r) {
2041                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2042                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2043                                 return r;
2044                         }
2045                 }
2046         }
2047
2048         adev->cg_flags &= amdgpu_cg_mask;
2049         adev->pg_flags &= amdgpu_pg_mask;
2050
2051         return 0;
2052 }
2053
2054 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2055 {
2056         int i, r;
2057
2058         for (i = 0; i < adev->num_ip_blocks; i++) {
2059                 if (!adev->ip_blocks[i].status.sw)
2060                         continue;
2061                 if (adev->ip_blocks[i].status.hw)
2062                         continue;
2063                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2064                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2065                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2066                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2067                         if (r) {
2068                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2069                                           adev->ip_blocks[i].version->funcs->name, r);
2070                                 return r;
2071                         }
2072                         adev->ip_blocks[i].status.hw = true;
2073                 }
2074         }
2075
2076         return 0;
2077 }
2078
2079 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2080 {
2081         int i, r;
2082
2083         for (i = 0; i < adev->num_ip_blocks; i++) {
2084                 if (!adev->ip_blocks[i].status.sw)
2085                         continue;
2086                 if (adev->ip_blocks[i].status.hw)
2087                         continue;
2088                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2089                 if (r) {
2090                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2091                                   adev->ip_blocks[i].version->funcs->name, r);
2092                         return r;
2093                 }
2094                 adev->ip_blocks[i].status.hw = true;
2095         }
2096
2097         return 0;
2098 }
2099
2100 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2101 {
2102         int r = 0;
2103         int i;
2104         uint32_t smu_version;
2105
2106         if (adev->asic_type >= CHIP_VEGA10) {
2107                 for (i = 0; i < adev->num_ip_blocks; i++) {
2108                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2109                                 continue;
2110
2111                         /* no need to do the fw loading again if already done*/
2112                         if (adev->ip_blocks[i].status.hw == true)
2113                                 break;
2114
2115                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2116                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2117                                 if (r) {
2118                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2119                                                           adev->ip_blocks[i].version->funcs->name, r);
2120                                         return r;
2121                                 }
2122                         } else {
2123                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2124                                 if (r) {
2125                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2126                                                           adev->ip_blocks[i].version->funcs->name, r);
2127                                         return r;
2128                                 }
2129                         }
2130
2131                         adev->ip_blocks[i].status.hw = true;
2132                         break;
2133                 }
2134         }
2135
2136         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2137                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2138
2139         return r;
2140 }
2141
2142 /**
2143  * amdgpu_device_ip_init - run init for hardware IPs
2144  *
2145  * @adev: amdgpu_device pointer
2146  *
2147  * Main initialization pass for hardware IPs.  The list of all the hardware
2148  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2149  * are run.  sw_init initializes the software state associated with each IP
2150  * and hw_init initializes the hardware associated with each IP.
2151  * Returns 0 on success, negative error code on failure.
2152  */
2153 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2154 {
2155         int i, r;
2156
2157         r = amdgpu_ras_init(adev);
2158         if (r)
2159                 return r;
2160
2161         for (i = 0; i < adev->num_ip_blocks; i++) {
2162                 if (!adev->ip_blocks[i].status.valid)
2163                         continue;
2164                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2165                 if (r) {
2166                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2167                                   adev->ip_blocks[i].version->funcs->name, r);
2168                         goto init_failed;
2169                 }
2170                 adev->ip_blocks[i].status.sw = true;
2171
2172                 /* need to do gmc hw init early so we can allocate gpu mem */
2173                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2174                         r = amdgpu_device_vram_scratch_init(adev);
2175                         if (r) {
2176                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2177                                 goto init_failed;
2178                         }
2179                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2180                         if (r) {
2181                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2182                                 goto init_failed;
2183                         }
2184                         r = amdgpu_device_wb_init(adev);
2185                         if (r) {
2186                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2187                                 goto init_failed;
2188                         }
2189                         adev->ip_blocks[i].status.hw = true;
2190
2191                         /* right after GMC hw init, we create CSA */
2192                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2193                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2194                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2195                                                                 AMDGPU_CSA_SIZE);
2196                                 if (r) {
2197                                         DRM_ERROR("allocate CSA failed %d\n", r);
2198                                         goto init_failed;
2199                                 }
2200                         }
2201                 }
2202         }
2203
2204         if (amdgpu_sriov_vf(adev))
2205                 amdgpu_virt_init_data_exchange(adev);
2206
2207         r = amdgpu_ib_pool_init(adev);
2208         if (r) {
2209                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2210                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2211                 goto init_failed;
2212         }
2213
2214         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2215         if (r)
2216                 goto init_failed;
2217
2218         r = amdgpu_device_ip_hw_init_phase1(adev);
2219         if (r)
2220                 goto init_failed;
2221
2222         r = amdgpu_device_fw_loading(adev);
2223         if (r)
2224                 goto init_failed;
2225
2226         r = amdgpu_device_ip_hw_init_phase2(adev);
2227         if (r)
2228                 goto init_failed;
2229
2230         /*
2231          * retired pages will be loaded from eeprom and reserved here,
2232          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2233          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2234          * for I2C communication which only true at this point.
2235          *
2236          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2237          * failure from bad gpu situation and stop amdgpu init process
2238          * accordingly. For other failed cases, it will still release all
2239          * the resource and print error message, rather than returning one
2240          * negative value to upper level.
2241          *
2242          * Note: theoretically, this should be called before all vram allocations
2243          * to protect retired page from abusing
2244          */
2245         r = amdgpu_ras_recovery_init(adev);
2246         if (r)
2247                 goto init_failed;
2248
2249         if (adev->gmc.xgmi.num_physical_nodes > 1)
2250                 amdgpu_xgmi_add_device(adev);
2251         amdgpu_amdkfd_device_init(adev);
2252
2253         amdgpu_fru_get_product_info(adev);
2254
2255 init_failed:
2256         if (amdgpu_sriov_vf(adev))
2257                 amdgpu_virt_release_full_gpu(adev, true);
2258
2259         return r;
2260 }
2261
2262 /**
2263  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2264  *
2265  * @adev: amdgpu_device pointer
2266  *
2267  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2268  * this function before a GPU reset.  If the value is retained after a
2269  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2270  */
2271 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2272 {
2273         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2274 }
2275
2276 /**
2277  * amdgpu_device_check_vram_lost - check if vram is valid
2278  *
2279  * @adev: amdgpu_device pointer
2280  *
2281  * Checks the reset magic value written to the gart pointer in VRAM.
2282  * The driver calls this after a GPU reset to see if the contents of
2283  * VRAM is lost or now.
2284  * returns true if vram is lost, false if not.
2285  */
2286 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2287 {
2288         if (memcmp(adev->gart.ptr, adev->reset_magic,
2289                         AMDGPU_RESET_MAGIC_NUM))
2290                 return true;
2291
2292         if (!amdgpu_in_reset(adev))
2293                 return false;
2294
2295         /*
2296          * For all ASICs with baco/mode1 reset, the VRAM is
2297          * always assumed to be lost.
2298          */
2299         switch (amdgpu_asic_reset_method(adev)) {
2300         case AMD_RESET_METHOD_BACO:
2301         case AMD_RESET_METHOD_MODE1:
2302                 return true;
2303         default:
2304                 return false;
2305         }
2306 }
2307
2308 /**
2309  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2310  *
2311  * @adev: amdgpu_device pointer
2312  * @state: clockgating state (gate or ungate)
2313  *
2314  * The list of all the hardware IPs that make up the asic is walked and the
2315  * set_clockgating_state callbacks are run.
2316  * Late initialization pass enabling clockgating for hardware IPs.
2317  * Fini or suspend, pass disabling clockgating for hardware IPs.
2318  * Returns 0 on success, negative error code on failure.
2319  */
2320
2321 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2322                                                 enum amd_clockgating_state state)
2323 {
2324         int i, j, r;
2325
2326         if (amdgpu_emu_mode == 1)
2327                 return 0;
2328
2329         for (j = 0; j < adev->num_ip_blocks; j++) {
2330                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2331                 if (!adev->ip_blocks[i].status.late_initialized)
2332                         continue;
2333                 /* skip CG for VCE/UVD, it's handled specially */
2334                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2335                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2336                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2337                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2338                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2339                         /* enable clockgating to save power */
2340                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2341                                                                                      state);
2342                         if (r) {
2343                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2344                                           adev->ip_blocks[i].version->funcs->name, r);
2345                                 return r;
2346                         }
2347                 }
2348         }
2349
2350         return 0;
2351 }
2352
2353 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2354 {
2355         int i, j, r;
2356
2357         if (amdgpu_emu_mode == 1)
2358                 return 0;
2359
2360         for (j = 0; j < adev->num_ip_blocks; j++) {
2361                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2362                 if (!adev->ip_blocks[i].status.late_initialized)
2363                         continue;
2364                 /* skip CG for VCE/UVD, it's handled specially */
2365                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2366                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2367                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2368                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2369                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2370                         /* enable powergating to save power */
2371                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2372                                                                                         state);
2373                         if (r) {
2374                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2375                                           adev->ip_blocks[i].version->funcs->name, r);
2376                                 return r;
2377                         }
2378                 }
2379         }
2380         return 0;
2381 }
2382
2383 static int amdgpu_device_enable_mgpu_fan_boost(void)
2384 {
2385         struct amdgpu_gpu_instance *gpu_ins;
2386         struct amdgpu_device *adev;
2387         int i, ret = 0;
2388
2389         mutex_lock(&mgpu_info.mutex);
2390
2391         /*
2392          * MGPU fan boost feature should be enabled
2393          * only when there are two or more dGPUs in
2394          * the system
2395          */
2396         if (mgpu_info.num_dgpu < 2)
2397                 goto out;
2398
2399         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2400                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2401                 adev = gpu_ins->adev;
2402                 if (!(adev->flags & AMD_IS_APU) &&
2403                     !gpu_ins->mgpu_fan_enabled) {
2404                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2405                         if (ret)
2406                                 break;
2407
2408                         gpu_ins->mgpu_fan_enabled = 1;
2409                 }
2410         }
2411
2412 out:
2413         mutex_unlock(&mgpu_info.mutex);
2414
2415         return ret;
2416 }
2417
2418 /**
2419  * amdgpu_device_ip_late_init - run late init for hardware IPs
2420  *
2421  * @adev: amdgpu_device pointer
2422  *
2423  * Late initialization pass for hardware IPs.  The list of all the hardware
2424  * IPs that make up the asic is walked and the late_init callbacks are run.
2425  * late_init covers any special initialization that an IP requires
2426  * after all of the have been initialized or something that needs to happen
2427  * late in the init process.
2428  * Returns 0 on success, negative error code on failure.
2429  */
2430 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2431 {
2432         struct amdgpu_gpu_instance *gpu_instance;
2433         int i = 0, r;
2434
2435         for (i = 0; i < adev->num_ip_blocks; i++) {
2436                 if (!adev->ip_blocks[i].status.hw)
2437                         continue;
2438                 if (adev->ip_blocks[i].version->funcs->late_init) {
2439                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2440                         if (r) {
2441                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2442                                           adev->ip_blocks[i].version->funcs->name, r);
2443                                 return r;
2444                         }
2445                 }
2446                 adev->ip_blocks[i].status.late_initialized = true;
2447         }
2448
2449         amdgpu_ras_set_error_query_ready(adev, true);
2450
2451         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2452         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2453
2454         amdgpu_device_fill_reset_magic(adev);
2455
2456         r = amdgpu_device_enable_mgpu_fan_boost();
2457         if (r)
2458                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2459
2460
2461         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2462                 mutex_lock(&mgpu_info.mutex);
2463
2464                 /*
2465                  * Reset device p-state to low as this was booted with high.
2466                  *
2467                  * This should be performed only after all devices from the same
2468                  * hive get initialized.
2469                  *
2470                  * However, it's unknown how many device in the hive in advance.
2471                  * As this is counted one by one during devices initializations.
2472                  *
2473                  * So, we wait for all XGMI interlinked devices initialized.
2474                  * This may bring some delays as those devices may come from
2475                  * different hives. But that should be OK.
2476                  */
2477                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2478                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2479                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2480                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2481                                         continue;
2482
2483                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2484                                                 AMDGPU_XGMI_PSTATE_MIN);
2485                                 if (r) {
2486                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2487                                         break;
2488                                 }
2489                         }
2490                 }
2491
2492                 mutex_unlock(&mgpu_info.mutex);
2493         }
2494
2495         return 0;
2496 }
2497
2498 /**
2499  * amdgpu_device_ip_fini - run fini for hardware IPs
2500  *
2501  * @adev: amdgpu_device pointer
2502  *
2503  * Main teardown pass for hardware IPs.  The list of all the hardware
2504  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2505  * are run.  hw_fini tears down the hardware associated with each IP
2506  * and sw_fini tears down any software state associated with each IP.
2507  * Returns 0 on success, negative error code on failure.
2508  */
2509 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2510 {
2511         int i, r;
2512
2513         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2514                 amdgpu_virt_release_ras_err_handler_data(adev);
2515
2516         amdgpu_ras_pre_fini(adev);
2517
2518         if (adev->gmc.xgmi.num_physical_nodes > 1)
2519                 amdgpu_xgmi_remove_device(adev);
2520
2521         amdgpu_amdkfd_device_fini(adev);
2522
2523         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2524         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2525
2526         /* need to disable SMC first */
2527         for (i = 0; i < adev->num_ip_blocks; i++) {
2528                 if (!adev->ip_blocks[i].status.hw)
2529                         continue;
2530                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2531                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2532                         /* XXX handle errors */
2533                         if (r) {
2534                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2535                                           adev->ip_blocks[i].version->funcs->name, r);
2536                         }
2537                         adev->ip_blocks[i].status.hw = false;
2538                         break;
2539                 }
2540         }
2541
2542         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2543                 if (!adev->ip_blocks[i].status.hw)
2544                         continue;
2545
2546                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2547                 /* XXX handle errors */
2548                 if (r) {
2549                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2550                                   adev->ip_blocks[i].version->funcs->name, r);
2551                 }
2552
2553                 adev->ip_blocks[i].status.hw = false;
2554         }
2555
2556
2557         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2558                 if (!adev->ip_blocks[i].status.sw)
2559                         continue;
2560
2561                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2562                         amdgpu_ucode_free_bo(adev);
2563                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2564                         amdgpu_device_wb_fini(adev);
2565                         amdgpu_device_vram_scratch_fini(adev);
2566                         amdgpu_ib_pool_fini(adev);
2567                 }
2568
2569                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2570                 /* XXX handle errors */
2571                 if (r) {
2572                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2573                                   adev->ip_blocks[i].version->funcs->name, r);
2574                 }
2575                 adev->ip_blocks[i].status.sw = false;
2576                 adev->ip_blocks[i].status.valid = false;
2577         }
2578
2579         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2580                 if (!adev->ip_blocks[i].status.late_initialized)
2581                         continue;
2582                 if (adev->ip_blocks[i].version->funcs->late_fini)
2583                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2584                 adev->ip_blocks[i].status.late_initialized = false;
2585         }
2586
2587         amdgpu_ras_fini(adev);
2588
2589         if (amdgpu_sriov_vf(adev))
2590                 if (amdgpu_virt_release_full_gpu(adev, false))
2591                         DRM_ERROR("failed to release exclusive mode on fini\n");
2592
2593         return 0;
2594 }
2595
2596 /**
2597  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2598  *
2599  * @work: work_struct.
2600  */
2601 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2602 {
2603         struct amdgpu_device *adev =
2604                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2605         int r;
2606
2607         r = amdgpu_ib_ring_tests(adev);
2608         if (r)
2609                 DRM_ERROR("ib ring test failed (%d).\n", r);
2610 }
2611
2612 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2613 {
2614         struct amdgpu_device *adev =
2615                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2616
2617         mutex_lock(&adev->gfx.gfx_off_mutex);
2618         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2619                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2620                         adev->gfx.gfx_off_state = true;
2621         }
2622         mutex_unlock(&adev->gfx.gfx_off_mutex);
2623 }
2624
2625 /**
2626  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2627  *
2628  * @adev: amdgpu_device pointer
2629  *
2630  * Main suspend function for hardware IPs.  The list of all the hardware
2631  * IPs that make up the asic is walked, clockgating is disabled and the
2632  * suspend callbacks are run.  suspend puts the hardware and software state
2633  * in each IP into a state suitable for suspend.
2634  * Returns 0 on success, negative error code on failure.
2635  */
2636 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2637 {
2638         int i, r;
2639
2640         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2641         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2642
2643         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2644                 if (!adev->ip_blocks[i].status.valid)
2645                         continue;
2646
2647                 /* displays are handled separately */
2648                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2649                         continue;
2650
2651                 /* XXX handle errors */
2652                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2653                 /* XXX handle errors */
2654                 if (r) {
2655                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2656                                   adev->ip_blocks[i].version->funcs->name, r);
2657                         return r;
2658                 }
2659
2660                 adev->ip_blocks[i].status.hw = false;
2661         }
2662
2663         return 0;
2664 }
2665
2666 /**
2667  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2668  *
2669  * @adev: amdgpu_device pointer
2670  *
2671  * Main suspend function for hardware IPs.  The list of all the hardware
2672  * IPs that make up the asic is walked, clockgating is disabled and the
2673  * suspend callbacks are run.  suspend puts the hardware and software state
2674  * in each IP into a state suitable for suspend.
2675  * Returns 0 on success, negative error code on failure.
2676  */
2677 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2678 {
2679         int i, r;
2680
2681         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2682                 if (!adev->ip_blocks[i].status.valid)
2683                         continue;
2684                 /* displays are handled in phase1 */
2685                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2686                         continue;
2687                 /* PSP lost connection when err_event_athub occurs */
2688                 if (amdgpu_ras_intr_triggered() &&
2689                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2690                         adev->ip_blocks[i].status.hw = false;
2691                         continue;
2692                 }
2693                 /* XXX handle errors */
2694                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2695                 /* XXX handle errors */
2696                 if (r) {
2697                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2698                                   adev->ip_blocks[i].version->funcs->name, r);
2699                 }
2700                 adev->ip_blocks[i].status.hw = false;
2701                 /* handle putting the SMC in the appropriate state */
2702                 if(!amdgpu_sriov_vf(adev)){
2703                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2704                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2705                                 if (r) {
2706                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2707                                                         adev->mp1_state, r);
2708                                         return r;
2709                                 }
2710                         }
2711                 }
2712                 adev->ip_blocks[i].status.hw = false;
2713         }
2714
2715         return 0;
2716 }
2717
2718 /**
2719  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2720  *
2721  * @adev: amdgpu_device pointer
2722  *
2723  * Main suspend function for hardware IPs.  The list of all the hardware
2724  * IPs that make up the asic is walked, clockgating is disabled and the
2725  * suspend callbacks are run.  suspend puts the hardware and software state
2726  * in each IP into a state suitable for suspend.
2727  * Returns 0 on success, negative error code on failure.
2728  */
2729 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2730 {
2731         int r;
2732
2733         if (amdgpu_sriov_vf(adev))
2734                 amdgpu_virt_request_full_gpu(adev, false);
2735
2736         r = amdgpu_device_ip_suspend_phase1(adev);
2737         if (r)
2738                 return r;
2739         r = amdgpu_device_ip_suspend_phase2(adev);
2740
2741         if (amdgpu_sriov_vf(adev))
2742                 amdgpu_virt_release_full_gpu(adev, false);
2743
2744         return r;
2745 }
2746
2747 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2748 {
2749         int i, r;
2750
2751         static enum amd_ip_block_type ip_order[] = {
2752                 AMD_IP_BLOCK_TYPE_GMC,
2753                 AMD_IP_BLOCK_TYPE_COMMON,
2754                 AMD_IP_BLOCK_TYPE_PSP,
2755                 AMD_IP_BLOCK_TYPE_IH,
2756         };
2757
2758         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2759                 int j;
2760                 struct amdgpu_ip_block *block;
2761
2762                 block = &adev->ip_blocks[i];
2763                 block->status.hw = false;
2764
2765                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2766
2767                         if (block->version->type != ip_order[j] ||
2768                                 !block->status.valid)
2769                                 continue;
2770
2771                         r = block->version->funcs->hw_init(adev);
2772                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2773                         if (r)
2774                                 return r;
2775                         block->status.hw = true;
2776                 }
2777         }
2778
2779         return 0;
2780 }
2781
2782 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2783 {
2784         int i, r;
2785
2786         static enum amd_ip_block_type ip_order[] = {
2787                 AMD_IP_BLOCK_TYPE_SMC,
2788                 AMD_IP_BLOCK_TYPE_DCE,
2789                 AMD_IP_BLOCK_TYPE_GFX,
2790                 AMD_IP_BLOCK_TYPE_SDMA,
2791                 AMD_IP_BLOCK_TYPE_UVD,
2792                 AMD_IP_BLOCK_TYPE_VCE,
2793                 AMD_IP_BLOCK_TYPE_VCN
2794         };
2795
2796         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2797                 int j;
2798                 struct amdgpu_ip_block *block;
2799
2800                 for (j = 0; j < adev->num_ip_blocks; j++) {
2801                         block = &adev->ip_blocks[j];
2802
2803                         if (block->version->type != ip_order[i] ||
2804                                 !block->status.valid ||
2805                                 block->status.hw)
2806                                 continue;
2807
2808                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2809                                 r = block->version->funcs->resume(adev);
2810                         else
2811                                 r = block->version->funcs->hw_init(adev);
2812
2813                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2814                         if (r)
2815                                 return r;
2816                         block->status.hw = true;
2817                 }
2818         }
2819
2820         return 0;
2821 }
2822
2823 /**
2824  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2825  *
2826  * @adev: amdgpu_device pointer
2827  *
2828  * First resume function for hardware IPs.  The list of all the hardware
2829  * IPs that make up the asic is walked and the resume callbacks are run for
2830  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2831  * after a suspend and updates the software state as necessary.  This
2832  * function is also used for restoring the GPU after a GPU reset.
2833  * Returns 0 on success, negative error code on failure.
2834  */
2835 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2836 {
2837         int i, r;
2838
2839         for (i = 0; i < adev->num_ip_blocks; i++) {
2840                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2841                         continue;
2842                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2843                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2844                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2845
2846                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2847                         if (r) {
2848                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2849                                           adev->ip_blocks[i].version->funcs->name, r);
2850                                 return r;
2851                         }
2852                         adev->ip_blocks[i].status.hw = true;
2853                 }
2854         }
2855
2856         return 0;
2857 }
2858
2859 /**
2860  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2861  *
2862  * @adev: amdgpu_device pointer
2863  *
2864  * First resume function for hardware IPs.  The list of all the hardware
2865  * IPs that make up the asic is walked and the resume callbacks are run for
2866  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2867  * functional state after a suspend and updates the software state as
2868  * necessary.  This function is also used for restoring the GPU after a GPU
2869  * reset.
2870  * Returns 0 on success, negative error code on failure.
2871  */
2872 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2873 {
2874         int i, r;
2875
2876         for (i = 0; i < adev->num_ip_blocks; i++) {
2877                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2878                         continue;
2879                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2880                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2881                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2882                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2883                         continue;
2884                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2885                 if (r) {
2886                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2887                                   adev->ip_blocks[i].version->funcs->name, r);
2888                         return r;
2889                 }
2890                 adev->ip_blocks[i].status.hw = true;
2891         }
2892
2893         return 0;
2894 }
2895
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The hardware IPs are split into
 * two resume functions because they are also used in recovering from a GPU
 * reset, where some additional steps need to be taken between them.  In
 * this case (S3/S4) they are run sequentially, with firmware loading in
 * between.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r;

	r = amdgpu_device_ip_resume_phase1(adev);
	if (r)
		return r;

	r = amdgpu_device_fw_loading(adev);
	if (r)
		return r;

	return amdgpu_device_ip_resume_phase2(adev);
}
2924
2925 /**
2926  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2927  *
2928  * @adev: amdgpu_device pointer
2929  *
2930  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2931  */
2932 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2933 {
2934         if (amdgpu_sriov_vf(adev)) {
2935                 if (adev->is_atom_fw) {
2936                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2937                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2938                 } else {
2939                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2940                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2941                 }
2942
2943                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2944                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2945         }
2946 }
2947
2948 /**
2949  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2950  *
2951  * @asic_type: AMD asic type
2952  *
2953  * Check if there is DC (new modesetting infrastructre) support for an asic.
2954  * returns true if DC has support, false if not.
2955  */
2956 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2957 {
2958         switch (asic_type) {
2959 #if defined(CONFIG_DRM_AMD_DC)
2960 #if defined(CONFIG_DRM_AMD_DC_SI)
2961         case CHIP_TAHITI:
2962         case CHIP_PITCAIRN:
2963         case CHIP_VERDE:
2964         case CHIP_OLAND:
2965 #endif
2966         case CHIP_BONAIRE:
2967         case CHIP_KAVERI:
2968         case CHIP_KABINI:
2969         case CHIP_MULLINS:
2970                 /*
2971                  * We have systems in the wild with these ASICs that require
2972                  * LVDS and VGA support which is not supported with DC.
2973                  *
2974                  * Fallback to the non-DC driver here by default so as not to
2975                  * cause regressions.
2976                  */
2977                 return amdgpu_dc > 0;
2978         case CHIP_HAWAII:
2979         case CHIP_CARRIZO:
2980         case CHIP_STONEY:
2981         case CHIP_POLARIS10:
2982         case CHIP_POLARIS11:
2983         case CHIP_POLARIS12:
2984         case CHIP_VEGAM:
2985         case CHIP_TONGA:
2986         case CHIP_FIJI:
2987         case CHIP_VEGA10:
2988         case CHIP_VEGA12:
2989         case CHIP_VEGA20:
2990 #if defined(CONFIG_DRM_AMD_DC_DCN)
2991         case CHIP_RAVEN:
2992         case CHIP_NAVI10:
2993         case CHIP_NAVI14:
2994         case CHIP_NAVI12:
2995         case CHIP_RENOIR:
2996 #endif
2997 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2998         case CHIP_SIENNA_CICHLID:
2999         case CHIP_NAVY_FLOUNDER:
3000 #endif
3001                 return amdgpu_dc != 0;
3002 #endif
3003         default:
3004                 if (amdgpu_dc > 0)
3005                         DRM_INFO("Display Core has been requested via kernel parameter "
3006                                          "but isn't supported by ASIC, ignoring\n");
3007                 return false;
3008         }
3009 }
3010
3011 /**
3012  * amdgpu_device_has_dc_support - check if dc is supported
3013  *
3014  * @adev: amdgpu_device_pointer
3015  *
3016  * Returns true for supported, false for not supported
3017  */
3018 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3019 {
3020         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3021                 return false;
3022
3023         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3024 }
3025
3026
3027 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3028 {
3029         struct amdgpu_device *adev =
3030                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3031         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3032
3033         /* It's a bug to not have a hive within this function */
3034         if (WARN_ON(!hive))
3035                 return;
3036
3037         /*
3038          * Use task barrier to synchronize all xgmi reset works across the
3039          * hive. task_barrier_enter and task_barrier_exit will block
3040          * until all the threads running the xgmi reset works reach
3041          * those points. task_barrier_full will do both blocks.
3042          */
3043         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3044
3045                 task_barrier_enter(&hive->tb);
3046                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3047
3048                 if (adev->asic_reset_res)
3049                         goto fail;
3050
3051                 task_barrier_exit(&hive->tb);
3052                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3053
3054                 if (adev->asic_reset_res)
3055                         goto fail;
3056
3057                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3058                         adev->mmhub.funcs->reset_ras_error_count(adev);
3059         } else {
3060
3061                 task_barrier_full(&hive->tb);
3062                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3063         }
3064
3065 fail:
3066         if (adev->asic_reset_res)
3067                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3068                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3069         amdgpu_put_xgmi_hive(hive);
3070 }
3071
3072 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3073 {
3074         char *input = amdgpu_lockup_timeout;
3075         char *timeout_setting = NULL;
3076         int index = 0;
3077         long timeout;
3078         int ret = 0;
3079
3080         /*
3081          * By default timeout for non compute jobs is 10000.
3082          * And there is no timeout enforced on compute jobs.
3083          * In SR-IOV or passthrough mode, timeout for compute
3084          * jobs are 60000 by default.
3085          */
3086         adev->gfx_timeout = msecs_to_jiffies(10000);
3087         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3088         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3089                 adev->compute_timeout =  msecs_to_jiffies(60000);
3090         else
3091                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3092
3093         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3094                 while ((timeout_setting = strsep(&input, ",")) &&
3095                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3096                         ret = kstrtol(timeout_setting, 0, &timeout);
3097                         if (ret)
3098                                 return ret;
3099
3100                         if (timeout == 0) {
3101                                 index++;
3102                                 continue;
3103                         } else if (timeout < 0) {
3104                                 timeout = MAX_SCHEDULE_TIMEOUT;
3105                         } else {
3106                                 timeout = msecs_to_jiffies(timeout);
3107                         }
3108
3109                         switch (index++) {
3110                         case 0:
3111                                 adev->gfx_timeout = timeout;
3112                                 break;
3113                         case 1:
3114                                 adev->compute_timeout = timeout;
3115                                 break;
3116                         case 2:
3117                                 adev->sdma_timeout = timeout;
3118                                 break;
3119                         case 3:
3120                                 adev->video_timeout = timeout;
3121                                 break;
3122                         default:
3123                                 break;
3124                         }
3125                 }
3126                 /*
3127                  * There is only one value specified and
3128                  * it should apply to all non-compute jobs.
3129                  */
3130                 if (index == 1) {
3131                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3132                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3133                                 adev->compute_timeout = adev->gfx_timeout;
3134                 }
3135         }
3136
3137         return ret;
3138 }
3139
3140 static const struct attribute *amdgpu_dev_attributes[] = {
3141         &dev_attr_product_name.attr,
3142         &dev_attr_product_number.attr,
3143         &dev_attr_serial_number.attr,
3144         &dev_attr_pcie_replay_count.attr,
3145         NULL
3146 };
3147
3148
3149 /**
3150  * amdgpu_device_init - initialize the driver
3151  *
3152  * @adev: amdgpu_device pointer
3153  * @flags: driver flags
3154  *
3155  * Initializes the driver info and hw (all asics).
3156  * Returns 0 for success or an error on failure.
3157  * Called at driver startup.
3158  */
3159 int amdgpu_device_init(struct amdgpu_device *adev,
3160                        uint32_t flags)
3161 {
3162         struct drm_device *ddev = adev_to_drm(adev);
3163         struct pci_dev *pdev = adev->pdev;
3164         int r, i;
3165         bool boco = false;
3166         u32 max_MBps;
3167
3168         adev->shutdown = false;
3169         adev->flags = flags;
3170
3171         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3172                 adev->asic_type = amdgpu_force_asic_type;
3173         else
3174                 adev->asic_type = flags & AMD_ASIC_MASK;
3175
3176         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3177         if (amdgpu_emu_mode == 1)
3178                 adev->usec_timeout *= 10;
3179         adev->gmc.gart_size = 512 * 1024 * 1024;
3180         adev->accel_working = false;
3181         adev->num_rings = 0;
3182         adev->mman.buffer_funcs = NULL;
3183         adev->mman.buffer_funcs_ring = NULL;
3184         adev->vm_manager.vm_pte_funcs = NULL;
3185         adev->vm_manager.vm_pte_num_scheds = 0;
3186         adev->gmc.gmc_funcs = NULL;
3187         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3188         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3189
3190         adev->smc_rreg = &amdgpu_invalid_rreg;
3191         adev->smc_wreg = &amdgpu_invalid_wreg;
3192         adev->pcie_rreg = &amdgpu_invalid_rreg;
3193         adev->pcie_wreg = &amdgpu_invalid_wreg;
3194         adev->pciep_rreg = &amdgpu_invalid_rreg;
3195         adev->pciep_wreg = &amdgpu_invalid_wreg;
3196         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3197         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3198         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3199         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3200         adev->didt_rreg = &amdgpu_invalid_rreg;
3201         adev->didt_wreg = &amdgpu_invalid_wreg;
3202         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3203         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3204         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3205         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3206
3207         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3208                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3209                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3210
3211         /* mutex initialization are all done here so we
3212          * can recall function without having locking issues */
3213         atomic_set(&adev->irq.ih.lock, 0);
3214         mutex_init(&adev->firmware.mutex);
3215         mutex_init(&adev->pm.mutex);
3216         mutex_init(&adev->gfx.gpu_clock_mutex);
3217         mutex_init(&adev->srbm_mutex);
3218         mutex_init(&adev->gfx.pipe_reserve_mutex);
3219         mutex_init(&adev->gfx.gfx_off_mutex);
3220         mutex_init(&adev->grbm_idx_mutex);
3221         mutex_init(&adev->mn_lock);
3222         mutex_init(&adev->virt.vf_errors.lock);
3223         hash_init(adev->mn_hash);
3224         atomic_set(&adev->in_gpu_reset, 0);
3225         init_rwsem(&adev->reset_sem);
3226         mutex_init(&adev->psp.mutex);
3227         mutex_init(&adev->notifier_lock);
3228
3229         r = amdgpu_device_check_arguments(adev);
3230         if (r)
3231                 return r;
3232
3233         spin_lock_init(&adev->mmio_idx_lock);
3234         spin_lock_init(&adev->smc_idx_lock);
3235         spin_lock_init(&adev->pcie_idx_lock);
3236         spin_lock_init(&adev->uvd_ctx_idx_lock);
3237         spin_lock_init(&adev->didt_idx_lock);
3238         spin_lock_init(&adev->gc_cac_idx_lock);
3239         spin_lock_init(&adev->se_cac_idx_lock);
3240         spin_lock_init(&adev->audio_endpt_idx_lock);
3241         spin_lock_init(&adev->mm_stats.lock);
3242
3243         INIT_LIST_HEAD(&adev->shadow_list);
3244         mutex_init(&adev->shadow_list_lock);
3245
3246         INIT_DELAYED_WORK(&adev->delayed_init_work,
3247                           amdgpu_device_delayed_init_work_handler);
3248         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3249                           amdgpu_device_delay_enable_gfx_off);
3250
3251         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3252
3253         adev->gfx.gfx_off_req_count = 1;
3254         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3255
3256         atomic_set(&adev->throttling_logging_enabled, 1);
3257         /*
3258          * If throttling continues, logging will be performed every minute
3259          * to avoid log flooding. "-1" is subtracted since the thermal
3260          * throttling interrupt comes every second. Thus, the total logging
3261          * interval is 59 seconds(retelimited printk interval) + 1(waiting
3262          * for throttling interrupt) = 60 seconds.
3263          */
3264         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3265         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3266
3267         /* Registers mapping */
3268         /* TODO: block userspace mapping of io register */
3269         if (adev->asic_type >= CHIP_BONAIRE) {
3270                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3271                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3272         } else {
3273                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3274                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3275         }
3276
3277         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3278         if (adev->rmmio == NULL) {
3279                 return -ENOMEM;
3280         }
3281         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3282         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3283
3284         /* io port mapping */
3285         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3286                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3287                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3288                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3289                         break;
3290                 }
3291         }
3292         if (adev->rio_mem == NULL)
3293                 DRM_INFO("PCI I/O BAR is not found.\n");
3294
3295         /* enable PCIE atomic ops */
3296         r = pci_enable_atomic_ops_to_root(adev->pdev,
3297                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3298                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3299         if (r) {
3300                 adev->have_atomics_support = false;
3301                 DRM_INFO("PCIE atomic ops is not supported\n");
3302         } else {
3303                 adev->have_atomics_support = true;
3304         }
3305
3306         amdgpu_device_get_pcie_info(adev);
3307
3308         if (amdgpu_mcbp)
3309                 DRM_INFO("MCBP is enabled\n");
3310
3311         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3312                 adev->enable_mes = true;
3313
3314         /* detect hw virtualization here */
3315         amdgpu_detect_virtualization(adev);
3316
3317         r = amdgpu_device_get_job_timeout_settings(adev);
3318         if (r) {
3319                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3320                 goto failed_unmap;
3321         }
3322
3323         /* early init functions */
3324         r = amdgpu_device_ip_early_init(adev);
3325         if (r)
3326                 goto failed_unmap;
3327
3328         /* doorbell bar mapping and doorbell index init*/
3329         amdgpu_device_doorbell_init(adev);
3330
3331         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3332         /* this will fail for cards that aren't VGA class devices, just
3333          * ignore it */
3334         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3335
3336         if (amdgpu_device_supports_boco(ddev))
3337                 boco = true;
3338         if (amdgpu_has_atpx() &&
3339             (amdgpu_is_atpx_hybrid() ||
3340              amdgpu_has_atpx_dgpu_power_cntl()) &&
3341             !pci_is_thunderbolt_attached(adev->pdev))
3342                 vga_switcheroo_register_client(adev->pdev,
3343                                                &amdgpu_switcheroo_ops, boco);
3344         if (boco)
3345                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3346
3347         if (amdgpu_emu_mode == 1) {
3348                 /* post the asic on emulation mode */
3349                 emu_soc_asic_init(adev);
3350                 goto fence_driver_init;
3351         }
3352
3353         /* detect if we are with an SRIOV vbios */
3354         amdgpu_device_detect_sriov_bios(adev);
3355
3356         /* check if we need to reset the asic
3357          *  E.g., driver was not cleanly unloaded previously, etc.
3358          */
3359         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3360                 r = amdgpu_asic_reset(adev);
3361                 if (r) {
3362                         dev_err(adev->dev, "asic reset on init failed\n");
3363                         goto failed;
3364                 }
3365         }
3366
3367         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3368
3369         /* Post card if necessary */
3370         if (amdgpu_device_need_post(adev)) {
3371                 if (!adev->bios) {
3372                         dev_err(adev->dev, "no vBIOS found\n");
3373                         r = -EINVAL;
3374                         goto failed;
3375                 }
3376                 DRM_INFO("GPU posting now...\n");
3377                 r = amdgpu_device_asic_init(adev);
3378                 if (r) {
3379                         dev_err(adev->dev, "gpu post error!\n");
3380                         goto failed;
3381                 }
3382         }
3383
3384         if (adev->is_atom_fw) {
3385                 /* Initialize clocks */
3386                 r = amdgpu_atomfirmware_get_clock_info(adev);
3387                 if (r) {
3388                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3389                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3390                         goto failed;
3391                 }
3392         } else {
3393                 /* Initialize clocks */
3394                 r = amdgpu_atombios_get_clock_info(adev);
3395                 if (r) {
3396                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3397                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3398                         goto failed;
3399                 }
3400                 /* init i2c buses */
3401                 if (!amdgpu_device_has_dc_support(adev))
3402                         amdgpu_atombios_i2c_init(adev);
3403         }
3404
3405 fence_driver_init:
3406         /* Fence driver */
3407         r = amdgpu_fence_driver_init(adev);
3408         if (r) {
3409                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3410                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3411                 goto failed;
3412         }
3413
3414         /* init the mode config */
3415         drm_mode_config_init(adev_to_drm(adev));
3416
3417         r = amdgpu_device_ip_init(adev);
3418         if (r) {
3419                 /* failed in exclusive mode due to timeout */
3420                 if (amdgpu_sriov_vf(adev) &&
3421                     !amdgpu_sriov_runtime(adev) &&
3422                     amdgpu_virt_mmio_blocked(adev) &&
3423                     !amdgpu_virt_wait_reset(adev)) {
3424                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3425                         /* Don't send request since VF is inactive. */
3426                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3427                         adev->virt.ops = NULL;
3428                         r = -EAGAIN;
3429                         goto failed;
3430                 }
3431                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3432                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3433                 goto failed;
3434         }
3435
3436         dev_info(adev->dev,
3437                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3438                         adev->gfx.config.max_shader_engines,
3439                         adev->gfx.config.max_sh_per_se,
3440                         adev->gfx.config.max_cu_per_sh,
3441                         adev->gfx.cu_info.number);
3442
3443         adev->accel_working = true;
3444
3445         amdgpu_vm_check_compute_bug(adev);
3446
3447         /* Initialize the buffer migration limit. */
3448         if (amdgpu_moverate >= 0)
3449                 max_MBps = amdgpu_moverate;
3450         else
3451                 max_MBps = 8; /* Allow 8 MB/s. */
3452         /* Get a log2 for easy divisions. */
3453         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3454
3455         amdgpu_fbdev_init(adev);
3456
3457         r = amdgpu_pm_sysfs_init(adev);
3458         if (r) {
3459                 adev->pm_sysfs_en = false;
3460                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3461         } else
3462                 adev->pm_sysfs_en = true;
3463
3464         r = amdgpu_ucode_sysfs_init(adev);
3465         if (r) {
3466                 adev->ucode_sysfs_en = false;
3467                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3468         } else
3469                 adev->ucode_sysfs_en = true;
3470
3471         if ((amdgpu_testing & 1)) {
3472                 if (adev->accel_working)
3473                         amdgpu_test_moves(adev);
3474                 else
3475                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3476         }
3477         if (amdgpu_benchmarking) {
3478                 if (adev->accel_working)
3479                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3480                 else
3481                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3482         }
3483
3484         /*
3485          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3486          * Otherwise the mgpu fan boost feature will be skipped due to the
3487          * gpu instance is counted less.
3488          */
3489         amdgpu_register_gpu_instance(adev);
3490
3491         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3492          * explicit gating rather than handling it automatically.
3493          */
3494         r = amdgpu_device_ip_late_init(adev);
3495         if (r) {
3496                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3497                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3498                 goto failed;
3499         }
3500
3501         /* must succeed. */
3502         amdgpu_ras_resume(adev);
3503
3504         queue_delayed_work(system_wq, &adev->delayed_init_work,
3505                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3506
3507         if (amdgpu_sriov_vf(adev))
3508                 flush_delayed_work(&adev->delayed_init_work);
3509
3510         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3511         if (r)
3512                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3513
3514         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3515                 r = amdgpu_pmu_init(adev);
3516         if (r)
3517                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3518
3519         /* Have stored pci confspace at hand for restore in sudden PCI error */
3520         if (amdgpu_device_cache_pci_state(adev->pdev))
3521                 pci_restore_state(pdev);
3522
3523         return 0;
3524
3525 failed:
3526         amdgpu_vf_error_trans_all(adev);
3527         if (boco)
3528                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3529
3530 failed_unmap:
3531         iounmap(adev->rmmio);
3532         adev->rmmio = NULL;
3533
3534         return r;
3535 }
3536
3537 /**
3538  * amdgpu_device_fini - tear down the driver
3539  *
3540  * @adev: amdgpu_device pointer
3541  *
3542  * Tear down the driver info (all asics).
3543  * Called at driver shutdown.
3544  */
3545 void amdgpu_device_fini(struct amdgpu_device *adev)
3546 {
3547         dev_info(adev->dev, "amdgpu: finishing device.\n");
3548         flush_delayed_work(&adev->delayed_init_work);
3549         adev->shutdown = true;
3550
3551         kfree(adev->pci_state);
3552
3553         /* make sure IB test finished before entering exclusive mode
3554          * to avoid preemption on IB test
3555          * */
3556         if (amdgpu_sriov_vf(adev)) {
3557                 amdgpu_virt_request_full_gpu(adev, false);
3558                 amdgpu_virt_fini_data_exchange(adev);
3559         }
3560
3561         /* disable all interrupts */
3562         amdgpu_irq_disable_all(adev);
3563         if (adev->mode_info.mode_config_initialized){
3564                 if (!amdgpu_device_has_dc_support(adev))
3565                         drm_helper_force_disable_all(adev_to_drm(adev));
3566                 else
3567                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3568         }
3569         amdgpu_fence_driver_fini(adev);
3570         if (adev->pm_sysfs_en)
3571                 amdgpu_pm_sysfs_fini(adev);
3572         amdgpu_fbdev_fini(adev);
3573         amdgpu_device_ip_fini(adev);
3574         release_firmware(adev->firmware.gpu_info_fw);
3575         adev->firmware.gpu_info_fw = NULL;
3576         adev->accel_working = false;
3577         /* free i2c buses */
3578         if (!amdgpu_device_has_dc_support(adev))
3579                 amdgpu_i2c_fini(adev);
3580
3581         if (amdgpu_emu_mode != 1)
3582                 amdgpu_atombios_fini(adev);
3583
3584         kfree(adev->bios);
3585         adev->bios = NULL;
3586         if (amdgpu_has_atpx() &&
3587             (amdgpu_is_atpx_hybrid() ||
3588              amdgpu_has_atpx_dgpu_power_cntl()) &&
3589             !pci_is_thunderbolt_attached(adev->pdev))
3590                 vga_switcheroo_unregister_client(adev->pdev);
3591         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3592                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3593         vga_client_register(adev->pdev, NULL, NULL, NULL);
3594         if (adev->rio_mem)
3595                 pci_iounmap(adev->pdev, adev->rio_mem);
3596         adev->rio_mem = NULL;
3597         iounmap(adev->rmmio);
3598         adev->rmmio = NULL;
3599         amdgpu_device_doorbell_fini(adev);
3600
3601         if (adev->ucode_sysfs_en)
3602                 amdgpu_ucode_sysfs_fini(adev);
3603
3604         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3605         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3606                 amdgpu_pmu_fini(adev);
3607         if (adev->mman.discovery_bin)
3608                 amdgpu_discovery_fini(adev);
3609 }
3610
3611
3612 /*
3613  * Suspend & resume.
3614  */
3615 /**
3616  * amdgpu_device_suspend - initiate device suspend
3617  *
3618  * @dev: drm dev pointer
3619  * @fbcon : notify the fbdev of suspend
3620  *
3621  * Puts the hw in the suspend state (all asics).
3622  * Returns 0 for success or an error on failure.
3623  * Called at driver suspend.
3624  */
3625 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3626 {
3627         struct amdgpu_device *adev;
3628         struct drm_crtc *crtc;
3629         struct drm_connector *connector;
3630         struct drm_connector_list_iter iter;
3631         int r;
3632
3633         adev = drm_to_adev(dev);
3634
3635         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3636                 return 0;
3637
3638         adev->in_suspend = true;
3639         drm_kms_helper_poll_disable(dev);
3640
3641         if (fbcon)
3642                 amdgpu_fbdev_set_suspend(adev, 1);
3643
3644         cancel_delayed_work_sync(&adev->delayed_init_work);
3645
3646         if (!amdgpu_device_has_dc_support(adev)) {
3647                 /* turn off display hw */
3648                 drm_modeset_lock_all(dev);
3649                 drm_connector_list_iter_begin(dev, &iter);
3650                 drm_for_each_connector_iter(connector, &iter)
3651                         drm_helper_connector_dpms(connector,
3652                                                   DRM_MODE_DPMS_OFF);
3653                 drm_connector_list_iter_end(&iter);
3654                 drm_modeset_unlock_all(dev);
3655                         /* unpin the front buffers and cursors */
3656                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3657                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3658                         struct drm_framebuffer *fb = crtc->primary->fb;
3659                         struct amdgpu_bo *robj;
3660
3661                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3662                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3663                                 r = amdgpu_bo_reserve(aobj, true);
3664                                 if (r == 0) {
3665                                         amdgpu_bo_unpin(aobj);
3666                                         amdgpu_bo_unreserve(aobj);
3667                                 }
3668                         }
3669
3670                         if (fb == NULL || fb->obj[0] == NULL) {
3671                                 continue;
3672                         }
3673                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3674                         /* don't unpin kernel fb objects */
3675                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3676                                 r = amdgpu_bo_reserve(robj, true);
3677                                 if (r == 0) {
3678                                         amdgpu_bo_unpin(robj);
3679                                         amdgpu_bo_unreserve(robj);
3680                                 }
3681                         }
3682                 }
3683         }
3684
3685         amdgpu_ras_suspend(adev);
3686
3687         r = amdgpu_device_ip_suspend_phase1(adev);
3688
3689         amdgpu_amdkfd_suspend(adev, !fbcon);
3690
3691         /* evict vram memory */
3692         amdgpu_bo_evict_vram(adev);
3693
3694         amdgpu_fence_driver_suspend(adev);
3695
3696         r = amdgpu_device_ip_suspend_phase2(adev);
3697
3698         /* evict remaining vram memory
3699          * This second call to evict vram is to evict the gart page table
3700          * using the CPU.
3701          */
3702         amdgpu_bo_evict_vram(adev);
3703
3704         return 0;
3705 }
3706
3707 /**
3708  * amdgpu_device_resume - initiate device resume
3709  *
3710  * @dev: drm dev pointer
3711  * @fbcon : notify the fbdev of resume
3712  *
3713  * Bring the hw back to operating state (all asics).
3714  * Returns 0 for success or an error on failure.
3715  * Called at driver resume.
3716  */
3717 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3718 {
3719         struct drm_connector *connector;
3720         struct drm_connector_list_iter iter;
3721         struct amdgpu_device *adev = drm_to_adev(dev);
3722         struct drm_crtc *crtc;
3723         int r = 0;
3724
3725         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3726                 return 0;
3727
3728         /* post card */
3729         if (amdgpu_device_need_post(adev)) {
3730                 r = amdgpu_device_asic_init(adev);
3731                 if (r)
3732                         dev_err(adev->dev, "amdgpu asic init failed\n");
3733         }
3734
3735         r = amdgpu_device_ip_resume(adev);
3736         if (r) {
3737                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3738                 return r;
3739         }
3740         amdgpu_fence_driver_resume(adev);
3741
3742
3743         r = amdgpu_device_ip_late_init(adev);
3744         if (r)
3745                 return r;
3746
3747         queue_delayed_work(system_wq, &adev->delayed_init_work,
3748                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3749
3750         if (!amdgpu_device_has_dc_support(adev)) {
3751                 /* pin cursors */
3752                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3753                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3754
3755                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3756                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3757                                 r = amdgpu_bo_reserve(aobj, true);
3758                                 if (r == 0) {
3759                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3760                                         if (r != 0)
3761                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3762                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3763                                         amdgpu_bo_unreserve(aobj);
3764                                 }
3765                         }
3766                 }
3767         }
3768         r = amdgpu_amdkfd_resume(adev, !fbcon);
3769         if (r)
3770                 return r;
3771
3772         /* Make sure IB tests flushed */
3773         flush_delayed_work(&adev->delayed_init_work);
3774
3775         /* blat the mode back in */
3776         if (fbcon) {
3777                 if (!amdgpu_device_has_dc_support(adev)) {
3778                         /* pre DCE11 */
3779                         drm_helper_resume_force_mode(dev);
3780
3781                         /* turn on display hw */
3782                         drm_modeset_lock_all(dev);
3783
3784                         drm_connector_list_iter_begin(dev, &iter);
3785                         drm_for_each_connector_iter(connector, &iter)
3786                                 drm_helper_connector_dpms(connector,
3787                                                           DRM_MODE_DPMS_ON);
3788                         drm_connector_list_iter_end(&iter);
3789
3790                         drm_modeset_unlock_all(dev);
3791                 }
3792                 amdgpu_fbdev_set_suspend(adev, 0);
3793         }
3794
3795         drm_kms_helper_poll_enable(dev);
3796
3797         amdgpu_ras_resume(adev);
3798
3799         /*
3800          * Most of the connector probing functions try to acquire runtime pm
3801          * refs to ensure that the GPU is powered on when connector polling is
3802          * performed. Since we're calling this from a runtime PM callback,
3803          * trying to acquire rpm refs will cause us to deadlock.
3804          *
3805          * Since we're guaranteed to be holding the rpm lock, it's safe to
3806          * temporarily disable the rpm helpers so this doesn't deadlock us.
3807          */
3808 #ifdef CONFIG_PM
3809         dev->dev->power.disable_depth++;
3810 #endif
3811         if (!amdgpu_device_has_dc_support(adev))
3812                 drm_helper_hpd_irq_event(dev);
3813         else
3814                 drm_kms_helper_hotplug_event(dev);
3815 #ifdef CONFIG_PM
3816         dev->dev->power.disable_depth--;
3817 #endif
3818         adev->in_suspend = false;
3819
3820         return 0;
3821 }
3822
3823 /**
3824  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3825  *
3826  * @adev: amdgpu_device pointer
3827  *
3828  * The list of all the hardware IPs that make up the asic is walked and
3829  * the check_soft_reset callbacks are run.  check_soft_reset determines
3830  * if the asic is still hung or not.
3831  * Returns true if any of the IPs are still in a hung state, false if not.
3832  */
3833 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3834 {
3835         int i;
3836         bool asic_hang = false;
3837
3838         if (amdgpu_sriov_vf(adev))
3839                 return true;
3840
3841         if (amdgpu_asic_need_full_reset(adev))
3842                 return true;
3843
3844         for (i = 0; i < adev->num_ip_blocks; i++) {
3845                 if (!adev->ip_blocks[i].status.valid)
3846                         continue;
3847                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3848                         adev->ip_blocks[i].status.hang =
3849                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3850                 if (adev->ip_blocks[i].status.hang) {
3851                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3852                         asic_hang = true;
3853                 }
3854         }
3855         return asic_hang;
3856 }
3857
3858 /**
3859  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3860  *
3861  * @adev: amdgpu_device pointer
3862  *
3863  * The list of all the hardware IPs that make up the asic is walked and the
3864  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3865  * handles any IP specific hardware or software state changes that are
3866  * necessary for a soft reset to succeed.
3867  * Returns 0 on success, negative error code on failure.
3868  */
3869 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3870 {
3871         int i, r = 0;
3872
3873         for (i = 0; i < adev->num_ip_blocks; i++) {
3874                 if (!adev->ip_blocks[i].status.valid)
3875                         continue;
3876                 if (adev->ip_blocks[i].status.hang &&
3877                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3878                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3879                         if (r)
3880                                 return r;
3881                 }
3882         }
3883
3884         return 0;
3885 }
3886
3887 /**
3888  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3889  *
3890  * @adev: amdgpu_device pointer
3891  *
3892  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3893  * reset is necessary to recover.
3894  * Returns true if a full asic reset is required, false if not.
3895  */
3896 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3897 {
3898         int i;
3899
3900         if (amdgpu_asic_need_full_reset(adev))
3901                 return true;
3902
3903         for (i = 0; i < adev->num_ip_blocks; i++) {
3904                 if (!adev->ip_blocks[i].status.valid)
3905                         continue;
3906                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3907                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3908                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3909                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3910                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3911                         if (adev->ip_blocks[i].status.hang) {
3912                                 dev_info(adev->dev, "Some block need full reset!\n");
3913                                 return true;
3914                         }
3915                 }
3916         }
3917         return false;
3918 }
3919
3920 /**
3921  * amdgpu_device_ip_soft_reset - do a soft reset
3922  *
3923  * @adev: amdgpu_device pointer
3924  *
3925  * The list of all the hardware IPs that make up the asic is walked and the
3926  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3927  * IP specific hardware or software state changes that are necessary to soft
3928  * reset the IP.
3929  * Returns 0 on success, negative error code on failure.
3930  */
3931 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3932 {
3933         int i, r = 0;
3934
3935         for (i = 0; i < adev->num_ip_blocks; i++) {
3936                 if (!adev->ip_blocks[i].status.valid)
3937                         continue;
3938                 if (adev->ip_blocks[i].status.hang &&
3939                     adev->ip_blocks[i].version->funcs->soft_reset) {
3940                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3941                         if (r)
3942                                 return r;
3943                 }
3944         }
3945
3946         return 0;
3947 }
3948
3949 /**
3950  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3951  *
3952  * @adev: amdgpu_device pointer
3953  *
3954  * The list of all the hardware IPs that make up the asic is walked and the
3955  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3956  * handles any IP specific hardware or software state changes that are
3957  * necessary after the IP has been soft reset.
3958  * Returns 0 on success, negative error code on failure.
3959  */
3960 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3961 {
3962         int i, r = 0;
3963
3964         for (i = 0; i < adev->num_ip_blocks; i++) {
3965                 if (!adev->ip_blocks[i].status.valid)
3966                         continue;
3967                 if (adev->ip_blocks[i].status.hang &&
3968                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3969                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3970                 if (r)
3971                         return r;
3972         }
3973
3974         return 0;
3975 }
3976
3977 /**
3978  * amdgpu_device_recover_vram - Recover some VRAM contents
3979  *
3980  * @adev: amdgpu_device pointer
3981  *
3982  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3983  * restore things like GPUVM page tables after a GPU reset where
3984  * the contents of VRAM might be lost.
3985  *
3986  * Returns:
3987  * 0 on success, negative error code on failure.
3988  */
3989 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3990 {
3991         struct dma_fence *fence = NULL, *next = NULL;
3992         struct amdgpu_bo *shadow;
3993         long r = 1, tmo;
3994
3995         if (amdgpu_sriov_runtime(adev))
3996                 tmo = msecs_to_jiffies(8000);
3997         else
3998                 tmo = msecs_to_jiffies(100);
3999
4000         dev_info(adev->dev, "recover vram bo from shadow start\n");
4001         mutex_lock(&adev->shadow_list_lock);
4002         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4003
4004                 /* No need to recover an evicted BO */
4005                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4006                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4007                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4008                         continue;
4009
4010                 r = amdgpu_bo_restore_shadow(shadow, &next);
4011                 if (r)
4012                         break;
4013
4014                 if (fence) {
4015                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4016                         dma_fence_put(fence);
4017                         fence = next;
4018                         if (tmo == 0) {
4019                                 r = -ETIMEDOUT;
4020                                 break;
4021                         } else if (tmo < 0) {
4022                                 r = tmo;
4023                                 break;
4024                         }
4025                 } else {
4026                         fence = next;
4027                 }
4028         }
4029         mutex_unlock(&adev->shadow_list_lock);
4030
4031         if (fence)
4032                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4033         dma_fence_put(fence);
4034
4035         if (r < 0 || tmo <= 0) {
4036                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4037                 return -EIO;
4038         }
4039
4040         dev_info(adev->dev, "recover vram bo from shadow done\n");
4041         return 0;
4042 }
4043
4044
4045 /**
4046  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4047  *
4048  * @adev: amdgpu device pointer
4049  * @from_hypervisor: request from hypervisor
4050  *
4051  * do VF FLR and reinitialize Asic
4052  * return 0 means succeeded otherwise failed
4053  */
4054 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4055                                      bool from_hypervisor)
4056 {
4057         int r;
4058
4059         if (from_hypervisor)
4060                 r = amdgpu_virt_request_full_gpu(adev, true);
4061         else
4062                 r = amdgpu_virt_reset_gpu(adev);
4063         if (r)
4064                 return r;
4065
4066         amdgpu_amdkfd_pre_reset(adev);
4067
4068         /* Resume IP prior to SMC */
4069         r = amdgpu_device_ip_reinit_early_sriov(adev);
4070         if (r)
4071                 goto error;
4072
4073         amdgpu_virt_init_data_exchange(adev);
4074         /* we need recover gart prior to run SMC/CP/SDMA resume */
4075         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4076
4077         r = amdgpu_device_fw_loading(adev);
4078         if (r)
4079                 return r;
4080
4081         /* now we are okay to resume SMC/CP/SDMA */
4082         r = amdgpu_device_ip_reinit_late_sriov(adev);
4083         if (r)
4084                 goto error;
4085
4086         amdgpu_irq_gpu_reset_resume_helper(adev);
4087         r = amdgpu_ib_ring_tests(adev);
4088         amdgpu_amdkfd_post_reset(adev);
4089
4090 error:
4091         amdgpu_virt_release_full_gpu(adev, true);
4092         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4093                 amdgpu_inc_vram_lost(adev);
4094                 r = amdgpu_device_recover_vram(adev);
4095         }
4096
4097         return r;
4098 }
4099
4100 /**
4101  * amdgpu_device_has_job_running - check if there is any job in mirror list
4102  *
4103  * @adev: amdgpu device pointer
4104  *
4105  * check if there is any job in mirror list
4106  */
4107 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4108 {
4109         int i;
4110         struct drm_sched_job *job;
4111
4112         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4113                 struct amdgpu_ring *ring = adev->rings[i];
4114
4115                 if (!ring || !ring->sched.thread)
4116                         continue;
4117
4118                 spin_lock(&ring->sched.job_list_lock);
4119                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4120                                 struct drm_sched_job, node);
4121                 spin_unlock(&ring->sched.job_list_lock);
4122                 if (job)
4123                         return true;
4124         }
4125         return false;
4126 }
4127
4128 /**
4129  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4130  *
4131  * @adev: amdgpu device pointer
4132  *
4133  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4134  * a hung GPU.
4135  */
4136 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4137 {
4138         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4139                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4140                 return false;
4141         }
4142
4143         if (amdgpu_gpu_recovery == 0)
4144                 goto disabled;
4145
4146         if (amdgpu_sriov_vf(adev))
4147                 return true;
4148
4149         if (amdgpu_gpu_recovery == -1) {
4150                 switch (adev->asic_type) {
4151                 case CHIP_BONAIRE:
4152                 case CHIP_HAWAII:
4153                 case CHIP_TOPAZ:
4154                 case CHIP_TONGA:
4155                 case CHIP_FIJI:
4156                 case CHIP_POLARIS10:
4157                 case CHIP_POLARIS11:
4158                 case CHIP_POLARIS12:
4159                 case CHIP_VEGAM:
4160                 case CHIP_VEGA20:
4161                 case CHIP_VEGA10:
4162                 case CHIP_VEGA12:
4163                 case CHIP_RAVEN:
4164                 case CHIP_ARCTURUS:
4165                 case CHIP_RENOIR:
4166                 case CHIP_NAVI10:
4167                 case CHIP_NAVI14:
4168                 case CHIP_NAVI12:
4169                 case CHIP_SIENNA_CICHLID:
4170                         break;
4171                 default:
4172                         goto disabled;
4173                 }
4174         }
4175
4176         return true;
4177
4178 disabled:
4179                 dev_info(adev->dev, "GPU recovery disabled.\n");
4180                 return false;
4181 }
4182
4183
4184 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4185                                         struct amdgpu_job *job,
4186                                         bool *need_full_reset_arg)
4187 {
4188         int i, r = 0;
4189         bool need_full_reset  = *need_full_reset_arg;
4190
4191         amdgpu_debugfs_wait_dump(adev);
4192
4193         if (amdgpu_sriov_vf(adev)) {
4194                 /* stop the data exchange thread */
4195                 amdgpu_virt_fini_data_exchange(adev);
4196         }
4197
4198         /* block all schedulers and reset given job's ring */
4199         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4200                 struct amdgpu_ring *ring = adev->rings[i];
4201
4202                 if (!ring || !ring->sched.thread)
4203                         continue;
4204
4205                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4206                 amdgpu_fence_driver_force_completion(ring);
4207         }
4208
4209         if(job)
4210                 drm_sched_increase_karma(&job->base);
4211
4212         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4213         if (!amdgpu_sriov_vf(adev)) {
4214
4215                 if (!need_full_reset)
4216                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4217
4218                 if (!need_full_reset) {
4219                         amdgpu_device_ip_pre_soft_reset(adev);
4220                         r = amdgpu_device_ip_soft_reset(adev);
4221                         amdgpu_device_ip_post_soft_reset(adev);
4222                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4223                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4224                                 need_full_reset = true;
4225                         }
4226                 }
4227
4228                 if (need_full_reset)
4229                         r = amdgpu_device_ip_suspend(adev);
4230
4231                 *need_full_reset_arg = need_full_reset;
4232         }
4233
4234         return r;
4235 }
4236
4237 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4238                                struct list_head *device_list_handle,
4239                                bool *need_full_reset_arg,
4240                                bool skip_hw_reset)
4241 {
4242         struct amdgpu_device *tmp_adev = NULL;
4243         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4244         int r = 0;
4245
4246         /*
4247          * ASIC reset has to be done on all HGMI hive nodes ASAP
4248          * to allow proper links negotiation in FW (within 1 sec)
4249          */
4250         if (!skip_hw_reset && need_full_reset) {
4251                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4252                         /* For XGMI run all resets in parallel to speed up the process */
4253                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4254                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4255                                         r = -EALREADY;
4256                         } else
4257                                 r = amdgpu_asic_reset(tmp_adev);
4258
4259                         if (r) {
4260                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4261                                          r, adev_to_drm(tmp_adev)->unique);
4262                                 break;
4263                         }
4264                 }
4265
4266                 /* For XGMI wait for all resets to complete before proceed */
4267                 if (!r) {
4268                         list_for_each_entry(tmp_adev, device_list_handle,
4269                                             gmc.xgmi.head) {
4270                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4271                                         flush_work(&tmp_adev->xgmi_reset_work);
4272                                         r = tmp_adev->asic_reset_res;
4273                                         if (r)
4274                                                 break;
4275                                 }
4276                         }
4277                 }
4278         }
4279
4280         if (!r && amdgpu_ras_intr_triggered()) {
4281                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4282                         if (tmp_adev->mmhub.funcs &&
4283                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4284                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4285                 }
4286
4287                 amdgpu_ras_intr_cleared();
4288         }
4289
4290         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4291                 if (need_full_reset) {
4292                         /* post card */
4293                         if (amdgpu_device_asic_init(tmp_adev))
4294                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4295
4296                         if (!r) {
4297                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4298                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4299                                 if (r)
4300                                         goto out;
4301
4302                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4303                                 if (vram_lost) {
4304                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4305                                         amdgpu_inc_vram_lost(tmp_adev);
4306                                 }
4307
4308                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4309                                 if (r)
4310                                         goto out;
4311
4312                                 r = amdgpu_device_fw_loading(tmp_adev);
4313                                 if (r)
4314                                         return r;
4315
4316                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4317                                 if (r)
4318                                         goto out;
4319
4320                                 if (vram_lost)
4321                                         amdgpu_device_fill_reset_magic(tmp_adev);
4322
4323                                 /*
4324                                  * Add this ASIC as tracked as reset was already
4325                                  * complete successfully.
4326                                  */
4327                                 amdgpu_register_gpu_instance(tmp_adev);
4328
4329                                 r = amdgpu_device_ip_late_init(tmp_adev);
4330                                 if (r)
4331                                         goto out;
4332
4333                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4334
4335                                 /*
4336                                  * The GPU enters bad state once faulty pages
4337                                  * by ECC has reached the threshold, and ras
4338                                  * recovery is scheduled next. So add one check
4339                                  * here to break recovery if it indeed exceeds
4340                                  * bad page threshold, and remind user to
4341                                  * retire this GPU or setting one bigger
4342                                  * bad_page_threshold value to fix this once
4343                                  * probing driver again.
4344                                  */
4345                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4346                                         /* must succeed. */
4347                                         amdgpu_ras_resume(tmp_adev);
4348                                 } else {
4349                                         r = -EINVAL;
4350                                         goto out;
4351                                 }
4352
4353                                 /* Update PSP FW topology after reset */
4354                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4355                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4356                         }
4357                 }
4358
4359 out:
4360                 if (!r) {
4361                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4362                         r = amdgpu_ib_ring_tests(tmp_adev);
4363                         if (r) {
4364                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4365                                 r = amdgpu_device_ip_suspend(tmp_adev);
4366                                 need_full_reset = true;
4367                                 r = -EAGAIN;
4368                                 goto end;
4369                         }
4370                 }
4371
4372                 if (!r)
4373                         r = amdgpu_device_recover_vram(tmp_adev);
4374                 else
4375                         tmp_adev->asic_reset_res = r;
4376         }
4377
4378 end:
4379         *need_full_reset_arg = need_full_reset;
4380         return r;
4381 }
4382
4383 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4384                                 struct amdgpu_hive_info *hive)
4385 {
4386         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4387                 return false;
4388
4389         if (hive) {
4390                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4391         } else {
4392                 down_write(&adev->reset_sem);
4393         }
4394
4395         atomic_inc(&adev->gpu_reset_counter);
4396         switch (amdgpu_asic_reset_method(adev)) {
4397         case AMD_RESET_METHOD_MODE1:
4398                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4399                 break;
4400         case AMD_RESET_METHOD_MODE2:
4401                 adev->mp1_state = PP_MP1_STATE_RESET;
4402                 break;
4403         default:
4404                 adev->mp1_state = PP_MP1_STATE_NONE;
4405                 break;
4406         }
4407
4408         return true;
4409 }
4410
/**
 * amdgpu_device_unlock_adev - release a device after GPU reset
 *
 * @adev: amdgpu device pointer
 *
 * Counterpart of amdgpu_device_lock_adev(): resets the MP1 state to
 * PP_MP1_STATE_NONE, clears in_gpu_reset and releases the reset semaphore
 * taken for writing.
 */
static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
{
        amdgpu_vf_error_trans_all(adev);
        adev->mp1_state = PP_MP1_STATE_NONE;
        /* in_gpu_reset is cleared before the semaphore is dropped. */
        atomic_set(&adev->in_gpu_reset, 0);
        up_write(&adev->reset_sem);
}
4418
4419 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4420 {
4421         struct pci_dev *p = NULL;
4422
4423         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4424                         adev->pdev->bus->number, 1);
4425         if (p) {
4426                 pm_runtime_enable(&(p->dev));
4427                 pm_runtime_resume(&(p->dev));
4428         }
4429 }
4430
4431 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4432 {
4433         enum amd_reset_method reset_method;
4434         struct pci_dev *p = NULL;
4435         u64 expires;
4436
4437         /*
4438          * For now, only BACO and mode1 reset are confirmed
4439          * to suffer the audio issue without proper suspended.
4440          */
4441         reset_method = amdgpu_asic_reset_method(adev);
4442         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4443              (reset_method != AMD_RESET_METHOD_MODE1))
4444                 return -EINVAL;
4445
4446         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4447                         adev->pdev->bus->number, 1);
4448         if (!p)
4449                 return -ENODEV;
4450
4451         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4452         if (!expires)
4453                 /*
4454                  * If we cannot get the audio device autosuspend delay,
4455                  * a fixed 4S interval will be used. Considering 3S is
4456                  * the audio controller default autosuspend delay setting.
4457                  * 4S used here is guaranteed to cover that.
4458                  */
4459                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4460
4461         while (!pm_runtime_status_suspended(&(p->dev))) {
4462                 if (!pm_runtime_suspend(&(p->dev)))
4463                         break;
4464
4465                 if (expires < ktime_get_mono_fast_ns()) {
4466                         dev_warn(adev->dev, "failed to suspend display audio\n");
4467                         /* TODO: abort the succeeding gpu reset? */
4468                         return -ETIMEDOUT;
4469                 }
4470         }
4471
4472         pm_runtime_disable(&(p->dev));
4473
4474         return 0;
4475 }
4476
4477 /**
4478  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4479  *
4480  * @adev: amdgpu device pointer
4481  * @job: which job trigger hang
4482  *
4483  * Attempt to reset the GPU if it has hung (all asics).
4484  * Attempt to do soft-reset or full-reset and reinitialize Asic
4485  * Returns 0 for success or an error on failure.
4486  */
4487
4488 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4489                               struct amdgpu_job *job)
4490 {
4491         struct list_head device_list, *device_list_handle =  NULL;
4492         bool need_full_reset = false;
4493         bool job_signaled = false;
4494         struct amdgpu_hive_info *hive = NULL;
4495         struct amdgpu_device *tmp_adev = NULL;
4496         int i, r = 0;
4497         bool need_emergency_restart = false;
4498         bool audio_suspended = false;
4499
4500         /**
4501          * Special case: RAS triggered and full reset isn't supported
4502          */
4503         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4504
4505         /*
4506          * Flush RAM to disk so that after reboot
4507          * the user can read log and see why the system rebooted.
4508          */
4509         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4510                 DRM_WARN("Emergency reboot.");
4511
4512                 ksys_sync_helper();
4513                 emergency_restart();
4514         }
4515
4516         dev_info(adev->dev, "GPU %s begin!\n",
4517                 need_emergency_restart ? "jobs stop":"reset");
4518
4519         /*
4520          * Here we trylock to avoid chain of resets executing from
4521          * either trigger by jobs on different adevs in XGMI hive or jobs on
4522          * different schedulers for same device while this TO handler is running.
4523          * We always reset all schedulers for device and all devices for XGMI
4524          * hive so that should take care of them too.
4525          */
4526         hive = amdgpu_get_xgmi_hive(adev);
4527         if (hive) {
4528                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4529                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4530                                 job ? job->base.id : -1, hive->hive_id);
4531                         amdgpu_put_xgmi_hive(hive);
4532                         return 0;
4533                 }
4534                 mutex_lock(&hive->hive_lock);
4535         }
4536
4537         /*
4538          * Build list of devices to reset.
4539          * In case we are in XGMI hive mode, resort the device list
4540          * to put adev in the 1st position.
4541          */
4542         INIT_LIST_HEAD(&device_list);
4543         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4544                 if (!hive)
4545                         return -ENODEV;
4546                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4547                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4548                 device_list_handle = &hive->device_list;
4549         } else {
4550                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4551                 device_list_handle = &device_list;
4552         }
4553
4554         /* block all schedulers and reset given job's ring */
4555         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4556                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4557                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4558                                   job ? job->base.id : -1);
4559                         r = 0;
4560                         goto skip_recovery;
4561                 }
4562
4563                 /*
4564                  * Try to put the audio codec into suspend state
4565                  * before gpu reset started.
4566                  *
4567                  * Due to the power domain of the graphics device
4568                  * is shared with AZ power domain. Without this,
4569                  * we may change the audio hardware from behind
4570                  * the audio driver's back. That will trigger
4571                  * some audio codec errors.
4572                  */
4573                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4574                         audio_suspended = true;
4575
4576                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4577
4578                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4579
4580                 if (!amdgpu_sriov_vf(tmp_adev))
4581                         amdgpu_amdkfd_pre_reset(tmp_adev);
4582
4583                 /*
4584                  * Mark these ASICs to be reseted as untracked first
4585                  * And add them back after reset completed
4586                  */
4587                 amdgpu_unregister_gpu_instance(tmp_adev);
4588
4589                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4590
4591                 /* disable ras on ALL IPs */
4592                 if (!need_emergency_restart &&
4593                       amdgpu_device_ip_need_full_reset(tmp_adev))
4594                         amdgpu_ras_suspend(tmp_adev);
4595
4596                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4597                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4598
4599                         if (!ring || !ring->sched.thread)
4600                                 continue;
4601
4602                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4603
4604                         if (need_emergency_restart)
4605                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4606                 }
4607         }
4608
4609         if (need_emergency_restart)
4610                 goto skip_sched_resume;
4611
4612         /*
4613          * Must check guilty signal here since after this point all old
4614          * HW fences are force signaled.
4615          *
4616          * job->base holds a reference to parent fence
4617          */
4618         if (job && job->base.s_fence->parent &&
4619             dma_fence_is_signaled(job->base.s_fence->parent)) {
4620                 job_signaled = true;
4621                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4622                 goto skip_hw_reset;
4623         }
4624
4625 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4626         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4627                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4628                                                  NULL,
4629                                                  &need_full_reset);
4630                 /*TODO Should we stop ?*/
4631                 if (r) {
4632                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4633                                   r, adev_to_drm(tmp_adev)->unique);
4634                         tmp_adev->asic_reset_res = r;
4635                 }
4636         }
4637
4638         /* Actual ASIC resets if needed.*/
4639         /* TODO Implement XGMI hive reset logic for SRIOV */
4640         if (amdgpu_sriov_vf(adev)) {
4641                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4642                 if (r)
4643                         adev->asic_reset_res = r;
4644         } else {
4645                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4646                 if (r && r == -EAGAIN)
4647                         goto retry;
4648         }
4649
4650 skip_hw_reset:
4651
4652         /* Post ASIC reset for all devs .*/
4653         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4654
4655                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4656                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4657
4658                         if (!ring || !ring->sched.thread)
4659                                 continue;
4660
4661                         /* No point to resubmit jobs if we didn't HW reset*/
4662                         if (!tmp_adev->asic_reset_res && !job_signaled)
4663                                 drm_sched_resubmit_jobs(&ring->sched);
4664
4665                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4666                 }
4667
4668                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4669                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4670                 }
4671
4672                 tmp_adev->asic_reset_res = 0;
4673
4674                 if (r) {
4675                         /* bad news, how to tell it to userspace ? */
4676                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4677                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4678                 } else {
4679                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4680                 }
4681         }
4682
4683 skip_sched_resume:
4684         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4685                 /*unlock kfd: SRIOV would do it separately */
4686                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4687                         amdgpu_amdkfd_post_reset(tmp_adev);
4688                 if (audio_suspended)
4689                         amdgpu_device_resume_display_audio(tmp_adev);
4690                 amdgpu_device_unlock_adev(tmp_adev);
4691         }
4692
4693 skip_recovery:
4694         if (hive) {
4695                 atomic_set(&hive->in_reset, 0);
4696                 mutex_unlock(&hive->hive_lock);
4697                 amdgpu_put_xgmi_hive(hive);
4698         }
4699
4700         if (r)
4701                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4702         return r;
4703 }
4704
4705 /**
4706  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4707  *
4708  * @adev: amdgpu_device pointer
4709  *
4710  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4711  * and lanes) of the slot the device is in. Handles APUs and
4712  * virtualized environments where PCIE config space may not be available.
4713  */
4714 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4715 {
4716         struct pci_dev *pdev;
4717         enum pci_bus_speed speed_cap, platform_speed_cap;
4718         enum pcie_link_width platform_link_width;
4719
4720         if (amdgpu_pcie_gen_cap)
4721                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4722
4723         if (amdgpu_pcie_lane_cap)
4724                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4725
4726         /* covers APUs as well */
4727         if (pci_is_root_bus(adev->pdev->bus)) {
4728                 if (adev->pm.pcie_gen_mask == 0)
4729                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4730                 if (adev->pm.pcie_mlw_mask == 0)
4731                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4732                 return;
4733         }
4734
4735         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4736                 return;
4737
4738         pcie_bandwidth_available(adev->pdev, NULL,
4739                                  &platform_speed_cap, &platform_link_width);
4740
4741         if (adev->pm.pcie_gen_mask == 0) {
4742                 /* asic caps */
4743                 pdev = adev->pdev;
4744                 speed_cap = pcie_get_speed_cap(pdev);
4745                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4746                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4747                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4748                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4749                 } else {
4750                         if (speed_cap == PCIE_SPEED_16_0GT)
4751                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4752                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4753                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4754                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4755                         else if (speed_cap == PCIE_SPEED_8_0GT)
4756                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4757                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4758                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4759                         else if (speed_cap == PCIE_SPEED_5_0GT)
4760                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4761                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4762                         else
4763                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4764                 }
4765                 /* platform caps */
4766                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4767                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4768                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4769                 } else {
4770                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4771                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4772                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4773                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4774                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4775                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4776                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4777                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4778                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4779                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4780                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4781                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4782                         else
4783                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4784
4785                 }
4786         }
4787         if (adev->pm.pcie_mlw_mask == 0) {
4788                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4789                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4790                 } else {
4791                         switch (platform_link_width) {
4792                         case PCIE_LNK_X32:
4793                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4794                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4795                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4796                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4797                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4798                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4799                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4800                                 break;
4801                         case PCIE_LNK_X16:
4802                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4803                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4804                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4805                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4806                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4807                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4808                                 break;
4809                         case PCIE_LNK_X12:
4810                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4811                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4812                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4813                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4814                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4815                                 break;
4816                         case PCIE_LNK_X8:
4817                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4818                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4819                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4820                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4821                                 break;
4822                         case PCIE_LNK_X4:
4823                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4824                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4825                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4826                                 break;
4827                         case PCIE_LNK_X2:
4828                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4829                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4830                                 break;
4831                         case PCIE_LNK_X1:
4832                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4833                                 break;
4834                         default:
4835                                 break;
4836                         }
4837                 }
4838         }
4839 }
4840
4841 int amdgpu_device_baco_enter(struct drm_device *dev)
4842 {
4843         struct amdgpu_device *adev = drm_to_adev(dev);
4844         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4845
4846         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4847                 return -ENOTSUPP;
4848
4849         if (ras && ras->supported)
4850                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4851
4852         return amdgpu_dpm_baco_enter(adev);
4853 }
4854
4855 int amdgpu_device_baco_exit(struct drm_device *dev)
4856 {
4857         struct amdgpu_device *adev = drm_to_adev(dev);
4858         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4859         int ret = 0;
4860
4861         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4862                 return -ENOTSUPP;
4863
4864         ret = amdgpu_dpm_baco_exit(adev);
4865         if (ret)
4866                 return ret;
4867
4868         if (ras && ras->supported)
4869                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4870
4871         return 0;
4872 }
4873
4874 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4875 {
4876         int i;
4877
4878         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4879                 struct amdgpu_ring *ring = adev->rings[i];
4880
4881                 if (!ring || !ring->sched.thread)
4882                         continue;
4883
4884                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4885         }
4886 }
4887
4888 /**
4889  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4890  * @pdev: PCI device struct
4891  * @state: PCI channel state
4892  *
4893  * Description: Called when a PCI error is detected.
4894  *
4895  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4896  */
4897 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4898 {
4899         struct drm_device *dev = pci_get_drvdata(pdev);
4900         struct amdgpu_device *adev = drm_to_adev(dev);
4901         int i;
4902
4903         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4904
4905         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4906                 DRM_WARN("No support for XGMI hive yet...");
4907                 return PCI_ERS_RESULT_DISCONNECT;
4908         }
4909
4910         switch (state) {
4911         case pci_channel_io_normal:
4912                 return PCI_ERS_RESULT_CAN_RECOVER;
4913         /* Fatal error, prepare for slot reset */
4914         case pci_channel_io_frozen:
4915                 /*
4916                  * Cancel and wait for all TDRs in progress if failing to
4917                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4918                  *
4919                  * Locking adev->reset_sem will prevent any external access
4920                  * to GPU during PCI error recovery
4921                  */
4922                 while (!amdgpu_device_lock_adev(adev, NULL))
4923                         amdgpu_cancel_all_tdr(adev);
4924
4925                 /*
4926                  * Block any work scheduling as we do for regular GPU reset
4927                  * for the duration of the recovery
4928                  */
4929                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4930                         struct amdgpu_ring *ring = adev->rings[i];
4931
4932                         if (!ring || !ring->sched.thread)
4933                                 continue;
4934
4935                         drm_sched_stop(&ring->sched, NULL);
4936                 }
4937                 return PCI_ERS_RESULT_NEED_RESET;
4938         case pci_channel_io_perm_failure:
4939                 /* Permanent error, prepare for device removal */
4940                 return PCI_ERS_RESULT_DISCONNECT;
4941         }
4942
4943         return PCI_ERS_RESULT_NEED_RESET;
4944 }
4945
4946 /**
4947  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4948  * @pdev: pointer to PCI device
4949  */
4950 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4951 {
4952
4953         DRM_INFO("PCI error: mmio enabled callback!!\n");
4954
4955         /* TODO - dump whatever for debugging purposes */
4956
4957         /* This called only if amdgpu_pci_error_detected returns
4958          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4959          * works, no need to reset slot.
4960          */
4961
4962         return PCI_ERS_RESULT_RECOVERED;
4963 }
4964
4965 /**
4966  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4967  * @pdev: PCI device struct
4968  *
4969  * Description: This routine is called by the pci error recovery
4970  * code after the PCI slot has been reset, just before we
4971  * should resume normal operations.
4972  */
4973 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4974 {
4975         struct drm_device *dev = pci_get_drvdata(pdev);
4976         struct amdgpu_device *adev = drm_to_adev(dev);
4977         int r, i;
4978         bool need_full_reset = true;
4979         u32 memsize;
4980         struct list_head device_list;
4981
4982         DRM_INFO("PCI error: slot reset callback!!\n");
4983
4984         INIT_LIST_HEAD(&device_list);
4985         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4986
4987         /* wait for asic to come out of reset */
4988         msleep(500);
4989
4990         /* Restore PCI confspace */
4991         amdgpu_device_load_pci_state(pdev);
4992
4993         /* confirm  ASIC came out of reset */
4994         for (i = 0; i < adev->usec_timeout; i++) {
4995                 memsize = amdgpu_asic_get_config_memsize(adev);
4996
4997                 if (memsize != 0xffffffff)
4998                         break;
4999                 udelay(1);
5000         }
5001         if (memsize == 0xffffffff) {
5002                 r = -ETIME;
5003                 goto out;
5004         }
5005
5006         adev->in_pci_err_recovery = true;
5007         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5008         adev->in_pci_err_recovery = false;
5009         if (r)
5010                 goto out;
5011
5012         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5013
5014 out:
5015         if (!r) {
5016                 if (amdgpu_device_cache_pci_state(adev->pdev))
5017                         pci_restore_state(adev->pdev);
5018
5019                 DRM_INFO("PCIe error recovery succeeded\n");
5020         } else {
5021                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5022                 amdgpu_device_unlock_adev(adev);
5023         }
5024
5025         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5026 }
5027
5028 /**
5029  * amdgpu_pci_resume() - resume normal ops after PCI reset
5030  * @pdev: pointer to PCI device
5031  *
5032  * Called when the error recovery driver tells us that its
5033  * OK to resume normal operation. Use completion to allow
5034  * halted scsi ops to resume.
5035  */
5036 void amdgpu_pci_resume(struct pci_dev *pdev)
5037 {
5038         struct drm_device *dev = pci_get_drvdata(pdev);
5039         struct amdgpu_device *adev = drm_to_adev(dev);
5040         int i;
5041
5042
5043         DRM_INFO("PCI error: resume callback!!\n");
5044
5045         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5046                 struct amdgpu_ring *ring = adev->rings[i];
5047
5048                 if (!ring || !ring->sched.thread)
5049                         continue;
5050
5051
5052                 drm_sched_resubmit_jobs(&ring->sched);
5053                 drm_sched_start(&ring->sched, true);
5054         }
5055
5056         amdgpu_device_unlock_adev(adev);
5057 }
5058
5059 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5060 {
5061         struct drm_device *dev = pci_get_drvdata(pdev);
5062         struct amdgpu_device *adev = drm_to_adev(dev);
5063         int r;
5064
5065         r = pci_save_state(pdev);
5066         if (!r) {
5067                 kfree(adev->pci_state);
5068
5069                 adev->pci_state = pci_store_saved_state(pdev);
5070
5071                 if (!adev->pci_state) {
5072                         DRM_ERROR("Failed to store PCI saved state");
5073                         return false;
5074                 }
5075         } else {
5076                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5077                 return false;
5078         }
5079
5080         return true;
5081 }
5082
5083 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5084 {
5085         struct drm_device *dev = pci_get_drvdata(pdev);
5086         struct amdgpu_device *adev = drm_to_adev(dev);
5087         int r;
5088
5089         if (!adev->pci_state)
5090                 return false;
5091
5092         r = pci_load_saved_state(pdev, adev->pci_state);
5093
5094         if (!r) {
5095                 pci_restore_state(pdev);
5096         } else {
5097                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5098                 return false;
5099         }
5100
5101         return true;
5102 }
5103
5104