drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
/*
 * gpu_info firmware image names declared by this module, one per ASIC
 * variant listed below.
 */
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/green_sardine_gpu_info.bin");

/*
 * NOTE(review): the name suggests a resume-related interval in
 * milliseconds; its users are not visible in this part of the file.
 */
#define AMDGPU_RESUME_MS                2000
  87
/*
 * Human-readable ASIC names.
 *
 * NOTE(review): presumably indexed by the ASIC type enumeration, in which
 * case the order here must match that enum exactly ("LAST" being the
 * terminating entry) - the enum is not visible here, confirm before
 * reordering or inserting entries.
 */
const char *amdgpu_asic_name[] = {
        "TAHITI",
        "PITCAIRN",
        "VERDE",
        "OLAND",
        "HAINAN",
        "BONAIRE",
        "KAVERI",
        "KABINI",
        "HAWAII",
        "MULLINS",
        "TOPAZ",
        "TONGA",
        "FIJI",
        "CARRIZO",
        "STONEY",
        "POLARIS10",
        "POLARIS11",
        "POLARIS12",
        "VEGAM",
        "VEGA10",
        "VEGA12",
        "VEGA20",
        "RAVEN",
        "ARCTURUS",
        "RENOIR",
        "NAVI10",
        "NAVI14",
        "NAVI12",
        "SIENNA_CICHLID",
        "NAVY_FLOUNDER",
        "VANGOGH",
        "DIMGREY_CAVEFISH",
        "LAST",
};
 123
 124 /**
 125  * DOC: pcie_replay_count
 126  *
 127  * The amdgpu driver provides a sysfs API for reporting the total number
 128  * of PCIe replays (NAKs)
 129  * The file pcie_replay_count is used for this and returns the total
 130  * number of replays as a sum of the NAKs generated and NAKs received
 131  */
 132
 133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 134                 struct device_attribute *attr, char *buf)
 135 {
 136         struct drm_device *ddev = dev_get_drvdata(dev);
 137         struct amdgpu_device *adev = drm_to_adev(ddev);
 138         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 139
 140         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 141 }
 142
 143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 144                 amdgpu_device_get_pcie_replay_count, NULL);
 145
 146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 147
/**
 * DOC: product_name
 *
 * The amdgpu driver provides a sysfs API for reporting the product name
 * for the device.
 * The file product_name is used for this and returns the product name
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */
 157
 158 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 159                 struct device_attribute *attr, char *buf)
 160 {
 161         struct drm_device *ddev = dev_get_drvdata(dev);
 162         struct amdgpu_device *adev = drm_to_adev(ddev);
 163
 164         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 165 }
 166
 167 static DEVICE_ATTR(product_name, S_IRUGO,
 168                 amdgpu_device_get_product_name, NULL);
 169
/**
 * DOC: product_number
 *
 * The amdgpu driver provides a sysfs API for reporting the part number
 * for the device.
 * The file product_number is used for this and returns the part number
 * as returned from the FRU.
 * NOTE: This is only available for certain server cards.
 */
 179
 180 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 181                 struct device_attribute *attr, char *buf)
 182 {
 183         struct drm_device *ddev = dev_get_drvdata(dev);
 184         struct amdgpu_device *adev = drm_to_adev(ddev);
 185
 186         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 187 }
 188
 189 static DEVICE_ATTR(product_number, S_IRUGO,
 190                 amdgpu_device_get_product_number, NULL);
 191
 192 /**
 193  * DOC: serial_number
 194  *
 195  * The amdgpu driver provides a sysfs API for reporting the serial number
 196  * for the device
 197  * The file serial_number is used for this and returns the serial number
 198  * as returned from the FRU.
 199  * NOTE: This is only available for certain server cards
 200  */
 201
 202 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 203                 struct device_attribute *attr, char *buf)
 204 {
 205         struct drm_device *ddev = dev_get_drvdata(dev);
 206         struct amdgpu_device *adev = drm_to_adev(ddev);
 207
 208         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 209 }
 210
 211 static DEVICE_ATTR(serial_number, S_IRUGO,
 212                 amdgpu_device_get_serial_number, NULL);
 213
 214 /**
 215  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 216  *
 217  * @dev: drm_device pointer
 218  *
 219  * Returns true if the device is a dGPU with HG/PX power control,
 220  * otherwise return false.
 221  */
 222 bool amdgpu_device_supports_boco(struct drm_device *dev)
 223 {
 224         struct amdgpu_device *adev = drm_to_adev(dev);
 225
 226         if (adev->flags & AMD_IS_PX)
 227                 return true;
 228         return false;
 229 }
 230
 231 /**
 232  * amdgpu_device_supports_baco - Does the device support BACO
 233  *
 234  * @dev: drm_device pointer
 235  *
 236  * Returns true if the device supporte BACO,
 237  * otherwise return false.
 238  */
 239 bool amdgpu_device_supports_baco(struct drm_device *dev)
 240 {
 241         struct amdgpu_device *adev = drm_to_adev(dev);
 242
 243         return amdgpu_asic_supports_baco(adev);
 244 }
 245
 246 /*
 247  * VRAM access helper functions
 248  */
 249
/**
 * amdgpu_device_vram_access - read/write a buffer in vram
 *
 * @adev: amdgpu_device pointer
 * @pos: byte offset of the buffer in vram
 * @buf: virtual address of the buffer in system memory
 * @size: read/write size in bytes; @buf must hold at least @size bytes
 * @write: true - write to vram, otherwise - read from vram
 *
 * On CONFIG_64BIT builds, the part of the range that lies below
 * adev->gmc.visible_vram_size is copied through the mapping at
 * adev->mman.aper_base_kaddr.  Whatever remains (or the whole range on
 * other builds) is transferred one dword at a time through the
 * MM_INDEX/MM_INDEX_HI/MM_DATA registers while holding mmio_idx_lock.
 */
void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
                               uint32_t *buf, size_t size, bool write)
{
        unsigned long flags;
        /* Last value written to MM_INDEX_HI; ~0 forces the first write. */
        uint32_t hi = ~0;
        uint64_t last;


#ifdef CONFIG_64BIT
        /* Clamp the end of the directly accessible part to visible VRAM. */
        last = min(pos + size, adev->gmc.visible_vram_size);
        if (last > pos) {
                void __iomem *addr = adev->mman.aper_base_kaddr + pos;
                size_t count = last - pos;

                if (write) {
                        /* Copy first, then barrier and HDP flush. */
                        memcpy_toio(addr, buf, count);
                        mb();
                        amdgpu_asic_flush_hdp(adev, NULL);
                } else {
                        /* HDP invalidate and barrier first, then copy. */
                        amdgpu_asic_invalidate_hdp(adev, NULL);
                        mb();
                        memcpy_fromio(buf, addr, count);
                }

                /* Everything fit into the visible window: done. */
                if (count == size)
                        return;

                /* Advance past the part already handled; buf is a dword
                 * pointer while count is in bytes.
                 */
                pos += count;
                buf += count / 4;
                size -= count;
        }
#endif

        spin_lock_irqsave(&adev->mmio_idx_lock, flags);
        for (last = pos + size; pos < last; pos += 4) {
                /* Bits 31 and up of the offset go to MM_INDEX_HI. */
                uint32_t tmp = pos >> 31;

                /* NOTE(review): bit 31 of MM_INDEX is always set here;
                 * presumably it selects VRAM access - confirm against the
                 * register specification.
                 */
                WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
                /* Only rewrite MM_INDEX_HI when the high part changes. */
                if (tmp != hi) {
                        WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
                        hi = tmp;
                }
                if (write)
                        WREG32_NO_KIQ(mmMM_DATA, *buf++);
                else
                        *buf++ = RREG32_NO_KIQ(mmMM_DATA);
        }
        spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
}
 308
 309 /*
 310  * register access helper functions.
 311  */
 312 /**
 313  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 314  *
 315  * @adev: amdgpu_device pointer
 316  * @reg: dword aligned register offset
 317  * @acc_flags: access flags which require special behavior
 318  *
 319  * Returns the 32 bit value from the offset specified.
 320  */
 321 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 322                             uint32_t reg, uint32_t acc_flags)
 323 {
 324         uint32_t ret;
 325
 326         if (adev->in_pci_err_recovery)
 327                 return 0;
 328
 329         if ((reg * 4) < adev->rmmio_size) {
 330                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 331                     amdgpu_sriov_runtime(adev) &&
 332                     down_read_trylock(&adev->reset_sem)) {
 333                         ret = amdgpu_kiq_rreg(adev, reg);
 334                         up_read(&adev->reset_sem);
 335                 } else {
 336                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 337                 }
 338         } else {
 339                 ret = adev->pcie_rreg(adev, reg * 4);
 340         }
 341
 342         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 343
 344         return ret;
 345 }
 346
 347 /*
 348  * MMIO register read with bytes helper functions
 349  * @offset:bytes offset from MMIO start
 350  *
 351 */
 352
 353 /**
 354  * amdgpu_mm_rreg8 - read a memory mapped IO register
 355  *
 356  * @adev: amdgpu_device pointer
 357  * @offset: byte aligned register offset
 358  *
 359  * Returns the 8 bit value from the offset specified.
 360  */
 361 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 362 {
 363         if (adev->in_pci_err_recovery)
 364                 return 0;
 365
 366         if (offset < adev->rmmio_size)
 367                 return (readb(adev->rmmio + offset));
 368         BUG();
 369 }
 370
 371 /*
 372  * MMIO register write with bytes helper functions
 373  * @offset:bytes offset from MMIO start
 374  * @value: the value want to be written to the register
 375  *
 376 */
 377 /**
 378  * amdgpu_mm_wreg8 - read a memory mapped IO register
 379  *
 380  * @adev: amdgpu_device pointer
 381  * @offset: byte aligned register offset
 382  * @value: 8 bit value to write
 383  *
 384  * Writes the value specified to the offset specified.
 385  */
 386 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 387 {
 388         if (adev->in_pci_err_recovery)
 389                 return;
 390
 391         if (offset < adev->rmmio_size)
 392                 writeb(value, adev->rmmio + offset);
 393         else
 394                 BUG();
 395 }
 396
 397 /**
 398  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 399  *
 400  * @adev: amdgpu_device pointer
 401  * @reg: dword aligned register offset
 402  * @v: 32 bit value to write to the register
 403  * @acc_flags: access flags which require special behavior
 404  *
 405  * Writes the value specified to the offset specified.
 406  */
 407 void amdgpu_device_wreg(struct amdgpu_device *adev,
 408                         uint32_t reg, uint32_t v,
 409                         uint32_t acc_flags)
 410 {
 411         if (adev->in_pci_err_recovery)
 412                 return;
 413
 414         if ((reg * 4) < adev->rmmio_size) {
 415                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 416                     amdgpu_sriov_runtime(adev) &&
 417                     down_read_trylock(&adev->reset_sem)) {
 418                         amdgpu_kiq_wreg(adev, reg, v);
 419                         up_read(&adev->reset_sem);
 420                 } else {
 421                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 422                 }
 423         } else {
 424                 adev->pcie_wreg(adev, reg * 4, v);
 425         }
 426
 427         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 428 }
 429
 430 /*
 431  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 432  *
 433  * this function is invoked only the debugfs register access
 434  * */
 435 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 436                              uint32_t reg, uint32_t v)
 437 {
 438         if (adev->in_pci_err_recovery)
 439                 return;
 440
 441         if (amdgpu_sriov_fullaccess(adev) &&
 442             adev->gfx.rlc.funcs &&
 443             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 444                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 445                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 446         } else {
 447                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 448         }
 449 }
 450
 451 /**
 452  * amdgpu_io_rreg - read an IO register
 453  *
 454  * @adev: amdgpu_device pointer
 455  * @reg: dword aligned register offset
 456  *
 457  * Returns the 32 bit value from the offset specified.
 458  */
 459 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 460 {
 461         if (adev->in_pci_err_recovery)
 462                 return 0;
 463
 464         if ((reg * 4) < adev->rio_mem_size)
 465                 return ioread32(adev->rio_mem + (reg * 4));
 466         else {
 467                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 468                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 469         }
 470 }
 471
 472 /**
 473  * amdgpu_io_wreg - write to an IO register
 474  *
 475  * @adev: amdgpu_device pointer
 476  * @reg: dword aligned register offset
 477  * @v: 32 bit value to write to the register
 478  *
 479  * Writes the value specified to the offset specified.
 480  */
 481 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 482 {
 483         if (adev->in_pci_err_recovery)
 484                 return;
 485
 486         if ((reg * 4) < adev->rio_mem_size)
 487                 iowrite32(v, adev->rio_mem + (reg * 4));
 488         else {
 489                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 490                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 491         }
 492 }
 493
 494 /**
 495  * amdgpu_mm_rdoorbell - read a doorbell dword
 496  *
 497  * @adev: amdgpu_device pointer
 498  * @index: doorbell index
 499  *
 500  * Returns the value in the doorbell aperture at the
 501  * requested doorbell index (CIK).
 502  */
 503 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 504 {
 505         if (adev->in_pci_err_recovery)
 506                 return 0;
 507
 508         if (index < adev->doorbell.num_doorbells) {
 509                 return readl(adev->doorbell.ptr + index);
 510         } else {
 511                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 512                 return 0;
 513         }
 514 }
 515
 516 /**
 517  * amdgpu_mm_wdoorbell - write a doorbell dword
 518  *
 519  * @adev: amdgpu_device pointer
 520  * @index: doorbell index
 521  * @v: value to write
 522  *
 523  * Writes @v to the doorbell aperture at the
 524  * requested doorbell index (CIK).
 525  */
 526 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 527 {
 528         if (adev->in_pci_err_recovery)
 529                 return;
 530
 531         if (index < adev->doorbell.num_doorbells) {
 532                 writel(v, adev->doorbell.ptr + index);
 533         } else {
 534                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 535         }
 536 }
 537
 538 /**
 539  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 540  *
 541  * @adev: amdgpu_device pointer
 542  * @index: doorbell index
 543  *
 544  * Returns the value in the doorbell aperture at the
 545  * requested doorbell index (VEGA10+).
 546  */
 547 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 548 {
 549         if (adev->in_pci_err_recovery)
 550                 return 0;
 551
 552         if (index < adev->doorbell.num_doorbells) {
 553                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 554         } else {
 555                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 556                 return 0;
 557         }
 558 }
 559
 560 /**
 561  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 562  *
 563  * @adev: amdgpu_device pointer
 564  * @index: doorbell index
 565  * @v: value to write
 566  *
 567  * Writes @v to the doorbell aperture at the
 568  * requested doorbell index (VEGA10+).
 569  */
 570 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 571 {
 572         if (adev->in_pci_err_recovery)
 573                 return;
 574
 575         if (index < adev->doorbell.num_doorbells) {
 576                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 577         } else {
 578                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 579         }
 580 }
 581
 582 /**
 583  * amdgpu_device_indirect_rreg - read an indirect register
 584  *
 585  * @adev: amdgpu_device pointer
 586  * @pcie_index: mmio register offset
 587  * @pcie_data: mmio register offset
 588  *
 589  * Returns the value of indirect register @reg_addr
 590  */
 591 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 592                                 u32 pcie_index, u32 pcie_data,
 593                                 u32 reg_addr)
 594 {
 595         unsigned long flags;
 596         u32 r;
 597         void __iomem *pcie_index_offset;
 598         void __iomem *pcie_data_offset;
 599
 600         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 601         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 602         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 603
 604         writel(reg_addr, pcie_index_offset);
 605         readl(pcie_index_offset);
 606         r = readl(pcie_data_offset);
 607         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 608
 609         return r;
 610 }
 611
 612 /**
 613  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 614  *
 615  * @adev: amdgpu_device pointer
 616  * @pcie_index: mmio register offset
 617  * @pcie_data: mmio register offset
 618  *
 619  * Returns the value of indirect register @reg_addr
 620  */
 621 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 622                                   u32 pcie_index, u32 pcie_data,
 623                                   u32 reg_addr)
 624 {
 625         unsigned long flags;
 626         u64 r;
 627         void __iomem *pcie_index_offset;
 628         void __iomem *pcie_data_offset;
 629
 630         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 631         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 632         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 633
 634         /* read low 32 bits */
 635         writel(reg_addr, pcie_index_offset);
 636         readl(pcie_index_offset);
 637         r = readl(pcie_data_offset);
 638         /* read high 32 bits */
 639         writel(reg_addr + 4, pcie_index_offset);
 640         readl(pcie_index_offset);
 641         r |= ((u64)readl(pcie_data_offset) << 32);
 642         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 643
 644         return r;
 645 }
 646
 647 /**
 648  * amdgpu_device_indirect_wreg - write an indirect register address
 649  *
 650  * @adev: amdgpu_device pointer
 651  * @pcie_index: mmio register offset
 652  * @pcie_data: mmio register offset
 653  * @reg_addr: indirect register offset
 654  * @reg_data: indirect register data
 655  *
 656  */
 657 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 658                                  u32 pcie_index, u32 pcie_data,
 659                                  u32 reg_addr, u32 reg_data)
 660 {
 661         unsigned long flags;
 662         void __iomem *pcie_index_offset;
 663         void __iomem *pcie_data_offset;
 664
 665         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 666         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 667         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 668
 669         writel(reg_addr, pcie_index_offset);
 670         readl(pcie_index_offset);
 671         writel(reg_data, pcie_data_offset);
 672         readl(pcie_data_offset);
 673         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 674 }
 675
 676 /**
 677  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 678  *
 679  * @adev: amdgpu_device pointer
 680  * @pcie_index: mmio register offset
 681  * @pcie_data: mmio register offset
 682  * @reg_addr: indirect register offset
 683  * @reg_data: indirect register data
 684  *
 685  */
 686 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 687                                    u32 pcie_index, u32 pcie_data,
 688                                    u32 reg_addr, u64 reg_data)
 689 {
 690         unsigned long flags;
 691         void __iomem *pcie_index_offset;
 692         void __iomem *pcie_data_offset;
 693
 694         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 695         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 696         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 697
 698         /* write low 32 bits */
 699         writel(reg_addr, pcie_index_offset);
 700         readl(pcie_index_offset);
 701         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 702         readl(pcie_data_offset);
 703         /* write high 32 bits */
 704         writel(reg_addr + 4, pcie_index_offset);
 705         readl(pcie_index_offset);
 706         writel((u32)(reg_data >> 32), pcie_data_offset);
 707         readl(pcie_data_offset);
 708         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 709 }
 710
 711 /**
 712  * amdgpu_invalid_rreg - dummy reg read function
 713  *
 714  * @adev: amdgpu_device pointer
 715  * @reg: offset of register
 716  *
 717  * Dummy register read function.  Used for register blocks
 718  * that certain asics don't have (all asics).
 719  * Returns the value in the register.
 720  */
 721 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 722 {
 723         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 724         BUG();
 725         return 0;
 726 }
 727
 728 /**
 729  * amdgpu_invalid_wreg - dummy reg write function
 730  *
 731  * @adev: amdgpu_device pointer
 732  * @reg: offset of register
 733  * @v: value to write to the register
 734  *
 735  * Dummy register read function.  Used for register blocks
 736  * that certain asics don't have (all asics).
 737  */
 738 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 739 {
 740         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 741                   reg, v);
 742         BUG();
 743 }
 744
 745 /**
 746  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 747  *
 748  * @adev: amdgpu_device pointer
 749  * @reg: offset of register
 750  *
 751  * Dummy register read function.  Used for register blocks
 752  * that certain asics don't have (all asics).
 753  * Returns the value in the register.
 754  */
 755 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 756 {
 757         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 758         BUG();
 759         return 0;
 760 }
 761
 762 /**
 763  * amdgpu_invalid_wreg64 - dummy reg write function
 764  *
 765  * @adev: amdgpu_device pointer
 766  * @reg: offset of register
 767  * @v: value to write to the register
 768  *
 769  * Dummy register read function.  Used for register blocks
 770  * that certain asics don't have (all asics).
 771  */
 772 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 773 {
 774         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 775                   reg, v);
 776         BUG();
 777 }
 778
 779 /**
 780  * amdgpu_block_invalid_rreg - dummy reg read function
 781  *
 782  * @adev: amdgpu_device pointer
 783  * @block: offset of instance
 784  * @reg: offset of register
 785  *
 786  * Dummy register read function.  Used for register blocks
 787  * that certain asics don't have (all asics).
 788  * Returns the value in the register.
 789  */
 790 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 791                                           uint32_t block, uint32_t reg)
 792 {
 793         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 794                   reg, block);
 795         BUG();
 796         return 0;
 797 }
 798
 799 /**
 800  * amdgpu_block_invalid_wreg - dummy reg write function
 801  *
 802  * @adev: amdgpu_device pointer
 803  * @block: offset of instance
 804  * @reg: offset of register
 805  * @v: value to write to the register
 806  *
 807  * Dummy register read function.  Used for register blocks
 808  * that certain asics don't have (all asics).
 809  */
 810 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 811                                       uint32_t block,
 812                                       uint32_t reg, uint32_t v)
 813 {
 814         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 815                   reg, block, v);
 816         BUG();
 817 }
 818
 819 /**
 820  * amdgpu_device_asic_init - Wrapper for atom asic_init
 821  *
 822  * @adev: amdgpu_device pointer
 823  *
 824  * Does any asic specific work and then calls atom asic init.
 825  */
 826 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 827 {
 828         amdgpu_asic_pre_asic_init(adev);
 829
 830         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 831 }
 832
 833 /**
 834  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 835  *
 836  * @adev: amdgpu_device pointer
 837  *
 838  * Allocates a scratch page of VRAM for use by various things in the
 839  * driver.
 840  */
 841 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 842 {
 843         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 844                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 845                                        &adev->vram_scratch.robj,
 846                                        &adev->vram_scratch.gpu_addr,
 847                                        (void **)&adev->vram_scratch.ptr);
 848 }
 849
 850 /**
 851  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 852  *
 853  * @adev: amdgpu_device pointer
 854  *
 855  * Frees the VRAM scratch page.
 856  */
 857 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 858 {
 859         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 860 }
 861
 862 /**
 863  * amdgpu_device_program_register_sequence - program an array of registers.
 864  *
 865  * @adev: amdgpu_device pointer
 866  * @registers: pointer to the register array
 867  * @array_size: size of the register array
 868  *
 869  * Programs an array or registers with and and or masks.
 870  * This is a helper for setting golden registers.
 871  */
 872 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 873                                              const u32 *registers,
 874                                              const u32 array_size)
 875 {
 876         u32 tmp, reg, and_mask, or_mask;
 877         int i;
 878
 879         if (array_size % 3)
 880                 return;
 881
 882         for (i = 0; i < array_size; i +=3) {
 883                 reg = registers[i + 0];
 884                 and_mask = registers[i + 1];
 885                 or_mask = registers[i + 2];
 886
 887                 if (and_mask == 0xffffffff) {
 888                         tmp = or_mask;
 889                 } else {
 890                         tmp = RREG32(reg);
 891                         tmp &= ~and_mask;
 892                         if (adev->family >= AMDGPU_FAMILY_AI)
 893                                 tmp |= (or_mask & and_mask);
 894                         else
 895                                 tmp |= or_mask;
 896                 }
 897                 WREG32(reg, tmp);
 898         }
 899 }
 900
 901 /**
 902  * amdgpu_device_pci_config_reset - reset the GPU
 903  *
 904  * @adev: amdgpu_device pointer
 905  *
 906  * Resets the GPU using the pci config reset sequence.
 907  * Only applicable to asics prior to vega10.
 908  */
 909 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 910 {
 911         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 912 }
 913
 914 /*
 915  * GPU doorbell aperture helpers function.
 916  */
 917 /**
 918  * amdgpu_device_doorbell_init - Init doorbell driver information.
 919  *
 920  * @adev: amdgpu_device pointer
 921  *
 922  * Init doorbell driver information (CIK)
 923  * Returns 0 on success, error on failure.
 924  */
 925 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 926 {
 927
 928         /* No doorbell on SI hardware generation */
 929         if (adev->asic_type < CHIP_BONAIRE) {
 930                 adev->doorbell.base = 0;
 931                 adev->doorbell.size = 0;
 932                 adev->doorbell.num_doorbells = 0;
 933                 adev->doorbell.ptr = NULL;
 934                 return 0;
 935         }
 936
 937         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 938                 return -EINVAL;
 939
 940         amdgpu_asic_init_doorbell_index(adev);
 941
 942         /* doorbell bar mapping */
 943         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 944         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 945
 946         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 947                                              adev->doorbell_index.max_assignment+1);
 948         if (adev->doorbell.num_doorbells == 0)
 949                 return -EINVAL;
 950
 951         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 952          * paging queue doorbell use the second page. The
 953          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 954          * doorbells are in the first page. So with paging queue enabled,
 955          * the max num_doorbells should + 1 page (0x400 in dword)
 956          */
 957         if (adev->asic_type >= CHIP_VEGA10)
 958                 adev->doorbell.num_doorbells += 0x400;
 959
 960         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 961                                      adev->doorbell.num_doorbells *
 962                                      sizeof(u32));
 963         if (adev->doorbell.ptr == NULL)
 964                 return -ENOMEM;
 965
 966         return 0;
 967 }
 968
/**
 * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 *
 * @adev: amdgpu_device pointer
 *
 * Tear down doorbell driver information (CIK): unmaps the doorbell
 * aperture and clears the cached mapping pointer. The remaining doorbell
 * fields (base, size, num_doorbells) are left untouched.
 */
static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
{
        iounmap(adev->doorbell.ptr);
        /* drop the stale pointer so the mapping cannot be reused by mistake */
        adev->doorbell.ptr = NULL;
}
 981
 982
 983
 984 /*
 985  * amdgpu_device_wb_*()
 986  * Writeback is the method by which the GPU updates special pages in memory
 987  * with the status of certain GPU events (fences, ring pointers,etc.).
 988  */
 989
 990 /**
 991  * amdgpu_device_wb_fini - Disable Writeback and free memory
 992  *
 993  * @adev: amdgpu_device pointer
 994  *
 995  * Disables Writeback and frees the Writeback memory (all asics).
 996  * Used at driver shutdown.
 997  */
 998 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 999 {
1000         if (adev->wb.wb_obj) {
1001                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1002                                       &adev->wb.gpu_addr,
1003                                       (void **)&adev->wb.wb);
1004                 adev->wb.wb_obj = NULL;
1005         }
1006 }
1007
1008 /**
1009  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1010  *
1011  * @adev: amdgpu_device pointer
1012  *
1013  * Initializes writeback and allocates writeback memory (all asics).
1014  * Used at driver startup.
1015  * Returns 0 on success or an -error on failure.
1016  */
1017 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1018 {
1019         int r;
1020
1021         if (adev->wb.wb_obj == NULL) {
1022                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1023                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1024                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1025                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1026                                             (void **)&adev->wb.wb);
1027                 if (r) {
1028                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1029                         return r;
1030                 }
1031
1032                 adev->wb.num_wb = AMDGPU_MAX_WB;
1033                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1034
1035                 /* clear wb memory */
1036                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1037         }
1038
1039         return 0;
1040 }
1041
1042 /**
1043  * amdgpu_device_wb_get - Allocate a wb entry
1044  *
1045  * @adev: amdgpu_device pointer
1046  * @wb: wb index
1047  *
1048  * Allocate a wb slot for use by the driver (all asics).
1049  * Returns 0 on success or -EINVAL on failure.
1050  */
1051 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1052 {
1053         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1054
1055         if (offset < adev->wb.num_wb) {
1056                 __set_bit(offset, adev->wb.used);
1057                 *wb = offset << 3; /* convert to dw offset */
1058                 return 0;
1059         } else {
1060                 return -EINVAL;
1061         }
1062 }
1063
1064 /**
1065  * amdgpu_device_wb_free - Free a wb entry
1066  *
1067  * @adev: amdgpu_device pointer
1068  * @wb: wb index
1069  *
1070  * Free a wb slot allocated for use by the driver (all asics)
1071  */
1072 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1073 {
1074         wb >>= 3;
1075         if (wb < adev->wb.num_wb)
1076                 __clear_bit(wb, adev->wb.used);
1077 }
1078
/**
 * amdgpu_device_resize_fb_bar - try to resize FB BAR
 *
 * @adev: amdgpu_device pointer
 *
 * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 * to fail, but if any of the BARs is not accessible after the resize we abort
 * driver loading by returning -ENODEV.
 * Returns 0 otherwise, including when the resize is skipped or when
 * pci_resize_resource() fails but the BARs are still usable afterwards.
 */
int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
{
        u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
        /* log2 of the needed size in MB.
         * NOTE(review): presumably the size encoding pci_resize_resource()
         * expects - confirm against the PCI core.
         */
        u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
        struct pci_bus *root;
        struct resource *res;
        unsigned i;
        u16 cmd;
        int r;

        /* Bypass for VF */
        if (amdgpu_sriov_vf(adev))
                return 0;

        /* skip if the bios has already enabled large BAR */
        if (adev->gmc.real_vram_size &&
            (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
                return 0;

        /* Check if the root BUS has 64bit memory resources */
        root = adev->pdev->bus;
        while (root->parent)
                root = root->parent;

        /* stop at the first memory window that starts above 4GB */
        pci_bus_for_each_resource(root, res, i) {
                if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
                    res->start > 0x100000000ull)
                        break;
        }

        /* Trying to resize is pointless without a root hub window above 4GB */
        if (!res)
                return 0;

        /* Disable memory decoding while we change the BAR addresses and size */
        pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
        pci_write_config_word(adev->pdev, PCI_COMMAND,
                              cmd & ~PCI_COMMAND_MEMORY);

        /* Free the VRAM and doorbell BAR, we most likely need to move both. */
        amdgpu_device_doorbell_fini(adev);
        if (adev->asic_type >= CHIP_BONAIRE)
                pci_release_resource(adev->pdev, 2);

        pci_release_resource(adev->pdev, 0);

        /* a failed resize is only logged; the BARs are reassigned below */
        r = pci_resize_resource(adev->pdev, 0, rbar_size);
        if (r == -ENOSPC)
                DRM_INFO("Not enough PCI address space for a large BAR.");
        else if (r && r != -ENOTSUPP)
                DRM_ERROR("Problem resizing BAR0 (%d).", r);

        pci_assign_unassigned_bus_resources(adev->pdev->bus);

        /* When the doorbell or fb BAR isn't available we have no chance of
         * using the device.
         * NOTE(review): on this -ENODEV path the saved PCI_COMMAND value is
         * not written back, so memory decoding stays disabled.
         */
        r = amdgpu_device_doorbell_init(adev);
        if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
                return -ENODEV;

        /* restore the original command register (re-enables decoding) */
        pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);

        return 0;
}
1153
1154 /*
 * GPU helper functions.
1156  */
1157 /**
1158  * amdgpu_device_need_post - check if the hw need post or not
1159  *
1160  * @adev: amdgpu_device pointer
1161  *
1162  * Check if the asic has been initialized (all asics) at driver startup
1163  * or post is needed if  hw reset is performed.
1164  * Returns true if need or false if not.
1165  */
1166 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1167 {
1168         uint32_t reg;
1169
1170         if (amdgpu_sriov_vf(adev))
1171                 return false;
1172
1173         if (amdgpu_passthrough(adev)) {
1174                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1175                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1176                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1177                  * vpost executed for smc version below 22.15
1178                  */
1179                 if (adev->asic_type == CHIP_FIJI) {
1180                         int err;
1181                         uint32_t fw_ver;
1182                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1183                         /* force vPost if error occured */
1184                         if (err)
1185                                 return true;
1186
1187                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1188                         if (fw_ver < 0x00160e00)
1189                                 return true;
1190                 }
1191         }
1192
1193         if (adev->has_hw_reset) {
1194                 adev->has_hw_reset = false;
1195                 return true;
1196         }
1197
1198         /* bios scratch used on CIK+ */
1199         if (adev->asic_type >= CHIP_BONAIRE)
1200                 return amdgpu_atombios_scratch_need_asic_init(adev);
1201
1202         /* check MEM_SIZE for older asics */
1203         reg = amdgpu_asic_get_config_memsize(adev);
1204
1205         if ((reg != 0) && (reg != 0xffffffff))
1206                 return false;
1207
1208         return true;
1209 }
1210
1211 /* if we get transitioned to only one device, take VGA back */
1212 /**
1213  * amdgpu_device_vga_set_decode - enable/disable vga decode
1214  *
1215  * @cookie: amdgpu_device pointer
1216  * @state: enable/disable vga decode
1217  *
1218  * Enable/disable vga decode (all asics).
1219  * Returns VGA resource flags.
1220  */
1221 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1222 {
1223         struct amdgpu_device *adev = cookie;
1224         amdgpu_asic_set_vga_state(adev, state);
1225         if (state)
1226                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1227                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1228         else
1229                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1230 }
1231
1232 /**
1233  * amdgpu_device_check_block_size - validate the vm block size
1234  *
1235  * @adev: amdgpu_device pointer
1236  *
1237  * Validates the vm block size specified via module parameter.
1238  * The vm block size defines number of bits in page table versus page directory,
1239  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1240  * page table and the remaining bits are in the page directory.
1241  */
1242 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1243 {
1244         /* defines number of bits in page table versus page directory,
1245          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1246          * page table and the remaining bits are in the page directory */
1247         if (amdgpu_vm_block_size == -1)
1248                 return;
1249
1250         if (amdgpu_vm_block_size < 9) {
1251                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1252                          amdgpu_vm_block_size);
1253                 amdgpu_vm_block_size = -1;
1254         }
1255 }
1256
1257 /**
1258  * amdgpu_device_check_vm_size - validate the vm size
1259  *
1260  * @adev: amdgpu_device pointer
1261  *
1262  * Validates the vm size in GB specified via module parameter.
1263  * The VM size is the size of the GPU virtual memory space in GB.
1264  */
1265 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1266 {
1267         /* no need to check the default value */
1268         if (amdgpu_vm_size == -1)
1269                 return;
1270
1271         if (amdgpu_vm_size < 1) {
1272                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1273                          amdgpu_vm_size);
1274                 amdgpu_vm_size = -1;
1275         }
1276 }
1277
1278 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1279 {
1280         struct sysinfo si;
1281         bool is_os_64 = (sizeof(void *) == 8);
1282         uint64_t total_memory;
1283         uint64_t dram_size_seven_GB = 0x1B8000000;
1284         uint64_t dram_size_three_GB = 0xB8000000;
1285
1286         if (amdgpu_smu_memory_pool_size == 0)
1287                 return;
1288
1289         if (!is_os_64) {
1290                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1291                 goto def_value;
1292         }
1293         si_meminfo(&si);
1294         total_memory = (uint64_t)si.totalram * si.mem_unit;
1295
1296         if ((amdgpu_smu_memory_pool_size == 1) ||
1297                 (amdgpu_smu_memory_pool_size == 2)) {
1298                 if (total_memory < dram_size_three_GB)
1299                         goto def_value1;
1300         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1301                 (amdgpu_smu_memory_pool_size == 8)) {
1302                 if (total_memory < dram_size_seven_GB)
1303                         goto def_value1;
1304         } else {
1305                 DRM_WARN("Smu memory pool size not supported\n");
1306                 goto def_value;
1307         }
1308         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1309
1310         return;
1311
1312 def_value1:
1313         DRM_WARN("No enough system memory\n");
1314 def_value:
1315         adev->pm.smu_prv_buffer_size = 0;
1316 }
1317
1318 /**
1319  * amdgpu_device_check_arguments - validate module params
1320  *
1321  * @adev: amdgpu_device pointer
1322  *
1323  * Validates certain module parameters and updates
1324  * the associated values used by the driver (all asics).
1325  */
1326 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1327 {
1328         if (amdgpu_sched_jobs < 4) {
1329                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1330                          amdgpu_sched_jobs);
1331                 amdgpu_sched_jobs = 4;
1332         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1333                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1334                          amdgpu_sched_jobs);
1335                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1336         }
1337
1338         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1339                 /* gart size must be greater or equal to 32M */
1340                 dev_warn(adev->dev, "gart size (%d) too small\n",
1341                          amdgpu_gart_size);
1342                 amdgpu_gart_size = -1;
1343         }
1344
1345         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1346                 /* gtt size must be greater or equal to 32M */
1347                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1348                                  amdgpu_gtt_size);
1349                 amdgpu_gtt_size = -1;
1350         }
1351
1352         /* valid range is between 4 and 9 inclusive */
1353         if (amdgpu_vm_fragment_size != -1 &&
1354             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1355                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1356                 amdgpu_vm_fragment_size = -1;
1357         }
1358
1359         if (amdgpu_sched_hw_submission < 2) {
1360                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1361                          amdgpu_sched_hw_submission);
1362                 amdgpu_sched_hw_submission = 2;
1363         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1364                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1365                          amdgpu_sched_hw_submission);
1366                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1367         }
1368
1369         amdgpu_device_check_smu_prv_buffer_size(adev);
1370
1371         amdgpu_device_check_vm_size(adev);
1372
1373         amdgpu_device_check_block_size(adev);
1374
1375         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1376
1377         amdgpu_gmc_tmz_set(adev);
1378
1379         amdgpu_gmc_noretry_set(adev);
1380
1381         return 0;
1382 }
1383
1384 /**
1385  * amdgpu_switcheroo_set_state - set switcheroo state
1386  *
1387  * @pdev: pci dev pointer
1388  * @state: vga_switcheroo state
1389  *
1390  * Callback for the switcheroo driver.  Suspends or resumes the
1391  * the asics before or after it is powered up using ACPI methods.
1392  */
1393 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1394                                         enum vga_switcheroo_state state)
1395 {
1396         struct drm_device *dev = pci_get_drvdata(pdev);
1397         int r;
1398
1399         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1400                 return;
1401
1402         if (state == VGA_SWITCHEROO_ON) {
1403                 pr_info("switched on\n");
1404                 /* don't suspend or resume card normally */
1405                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1406
1407                 pci_set_power_state(dev->pdev, PCI_D0);
1408                 amdgpu_device_load_pci_state(dev->pdev);
1409                 r = pci_enable_device(dev->pdev);
1410                 if (r)
1411                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1412                 amdgpu_device_resume(dev, true);
1413
1414                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1415                 drm_kms_helper_poll_enable(dev);
1416         } else {
1417                 pr_info("switched off\n");
1418                 drm_kms_helper_poll_disable(dev);
1419                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1420                 amdgpu_device_suspend(dev, true);
1421                 amdgpu_device_cache_pci_state(dev->pdev);
1422                 /* Shut down the device */
1423                 pci_disable_device(dev->pdev);
1424                 pci_set_power_state(dev->pdev, PCI_D3cold);
1425                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1426         }
1427 }
1428
1429 /**
1430  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1431  *
1432  * @pdev: pci dev pointer
1433  *
1434  * Callback for the switcheroo driver.  Check of the switcheroo
1435  * state can be changed.
1436  * Returns true if the state can be changed, false if not.
1437  */
1438 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1439 {
1440         struct drm_device *dev = pci_get_drvdata(pdev);
1441
1442         /*
1443         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1444         * locking inversion with the driver load path. And the access here is
1445         * completely racy anyway. So don't bother with locking for now.
1446         */
1447         return atomic_read(&dev->open_count) == 0;
1448 }
1449
1450 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1451         .set_gpu_state = amdgpu_switcheroo_set_state,
1452         .reprobe = NULL,
1453         .can_switch = amdgpu_switcheroo_can_switch,
1454 };
1455
1456 /**
1457  * amdgpu_device_ip_set_clockgating_state - set the CG state
1458  *
1459  * @dev: amdgpu_device pointer
1460  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1461  * @state: clockgating state (gate or ungate)
1462  *
1463  * Sets the requested clockgating state for all instances of
1464  * the hardware IP specified.
1465  * Returns the error code from the last instance.
1466  */
1467 int amdgpu_device_ip_set_clockgating_state(void *dev,
1468                                            enum amd_ip_block_type block_type,
1469                                            enum amd_clockgating_state state)
1470 {
1471         struct amdgpu_device *adev = dev;
1472         int i, r = 0;
1473
1474         for (i = 0; i < adev->num_ip_blocks; i++) {
1475                 if (!adev->ip_blocks[i].status.valid)
1476                         continue;
1477                 if (adev->ip_blocks[i].version->type != block_type)
1478                         continue;
1479                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1480                         continue;
1481                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1482                         (void *)adev, state);
1483                 if (r)
1484                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1485                                   adev->ip_blocks[i].version->funcs->name, r);
1486         }
1487         return r;
1488 }
1489
1490 /**
1491  * amdgpu_device_ip_set_powergating_state - set the PG state
1492  *
1493  * @dev: amdgpu_device pointer
1494  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1495  * @state: powergating state (gate or ungate)
1496  *
1497  * Sets the requested powergating state for all instances of
1498  * the hardware IP specified.
1499  * Returns the error code from the last instance.
1500  */
1501 int amdgpu_device_ip_set_powergating_state(void *dev,
1502                                            enum amd_ip_block_type block_type,
1503                                            enum amd_powergating_state state)
1504 {
1505         struct amdgpu_device *adev = dev;
1506         int i, r = 0;
1507
1508         for (i = 0; i < adev->num_ip_blocks; i++) {
1509                 if (!adev->ip_blocks[i].status.valid)
1510                         continue;
1511                 if (adev->ip_blocks[i].version->type != block_type)
1512                         continue;
1513                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1514                         continue;
1515                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1516                         (void *)adev, state);
1517                 if (r)
1518                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1519                                   adev->ip_blocks[i].version->funcs->name, r);
1520         }
1521         return r;
1522 }
1523
1524 /**
1525  * amdgpu_device_ip_get_clockgating_state - get the CG state
1526  *
1527  * @adev: amdgpu_device pointer
1528  * @flags: clockgating feature flags
1529  *
1530  * Walks the list of IPs on the device and updates the clockgating
1531  * flags for each IP.
1532  * Updates @flags with the feature flags for each hardware IP where
1533  * clockgating is enabled.
1534  */
1535 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1536                                             u32 *flags)
1537 {
1538         int i;
1539
1540         for (i = 0; i < adev->num_ip_blocks; i++) {
1541                 if (!adev->ip_blocks[i].status.valid)
1542                         continue;
1543                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1544                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1545         }
1546 }
1547
1548 /**
1549  * amdgpu_device_ip_wait_for_idle - wait for idle
1550  *
1551  * @adev: amdgpu_device pointer
1552  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1553  *
1554  * Waits for the request hardware IP to be idle.
1555  * Returns 0 for success or a negative error code on failure.
1556  */
1557 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1558                                    enum amd_ip_block_type block_type)
1559 {
1560         int i, r;
1561
1562         for (i = 0; i < adev->num_ip_blocks; i++) {
1563                 if (!adev->ip_blocks[i].status.valid)
1564                         continue;
1565                 if (adev->ip_blocks[i].version->type == block_type) {
1566                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1567                         if (r)
1568                                 return r;
1569                         break;
1570                 }
1571         }
1572         return 0;
1573
1574 }
1575
1576 /**
1577  * amdgpu_device_ip_is_idle - is the hardware IP idle
1578  *
1579  * @adev: amdgpu_device pointer
1580  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1581  *
1582  * Check if the hardware IP is idle or not.
1583  * Returns true if it the IP is idle, false if not.
1584  */
1585 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1586                               enum amd_ip_block_type block_type)
1587 {
1588         int i;
1589
1590         for (i = 0; i < adev->num_ip_blocks; i++) {
1591                 if (!adev->ip_blocks[i].status.valid)
1592                         continue;
1593                 if (adev->ip_blocks[i].version->type == block_type)
1594                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1595         }
1596         return true;
1597
1598 }
1599
1600 /**
1601  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1602  *
1603  * @adev: amdgpu_device pointer
1604  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1605  *
1606  * Returns a pointer to the hardware IP block structure
1607  * if it exists for the asic, otherwise NULL.
1608  */
1609 struct amdgpu_ip_block *
1610 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1611                               enum amd_ip_block_type type)
1612 {
1613         int i;
1614
1615         for (i = 0; i < adev->num_ip_blocks; i++)
1616                 if (adev->ip_blocks[i].version->type == type)
1617                         return &adev->ip_blocks[i];
1618
1619         return NULL;
1620 }
1621
1622 /**
1623  * amdgpu_device_ip_block_version_cmp
1624  *
1625  * @adev: amdgpu_device pointer
1626  * @type: enum amd_ip_block_type
1627  * @major: major version
1628  * @minor: minor version
1629  *
1630  * return 0 if equal or greater
1631  * return 1 if smaller or the ip_block doesn't exist
1632  */
1633 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1634                                        enum amd_ip_block_type type,
1635                                        u32 major, u32 minor)
1636 {
1637         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1638
1639         if (ip_block && ((ip_block->version->major > major) ||
1640                         ((ip_block->version->major == major) &&
1641                         (ip_block->version->minor >= minor))))
1642                 return 0;
1643
1644         return 1;
1645 }
1646
1647 /**
1648  * amdgpu_device_ip_block_add
1649  *
1650  * @adev: amdgpu_device pointer
1651  * @ip_block_version: pointer to the IP to add
1652  *
1653  * Adds the IP block driver information to the collection of IPs
1654  * on the asic.
1655  */
1656 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1657                                const struct amdgpu_ip_block_version *ip_block_version)
1658 {
1659         if (!ip_block_version)
1660                 return -EINVAL;
1661
1662         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1663                   ip_block_version->funcs->name);
1664
1665         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1666
1667         return 0;
1668 }
1669
1670 /**
1671  * amdgpu_device_enable_virtual_display - enable virtual display feature
1672  *
1673  * @adev: amdgpu_device pointer
1674  *
1675  * Enabled the virtual display feature if the user has enabled it via
1676  * the module parameter virtual_display.  This feature provides a virtual
1677  * display hardware on headless boards or in virtualized environments.
1678  * This function parses and validates the configuration string specified by
1679  * the user and configues the virtual display configuration (number of
1680  * virtual connectors, crtcs, etc.) specified.
1681  */
1682 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1683 {
1684         adev->enable_virtual_display = false;
1685
1686         if (amdgpu_virtual_display) {
1687                 struct drm_device *ddev = adev_to_drm(adev);
1688                 const char *pci_address_name = pci_name(ddev->pdev);
1689                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1690
1691                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1692                 pciaddstr_tmp = pciaddstr;
1693                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1694                         pciaddname = strsep(&pciaddname_tmp, ",");
1695                         if (!strcmp("all", pciaddname)
1696                             || !strcmp(pci_address_name, pciaddname)) {
1697                                 long num_crtc;
1698                                 int res = -1;
1699
1700                                 adev->enable_virtual_display = true;
1701
1702                                 if (pciaddname_tmp)
1703                                         res = kstrtol(pciaddname_tmp, 10,
1704                                                       &num_crtc);
1705
1706                                 if (!res) {
1707                                         if (num_crtc < 1)
1708                                                 num_crtc = 1;
1709                                         if (num_crtc > 6)
1710                                                 num_crtc = 6;
1711                                         adev->mode_info.num_crtc = num_crtc;
1712                                 } else {
1713                                         adev->mode_info.num_crtc = 1;
1714                                 }
1715                                 break;
1716                         }
1717                 }
1718
1719                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1720                          amdgpu_virtual_display, pci_address_name,
1721                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1722
1723                 kfree(pciaddstr);
1724         }
1725 }
1726
1727 /**
1728  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1729  *
1730  * @adev: amdgpu_device pointer
1731  *
1732  * Parses the asic configuration parameters specified in the gpu info
 * firmware and makes them available to the driver for use in configuring
1734  * the asic.
1735  * Returns 0 on success, -EINVAL on failure.
1736  */
1737 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1738 {
1739         const char *chip_name;
1740         char fw_name[40];
1741         int err;
1742         const struct gpu_info_firmware_header_v1_0 *hdr;
1743
1744         adev->firmware.gpu_info_fw = NULL;
1745
1746         if (adev->mman.discovery_bin) {
1747                 amdgpu_discovery_get_gfx_info(adev);
1748
1749                 /*
1750                  * FIXME: The bounding box is still needed by Navi12, so
1751                  * temporarily read it from gpu_info firmware. Should be droped
1752                  * when DAL no longer needs it.
1753                  */
1754                 if (adev->asic_type != CHIP_NAVI12)
1755                         return 0;
1756         }
1757
1758         switch (adev->asic_type) {
1759 #ifdef CONFIG_DRM_AMDGPU_SI
1760         case CHIP_VERDE:
1761         case CHIP_TAHITI:
1762         case CHIP_PITCAIRN:
1763         case CHIP_OLAND:
1764         case CHIP_HAINAN:
1765 #endif
1766 #ifdef CONFIG_DRM_AMDGPU_CIK
1767         case CHIP_BONAIRE:
1768         case CHIP_HAWAII:
1769         case CHIP_KAVERI:
1770         case CHIP_KABINI:
1771         case CHIP_MULLINS:
1772 #endif
1773         case CHIP_TOPAZ:
1774         case CHIP_TONGA:
1775         case CHIP_FIJI:
1776         case CHIP_POLARIS10:
1777         case CHIP_POLARIS11:
1778         case CHIP_POLARIS12:
1779         case CHIP_VEGAM:
1780         case CHIP_CARRIZO:
1781         case CHIP_STONEY:
1782         case CHIP_VEGA20:
1783         case CHIP_SIENNA_CICHLID:
1784         case CHIP_NAVY_FLOUNDER:
1785         case CHIP_DIMGREY_CAVEFISH:
1786         default:
1787                 return 0;
1788         case CHIP_VEGA10:
1789                 chip_name = "vega10";
1790                 break;
1791         case CHIP_VEGA12:
1792                 chip_name = "vega12";
1793                 break;
1794         case CHIP_RAVEN:
1795                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1796                         chip_name = "raven2";
1797                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1798                         chip_name = "picasso";
1799                 else
1800                         chip_name = "raven";
1801                 break;
1802         case CHIP_ARCTURUS:
1803                 chip_name = "arcturus";
1804                 break;
1805         case CHIP_RENOIR:
1806                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1807                         chip_name = "renoir";
1808                 else
1809                         chip_name = "green_sardine";
1810                 break;
1811         case CHIP_NAVI10:
1812                 chip_name = "navi10";
1813                 break;
1814         case CHIP_NAVI14:
1815                 chip_name = "navi14";
1816                 break;
1817         case CHIP_NAVI12:
1818                 chip_name = "navi12";
1819                 break;
1820         case CHIP_VANGOGH:
1821                 chip_name = "vangogh";
1822                 break;
1823         }
1824
1825         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1826         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1827         if (err) {
1828                 dev_err(adev->dev,
1829                         "Failed to load gpu_info firmware \"%s\"\n",
1830                         fw_name);
1831                 goto out;
1832         }
1833         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1834         if (err) {
1835                 dev_err(adev->dev,
1836                         "Failed to validate gpu_info firmware \"%s\"\n",
1837                         fw_name);
1838                 goto out;
1839         }
1840
1841         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1842         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1843
1844         switch (hdr->version_major) {
1845         case 1:
1846         {
1847                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1848                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1849                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1850
1851                 /*
1852                  * Should be droped when DAL no longer needs it.
1853                  */
1854                 if (adev->asic_type == CHIP_NAVI12)
1855                         goto parse_soc_bounding_box;
1856
1857                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1858                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1859                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1860                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1861                 adev->gfx.config.max_texture_channel_caches =
1862                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1863                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1864                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1865                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1866                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1867                 adev->gfx.config.double_offchip_lds_buf =
1868                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1869                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1870                 adev->gfx.cu_info.max_waves_per_simd =
1871                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1872                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1873                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1874                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1875                 if (hdr->version_minor >= 1) {
1876                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1877                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1878                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1879                         adev->gfx.config.num_sc_per_sh =
1880                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1881                         adev->gfx.config.num_packer_per_sc =
1882                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1883                 }
1884
1885 parse_soc_bounding_box:
1886                 /*
1887                  * soc bounding box info is not integrated in disocovery table,
1888                  * we always need to parse it from gpu info firmware if needed.
1889                  */
1890                 if (hdr->version_minor == 2) {
1891                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1892                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1893                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1894                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1895                 }
1896                 break;
1897         }
1898         default:
1899                 dev_err(adev->dev,
1900                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1901                 err = -EINVAL;
1902                 goto out;
1903         }
1904 out:
1905         return err;
1906 }
1907
1908 /**
1909  * amdgpu_device_ip_early_init - run early init for hardware IPs
1910  *
1911  * @adev: amdgpu_device pointer
1912  *
1913  * Early initialization pass for hardware IPs.  The hardware IPs that make
1914  * up each asic are discovered each IP's early_init callback is run.  This
1915  * is the first stage in initializing the asic.
1916  * Returns 0 on success, negative error code on failure.
1917  */
1918 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1919 {
1920         int i, r;
1921
1922         amdgpu_device_enable_virtual_display(adev);
1923
1924         if (amdgpu_sriov_vf(adev)) {
1925                 r = amdgpu_virt_request_full_gpu(adev, true);
1926                 if (r)
1927                         return r;
1928         }
1929
1930         switch (adev->asic_type) {
1931 #ifdef CONFIG_DRM_AMDGPU_SI
1932         case CHIP_VERDE:
1933         case CHIP_TAHITI:
1934         case CHIP_PITCAIRN:
1935         case CHIP_OLAND:
1936         case CHIP_HAINAN:
1937                 adev->family = AMDGPU_FAMILY_SI;
1938                 r = si_set_ip_blocks(adev);
1939                 if (r)
1940                         return r;
1941                 break;
1942 #endif
1943 #ifdef CONFIG_DRM_AMDGPU_CIK
1944         case CHIP_BONAIRE:
1945         case CHIP_HAWAII:
1946         case CHIP_KAVERI:
1947         case CHIP_KABINI:
1948         case CHIP_MULLINS:
1949                 if (adev->flags & AMD_IS_APU)
1950                         adev->family = AMDGPU_FAMILY_KV;
1951                 else
1952                         adev->family = AMDGPU_FAMILY_CI;
1953
1954                 r = cik_set_ip_blocks(adev);
1955                 if (r)
1956                         return r;
1957                 break;
1958 #endif
1959         case CHIP_TOPAZ:
1960         case CHIP_TONGA:
1961         case CHIP_FIJI:
1962         case CHIP_POLARIS10:
1963         case CHIP_POLARIS11:
1964         case CHIP_POLARIS12:
1965         case CHIP_VEGAM:
1966         case CHIP_CARRIZO:
1967         case CHIP_STONEY:
1968                 if (adev->flags & AMD_IS_APU)
1969                         adev->family = AMDGPU_FAMILY_CZ;
1970                 else
1971                         adev->family = AMDGPU_FAMILY_VI;
1972
1973                 r = vi_set_ip_blocks(adev);
1974                 if (r)
1975                         return r;
1976                 break;
1977         case CHIP_VEGA10:
1978         case CHIP_VEGA12:
1979         case CHIP_VEGA20:
1980         case CHIP_RAVEN:
1981         case CHIP_ARCTURUS:
1982         case CHIP_RENOIR:
1983                 if (adev->flags & AMD_IS_APU)
1984                         adev->family = AMDGPU_FAMILY_RV;
1985                 else
1986                         adev->family = AMDGPU_FAMILY_AI;
1987
1988                 r = soc15_set_ip_blocks(adev);
1989                 if (r)
1990                         return r;
1991                 break;
1992         case  CHIP_NAVI10:
1993         case  CHIP_NAVI14:
1994         case  CHIP_NAVI12:
1995         case  CHIP_SIENNA_CICHLID:
1996         case  CHIP_NAVY_FLOUNDER:
1997         case  CHIP_DIMGREY_CAVEFISH:
1998         case CHIP_VANGOGH:
1999                 if (adev->asic_type == CHIP_VANGOGH)
2000                         adev->family = AMDGPU_FAMILY_VGH;
2001                 else
2002                         adev->family = AMDGPU_FAMILY_NV;
2003
2004                 r = nv_set_ip_blocks(adev);
2005                 if (r)
2006                         return r;
2007                 break;
2008         default:
2009                 /* FIXME: not supported yet */
2010                 return -EINVAL;
2011         }
2012
2013         amdgpu_amdkfd_device_probe(adev);
2014
2015         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2016         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2017                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2018
2019         for (i = 0; i < adev->num_ip_blocks; i++) {
2020                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2021                         DRM_ERROR("disabled ip block: %d <%s>\n",
2022                                   i, adev->ip_blocks[i].version->funcs->name);
2023                         adev->ip_blocks[i].status.valid = false;
2024                 } else {
2025                         if (adev->ip_blocks[i].version->funcs->early_init) {
2026                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2027                                 if (r == -ENOENT) {
2028                                         adev->ip_blocks[i].status.valid = false;
2029                                 } else if (r) {
2030                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2031                                                   adev->ip_blocks[i].version->funcs->name, r);
2032                                         return r;
2033                                 } else {
2034                                         adev->ip_blocks[i].status.valid = true;
2035                                 }
2036                         } else {
2037                                 adev->ip_blocks[i].status.valid = true;
2038                         }
2039                 }
2040                 /* get the vbios after the asic_funcs are set up */
2041                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2042                         r = amdgpu_device_parse_gpu_info_fw(adev);
2043                         if (r)
2044                                 return r;
2045
2046                         /* Read BIOS */
2047                         if (!amdgpu_get_bios(adev))
2048                                 return -EINVAL;
2049
2050                         r = amdgpu_atombios_init(adev);
2051                         if (r) {
2052                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2053                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2054                                 return r;
2055                         }
2056                 }
2057         }
2058
2059         adev->cg_flags &= amdgpu_cg_mask;
2060         adev->pg_flags &= amdgpu_pg_mask;
2061
2062         return 0;
2063 }
2064
2065 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2066 {
2067         int i, r;
2068
2069         for (i = 0; i < adev->num_ip_blocks; i++) {
2070                 if (!adev->ip_blocks[i].status.sw)
2071                         continue;
2072                 if (adev->ip_blocks[i].status.hw)
2073                         continue;
2074                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2075                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2076                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2077                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2078                         if (r) {
2079                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2080                                           adev->ip_blocks[i].version->funcs->name, r);
2081                                 return r;
2082                         }
2083                         adev->ip_blocks[i].status.hw = true;
2084                 }
2085         }
2086
2087         return 0;
2088 }
2089
2090 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2091 {
2092         int i, r;
2093
2094         for (i = 0; i < adev->num_ip_blocks; i++) {
2095                 if (!adev->ip_blocks[i].status.sw)
2096                         continue;
2097                 if (adev->ip_blocks[i].status.hw)
2098                         continue;
2099                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2100                 if (r) {
2101                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2102                                   adev->ip_blocks[i].version->funcs->name, r);
2103                         return r;
2104                 }
2105                 adev->ip_blocks[i].status.hw = true;
2106         }
2107
2108         return 0;
2109 }
2110
2111 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2112 {
2113         int r = 0;
2114         int i;
2115         uint32_t smu_version;
2116
2117         if (adev->asic_type >= CHIP_VEGA10) {
2118                 for (i = 0; i < adev->num_ip_blocks; i++) {
2119                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2120                                 continue;
2121
2122                         /* no need to do the fw loading again if already done*/
2123                         if (adev->ip_blocks[i].status.hw == true)
2124                                 break;
2125
2126                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2127                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2128                                 if (r) {
2129                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2130                                                           adev->ip_blocks[i].version->funcs->name, r);
2131                                         return r;
2132                                 }
2133                         } else {
2134                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2135                                 if (r) {
2136                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2137                                                           adev->ip_blocks[i].version->funcs->name, r);
2138                                         return r;
2139                                 }
2140                         }
2141
2142                         adev->ip_blocks[i].status.hw = true;
2143                         break;
2144                 }
2145         }
2146
2147         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2148                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2149
2150         return r;
2151 }
2152
2153 /**
2154  * amdgpu_device_ip_init - run init for hardware IPs
2155  *
2156  * @adev: amdgpu_device pointer
2157  *
2158  * Main initialization pass for hardware IPs.  The list of all the hardware
2159  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2160  * are run.  sw_init initializes the software state associated with each IP
2161  * and hw_init initializes the hardware associated with each IP.
2162  * Returns 0 on success, negative error code on failure.
2163  */
2164 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2165 {
2166         int i, r;
2167
2168         r = amdgpu_ras_init(adev);
2169         if (r)
2170                 return r;
2171
2172         for (i = 0; i < adev->num_ip_blocks; i++) {
2173                 if (!adev->ip_blocks[i].status.valid)
2174                         continue;
2175                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2176                 if (r) {
2177                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2178                                   adev->ip_blocks[i].version->funcs->name, r);
2179                         goto init_failed;
2180                 }
2181                 adev->ip_blocks[i].status.sw = true;
2182
2183                 /* need to do gmc hw init early so we can allocate gpu mem */
2184                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2185                         r = amdgpu_device_vram_scratch_init(adev);
2186                         if (r) {
2187                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2188                                 goto init_failed;
2189                         }
2190                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2191                         if (r) {
2192                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2193                                 goto init_failed;
2194                         }
2195                         r = amdgpu_device_wb_init(adev);
2196                         if (r) {
2197                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2198                                 goto init_failed;
2199                         }
2200                         adev->ip_blocks[i].status.hw = true;
2201
2202                         /* right after GMC hw init, we create CSA */
2203                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2204                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2205                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2206                                                                 AMDGPU_CSA_SIZE);
2207                                 if (r) {
2208                                         DRM_ERROR("allocate CSA failed %d\n", r);
2209                                         goto init_failed;
2210                                 }
2211                         }
2212                 }
2213         }
2214
2215         if (amdgpu_sriov_vf(adev))
2216                 amdgpu_virt_init_data_exchange(adev);
2217
2218         r = amdgpu_ib_pool_init(adev);
2219         if (r) {
2220                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2221                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2222                 goto init_failed;
2223         }
2224
2225         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2226         if (r)
2227                 goto init_failed;
2228
2229         r = amdgpu_device_ip_hw_init_phase1(adev);
2230         if (r)
2231                 goto init_failed;
2232
2233         r = amdgpu_device_fw_loading(adev);
2234         if (r)
2235                 goto init_failed;
2236
2237         r = amdgpu_device_ip_hw_init_phase2(adev);
2238         if (r)
2239                 goto init_failed;
2240
2241         /*
2242          * retired pages will be loaded from eeprom and reserved here,
2243          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2244          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2245          * for I2C communication which only true at this point.
2246          *
2247          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2248          * failure from bad gpu situation and stop amdgpu init process
2249          * accordingly. For other failed cases, it will still release all
2250          * the resource and print error message, rather than returning one
2251          * negative value to upper level.
2252          *
2253          * Note: theoretically, this should be called before all vram allocations
2254          * to protect retired page from abusing
2255          */
2256         r = amdgpu_ras_recovery_init(adev);
2257         if (r)
2258                 goto init_failed;
2259
2260         if (adev->gmc.xgmi.num_physical_nodes > 1)
2261                 amdgpu_xgmi_add_device(adev);
2262         amdgpu_amdkfd_device_init(adev);
2263
2264         amdgpu_fru_get_product_info(adev);
2265
2266 init_failed:
2267         if (amdgpu_sriov_vf(adev))
2268                 amdgpu_virt_release_full_gpu(adev, true);
2269
2270         return r;
2271 }
2272
2273 /**
2274  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2275  *
2276  * @adev: amdgpu_device pointer
2277  *
2278  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2279  * this function before a GPU reset.  If the value is retained after a
2280  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2281  */
2282 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2283 {
2284         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2285 }
2286
2287 /**
2288  * amdgpu_device_check_vram_lost - check if vram is valid
2289  *
2290  * @adev: amdgpu_device pointer
2291  *
2292  * Checks the reset magic value written to the gart pointer in VRAM.
2293  * The driver calls this after a GPU reset to see if the contents of
2294  * VRAM is lost or now.
2295  * returns true if vram is lost, false if not.
2296  */
2297 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2298 {
2299         if (memcmp(adev->gart.ptr, adev->reset_magic,
2300                         AMDGPU_RESET_MAGIC_NUM))
2301                 return true;
2302
2303         if (!amdgpu_in_reset(adev))
2304                 return false;
2305
2306         /*
2307          * For all ASICs with baco/mode1 reset, the VRAM is
2308          * always assumed to be lost.
2309          */
2310         switch (amdgpu_asic_reset_method(adev)) {
2311         case AMD_RESET_METHOD_BACO:
2312         case AMD_RESET_METHOD_MODE1:
2313                 return true;
2314         default:
2315                 return false;
2316         }
2317 }
2318
2319 /**
2320  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2321  *
2322  * @adev: amdgpu_device pointer
2323  * @state: clockgating state (gate or ungate)
2324  *
2325  * The list of all the hardware IPs that make up the asic is walked and the
2326  * set_clockgating_state callbacks are run.
2327  * Late initialization pass enabling clockgating for hardware IPs.
2328  * Fini or suspend, pass disabling clockgating for hardware IPs.
2329  * Returns 0 on success, negative error code on failure.
2330  */
2331
2332 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2333                                                 enum amd_clockgating_state state)
2334 {
2335         int i, j, r;
2336
2337         if (amdgpu_emu_mode == 1)
2338                 return 0;
2339
2340         for (j = 0; j < adev->num_ip_blocks; j++) {
2341                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2342                 if (!adev->ip_blocks[i].status.late_initialized)
2343                         continue;
2344                 /* skip CG for VCE/UVD, it's handled specially */
2345                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2346                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2347                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2348                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2349                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2350                         /* enable clockgating to save power */
2351                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2352                                                                                      state);
2353                         if (r) {
2354                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2355                                           adev->ip_blocks[i].version->funcs->name, r);
2356                                 return r;
2357                         }
2358                 }
2359         }
2360
2361         return 0;
2362 }
2363
2364 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2365 {
2366         int i, j, r;
2367
2368         if (amdgpu_emu_mode == 1)
2369                 return 0;
2370
2371         for (j = 0; j < adev->num_ip_blocks; j++) {
2372                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2373                 if (!adev->ip_blocks[i].status.late_initialized)
2374                         continue;
2375                 /* skip CG for VCE/UVD, it's handled specially */
2376                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2377                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2378                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2379                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2380                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2381                         /* enable powergating to save power */
2382                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2383                                                                                         state);
2384                         if (r) {
2385                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2386                                           adev->ip_blocks[i].version->funcs->name, r);
2387                                 return r;
2388                         }
2389                 }
2390         }
2391         return 0;
2392 }
2393
2394 static int amdgpu_device_enable_mgpu_fan_boost(void)
2395 {
2396         struct amdgpu_gpu_instance *gpu_ins;
2397         struct amdgpu_device *adev;
2398         int i, ret = 0;
2399
2400         mutex_lock(&mgpu_info.mutex);
2401
2402         /*
2403          * MGPU fan boost feature should be enabled
2404          * only when there are two or more dGPUs in
2405          * the system
2406          */
2407         if (mgpu_info.num_dgpu < 2)
2408                 goto out;
2409
2410         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2411                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2412                 adev = gpu_ins->adev;
2413                 if (!(adev->flags & AMD_IS_APU) &&
2414                     !gpu_ins->mgpu_fan_enabled) {
2415                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2416                         if (ret)
2417                                 break;
2418
2419                         gpu_ins->mgpu_fan_enabled = 1;
2420                 }
2421         }
2422
2423 out:
2424         mutex_unlock(&mgpu_info.mutex);
2425
2426         return ret;
2427 }
2428
2429 /**
2430  * amdgpu_device_ip_late_init - run late init for hardware IPs
2431  *
2432  * @adev: amdgpu_device pointer
2433  *
2434  * Late initialization pass for hardware IPs.  The list of all the hardware
2435  * IPs that make up the asic is walked and the late_init callbacks are run.
2436  * late_init covers any special initialization that an IP requires
2437  * after all of the have been initialized or something that needs to happen
2438  * late in the init process.
2439  * Returns 0 on success, negative error code on failure.
2440  */
2441 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2442 {
2443         struct amdgpu_gpu_instance *gpu_instance;
2444         int i = 0, r;
2445
2446         for (i = 0; i < adev->num_ip_blocks; i++) {
2447                 if (!adev->ip_blocks[i].status.hw)
2448                         continue;
2449                 if (adev->ip_blocks[i].version->funcs->late_init) {
2450                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2451                         if (r) {
2452                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2453                                           adev->ip_blocks[i].version->funcs->name, r);
2454                                 return r;
2455                         }
2456                 }
2457                 adev->ip_blocks[i].status.late_initialized = true;
2458         }
2459
2460         amdgpu_ras_set_error_query_ready(adev, true);
2461
2462         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2463         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2464
2465         amdgpu_device_fill_reset_magic(adev);
2466
2467         r = amdgpu_device_enable_mgpu_fan_boost();
2468         if (r)
2469                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2470
2471
2472         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2473                 mutex_lock(&mgpu_info.mutex);
2474
2475                 /*
2476                  * Reset device p-state to low as this was booted with high.
2477                  *
2478                  * This should be performed only after all devices from the same
2479                  * hive get initialized.
2480                  *
2481                  * However, it's unknown how many device in the hive in advance.
2482                  * As this is counted one by one during devices initializations.
2483                  *
2484                  * So, we wait for all XGMI interlinked devices initialized.
2485                  * This may bring some delays as those devices may come from
2486                  * different hives. But that should be OK.
2487                  */
2488                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2489                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2490                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2491                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2492                                         continue;
2493
2494                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2495                                                 AMDGPU_XGMI_PSTATE_MIN);
2496                                 if (r) {
2497                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2498                                         break;
2499                                 }
2500                         }
2501                 }
2502
2503                 mutex_unlock(&mgpu_info.mutex);
2504         }
2505
2506         return 0;
2507 }
2508
2509 /**
2510  * amdgpu_device_ip_fini - run fini for hardware IPs
2511  *
2512  * @adev: amdgpu_device pointer
2513  *
2514  * Main teardown pass for hardware IPs.  The list of all the hardware
2515  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2516  * are run.  hw_fini tears down the hardware associated with each IP
2517  * and sw_fini tears down any software state associated with each IP.
2518  * Returns 0 on success, negative error code on failure.
2519  */
2520 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2521 {
2522         int i, r;
2523
2524         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2525                 amdgpu_virt_release_ras_err_handler_data(adev);
2526
2527         amdgpu_ras_pre_fini(adev);
2528
2529         if (adev->gmc.xgmi.num_physical_nodes > 1)
2530                 amdgpu_xgmi_remove_device(adev);
2531
2532         amdgpu_amdkfd_device_fini(adev);
2533
2534         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2535         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2536
2537         /* need to disable SMC first */
2538         for (i = 0; i < adev->num_ip_blocks; i++) {
2539                 if (!adev->ip_blocks[i].status.hw)
2540                         continue;
2541                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2542                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2543                         /* XXX handle errors */
2544                         if (r) {
2545                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2546                                           adev->ip_blocks[i].version->funcs->name, r);
2547                         }
2548                         adev->ip_blocks[i].status.hw = false;
2549                         break;
2550                 }
2551         }
2552
2553         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2554                 if (!adev->ip_blocks[i].status.hw)
2555                         continue;
2556
2557                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2558                 /* XXX handle errors */
2559                 if (r) {
2560                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2561                                   adev->ip_blocks[i].version->funcs->name, r);
2562                 }
2563
2564                 adev->ip_blocks[i].status.hw = false;
2565         }
2566
2567
2568         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2569                 if (!adev->ip_blocks[i].status.sw)
2570                         continue;
2571
2572                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2573                         amdgpu_ucode_free_bo(adev);
2574                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2575                         amdgpu_device_wb_fini(adev);
2576                         amdgpu_device_vram_scratch_fini(adev);
2577                         amdgpu_ib_pool_fini(adev);
2578                 }
2579
2580                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2581                 /* XXX handle errors */
2582                 if (r) {
2583                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2584                                   adev->ip_blocks[i].version->funcs->name, r);
2585                 }
2586                 adev->ip_blocks[i].status.sw = false;
2587                 adev->ip_blocks[i].status.valid = false;
2588         }
2589
2590         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2591                 if (!adev->ip_blocks[i].status.late_initialized)
2592                         continue;
2593                 if (adev->ip_blocks[i].version->funcs->late_fini)
2594                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2595                 adev->ip_blocks[i].status.late_initialized = false;
2596         }
2597
2598         amdgpu_ras_fini(adev);
2599
2600         if (amdgpu_sriov_vf(adev))
2601                 if (amdgpu_virt_release_full_gpu(adev, false))
2602                         DRM_ERROR("failed to release exclusive mode on fini\n");
2603
2604         return 0;
2605 }
2606
2607 /**
2608  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2609  *
2610  * @work: work_struct.
2611  */
2612 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2613 {
2614         struct amdgpu_device *adev =
2615                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2616         int r;
2617
2618         r = amdgpu_ib_ring_tests(adev);
2619         if (r)
2620                 DRM_ERROR("ib ring test failed (%d).\n", r);
2621 }
2622
2623 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2624 {
2625         struct amdgpu_device *adev =
2626                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2627
2628         mutex_lock(&adev->gfx.gfx_off_mutex);
2629         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2630                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2631                         adev->gfx.gfx_off_state = true;
2632         }
2633         mutex_unlock(&adev->gfx.gfx_off_mutex);
2634 }
2635
2636 /**
2637  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2638  *
2639  * @adev: amdgpu_device pointer
2640  *
2641  * Main suspend function for hardware IPs.  The list of all the hardware
2642  * IPs that make up the asic is walked, clockgating is disabled and the
2643  * suspend callbacks are run.  suspend puts the hardware and software state
2644  * in each IP into a state suitable for suspend.
2645  * Returns 0 on success, negative error code on failure.
2646  */
2647 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2648 {
2649         int i, r;
2650
2651         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2652         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2653
2654         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2655                 if (!adev->ip_blocks[i].status.valid)
2656                         continue;
2657
2658                 /* displays are handled separately */
2659                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2660                         continue;
2661
2662                 /* XXX handle errors */
2663                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2664                 /* XXX handle errors */
2665                 if (r) {
2666                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2667                                   adev->ip_blocks[i].version->funcs->name, r);
2668                         return r;
2669                 }
2670
2671                 adev->ip_blocks[i].status.hw = false;
2672         }
2673
2674         return 0;
2675 }
2676
2677 /**
2678  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2679  *
2680  * @adev: amdgpu_device pointer
2681  *
2682  * Main suspend function for hardware IPs.  The list of all the hardware
2683  * IPs that make up the asic is walked, clockgating is disabled and the
2684  * suspend callbacks are run.  suspend puts the hardware and software state
2685  * in each IP into a state suitable for suspend.
2686  * Returns 0 on success, negative error code on failure.
2687  */
2688 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2689 {
2690         int i, r;
2691
2692         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2693                 if (!adev->ip_blocks[i].status.valid)
2694                         continue;
2695                 /* displays are handled in phase1 */
2696                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2697                         continue;
2698                 /* PSP lost connection when err_event_athub occurs */
2699                 if (amdgpu_ras_intr_triggered() &&
2700                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2701                         adev->ip_blocks[i].status.hw = false;
2702                         continue;
2703                 }
2704                 /* XXX handle errors */
2705                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2706                 /* XXX handle errors */
2707                 if (r) {
2708                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2709                                   adev->ip_blocks[i].version->funcs->name, r);
2710                 }
2711                 adev->ip_blocks[i].status.hw = false;
2712                 /* handle putting the SMC in the appropriate state */
2713                 if(!amdgpu_sriov_vf(adev)){
2714                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2715                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2716                                 if (r) {
2717                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2718                                                         adev->mp1_state, r);
2719                                         return r;
2720                                 }
2721                         }
2722                 }
2723                 adev->ip_blocks[i].status.hw = false;
2724         }
2725
2726         return 0;
2727 }
2728
2729 /**
2730  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2731  *
2732  * @adev: amdgpu_device pointer
2733  *
2734  * Main suspend function for hardware IPs.  The list of all the hardware
2735  * IPs that make up the asic is walked, clockgating is disabled and the
2736  * suspend callbacks are run.  suspend puts the hardware and software state
2737  * in each IP into a state suitable for suspend.
2738  * Returns 0 on success, negative error code on failure.
2739  */
2740 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2741 {
2742         int r;
2743
2744         if (amdgpu_sriov_vf(adev))
2745                 amdgpu_virt_request_full_gpu(adev, false);
2746
2747         r = amdgpu_device_ip_suspend_phase1(adev);
2748         if (r)
2749                 return r;
2750         r = amdgpu_device_ip_suspend_phase2(adev);
2751
2752         if (amdgpu_sriov_vf(adev))
2753                 amdgpu_virt_release_full_gpu(adev, false);
2754
2755         return r;
2756 }
2757
2758 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2759 {
2760         int i, r;
2761
2762         static enum amd_ip_block_type ip_order[] = {
2763                 AMD_IP_BLOCK_TYPE_GMC,
2764                 AMD_IP_BLOCK_TYPE_COMMON,
2765                 AMD_IP_BLOCK_TYPE_PSP,
2766                 AMD_IP_BLOCK_TYPE_IH,
2767         };
2768
2769         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2770                 int j;
2771                 struct amdgpu_ip_block *block;
2772
2773                 block = &adev->ip_blocks[i];
2774                 block->status.hw = false;
2775
2776                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2777
2778                         if (block->version->type != ip_order[j] ||
2779                                 !block->status.valid)
2780                                 continue;
2781
2782                         r = block->version->funcs->hw_init(adev);
2783                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2784                         if (r)
2785                                 return r;
2786                         block->status.hw = true;
2787                 }
2788         }
2789
2790         return 0;
2791 }
2792
2793 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2794 {
2795         int i, r;
2796
2797         static enum amd_ip_block_type ip_order[] = {
2798                 AMD_IP_BLOCK_TYPE_SMC,
2799                 AMD_IP_BLOCK_TYPE_DCE,
2800                 AMD_IP_BLOCK_TYPE_GFX,
2801                 AMD_IP_BLOCK_TYPE_SDMA,
2802                 AMD_IP_BLOCK_TYPE_UVD,
2803                 AMD_IP_BLOCK_TYPE_VCE,
2804                 AMD_IP_BLOCK_TYPE_VCN
2805         };
2806
2807         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2808                 int j;
2809                 struct amdgpu_ip_block *block;
2810
2811                 for (j = 0; j < adev->num_ip_blocks; j++) {
2812                         block = &adev->ip_blocks[j];
2813
2814                         if (block->version->type != ip_order[i] ||
2815                                 !block->status.valid ||
2816                                 block->status.hw)
2817                                 continue;
2818
2819                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2820                                 r = block->version->funcs->resume(adev);
2821                         else
2822                                 r = block->version->funcs->hw_init(adev);
2823
2824                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2825                         if (r)
2826                                 return r;
2827                         block->status.hw = true;
2828                 }
2829         }
2830
2831         return 0;
2832 }
2833
2834 /**
2835  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2836  *
2837  * @adev: amdgpu_device pointer
2838  *
2839  * First resume function for hardware IPs.  The list of all the hardware
2840  * IPs that make up the asic is walked and the resume callbacks are run for
2841  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2842  * after a suspend and updates the software state as necessary.  This
2843  * function is also used for restoring the GPU after a GPU reset.
2844  * Returns 0 on success, negative error code on failure.
2845  */
2846 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2847 {
2848         int i, r;
2849
2850         for (i = 0; i < adev->num_ip_blocks; i++) {
2851                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2852                         continue;
2853                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2854                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2855                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2856
2857                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2858                         if (r) {
2859                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2860                                           adev->ip_blocks[i].version->funcs->name, r);
2861                                 return r;
2862                         }
2863                         adev->ip_blocks[i].status.hw = true;
2864                 }
2865         }
2866
2867         return 0;
2868 }
2869
2870 /**
2871  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2872  *
2873  * @adev: amdgpu_device pointer
2874  *
2875  * First resume function for hardware IPs.  The list of all the hardware
2876  * IPs that make up the asic is walked and the resume callbacks are run for
2877  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2878  * functional state after a suspend and updates the software state as
2879  * necessary.  This function is also used for restoring the GPU after a GPU
2880  * reset.
2881  * Returns 0 on success, negative error code on failure.
2882  */
2883 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2884 {
2885         int i, r;
2886
2887         for (i = 0; i < adev->num_ip_blocks; i++) {
2888                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2889                         continue;
2890                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2891                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2892                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2893                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2894                         continue;
2895                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2896                 if (r) {
2897                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2898                                   adev->ip_blocks[i].version->funcs->name, r);
2899                         return r;
2900                 }
2901                 adev->ip_blocks[i].status.hw = true;
2902         }
2903
2904         return 0;
2905 }
2906
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  The hardware IPs
 * are split into two resume functions because they are
 * also used in recovering from a GPU reset and some additional
 * steps need to be taken between them.  In this case (S3/S4) they are
 * run sequentially, with firmware loading in between.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int ret;

	ret = amdgpu_device_ip_resume_phase1(adev);
	if (!ret)
		ret = amdgpu_device_fw_loading(adev);
	if (!ret)
		ret = amdgpu_device_ip_resume_phase2(adev);

	return ret;
}
2935
2936 /**
2937  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2938  *
2939  * @adev: amdgpu_device pointer
2940  *
2941  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2942  */
2943 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2944 {
2945         if (amdgpu_sriov_vf(adev)) {
2946                 if (adev->is_atom_fw) {
2947                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2948                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2949                 } else {
2950                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2951                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2952                 }
2953
2954                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2955                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2956         }
2957 }
2958
2959 /**
2960  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2961  *
2962  * @asic_type: AMD asic type
2963  *
2964  * Check if there is DC (new modesetting infrastructre) support for an asic.
2965  * returns true if DC has support, false if not.
2966  */
2967 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2968 {
2969         switch (asic_type) {
2970 #if defined(CONFIG_DRM_AMD_DC)
2971 #if defined(CONFIG_DRM_AMD_DC_SI)
2972         case CHIP_TAHITI:
2973         case CHIP_PITCAIRN:
2974         case CHIP_VERDE:
2975         case CHIP_OLAND:
2976 #endif
2977         case CHIP_BONAIRE:
2978         case CHIP_KAVERI:
2979         case CHIP_KABINI:
2980         case CHIP_MULLINS:
2981                 /*
2982                  * We have systems in the wild with these ASICs that require
2983                  * LVDS and VGA support which is not supported with DC.
2984                  *
2985                  * Fallback to the non-DC driver here by default so as not to
2986                  * cause regressions.
2987                  */
2988                 return amdgpu_dc > 0;
2989         case CHIP_HAWAII:
2990         case CHIP_CARRIZO:
2991         case CHIP_STONEY:
2992         case CHIP_POLARIS10:
2993         case CHIP_POLARIS11:
2994         case CHIP_POLARIS12:
2995         case CHIP_VEGAM:
2996         case CHIP_TONGA:
2997         case CHIP_FIJI:
2998         case CHIP_VEGA10:
2999         case CHIP_VEGA12:
3000         case CHIP_VEGA20:
3001 #if defined(CONFIG_DRM_AMD_DC_DCN)
3002         case CHIP_RAVEN:
3003         case CHIP_NAVI10:
3004         case CHIP_NAVI14:
3005         case CHIP_NAVI12:
3006         case CHIP_RENOIR:
3007 #endif
3008 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
3009         case CHIP_SIENNA_CICHLID:
3010         case CHIP_NAVY_FLOUNDER:
3011         case CHIP_DIMGREY_CAVEFISH:
3012         case CHIP_VANGOGH:
3013 #endif
3014                 return amdgpu_dc != 0;
3015 #endif
3016         default:
3017                 if (amdgpu_dc > 0)
3018                         DRM_INFO("Display Core has been requested via kernel parameter "
3019                                          "but isn't supported by ASIC, ignoring\n");
3020                 return false;
3021         }
3022 }
3023
3024 /**
3025  * amdgpu_device_has_dc_support - check if dc is supported
3026  *
3027  * @adev: amdgpu_device pointer
3028  *
3029  * Returns true for supported, false for not supported
3030  */
3031 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3032 {
3033         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3034                 return false;
3035
3036         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3037 }
3038
3039
3040 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3041 {
3042         struct amdgpu_device *adev =
3043                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3044         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3045
3046         /* It's a bug to not have a hive within this function */
3047         if (WARN_ON(!hive))
3048                 return;
3049
3050         /*
3051          * Use task barrier to synchronize all xgmi reset works across the
3052          * hive. task_barrier_enter and task_barrier_exit will block
3053          * until all the threads running the xgmi reset works reach
3054          * those points. task_barrier_full will do both blocks.
3055          */
3056         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3057
3058                 task_barrier_enter(&hive->tb);
3059                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3060
3061                 if (adev->asic_reset_res)
3062                         goto fail;
3063
3064                 task_barrier_exit(&hive->tb);
3065                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3066
3067                 if (adev->asic_reset_res)
3068                         goto fail;
3069
3070                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3071                         adev->mmhub.funcs->reset_ras_error_count(adev);
3072         } else {
3073
3074                 task_barrier_full(&hive->tb);
3075                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3076         }
3077
3078 fail:
3079         if (adev->asic_reset_res)
3080                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3081                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3082         amdgpu_put_xgmi_hive(hive);
3083 }
3084
3085 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3086 {
3087         char *input = amdgpu_lockup_timeout;
3088         char *timeout_setting = NULL;
3089         int index = 0;
3090         long timeout;
3091         int ret = 0;
3092
3093         /*
3094          * By default timeout for non compute jobs is 10000.
3095          * And there is no timeout enforced on compute jobs.
3096          * In SR-IOV or passthrough mode, timeout for compute
3097          * jobs are 60000 by default.
3098          */
3099         adev->gfx_timeout = msecs_to_jiffies(10000);
3100         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3101         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3102                 adev->compute_timeout =  msecs_to_jiffies(60000);
3103         else
3104                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3105
3106         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3107                 while ((timeout_setting = strsep(&input, ",")) &&
3108                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3109                         ret = kstrtol(timeout_setting, 0, &timeout);
3110                         if (ret)
3111                                 return ret;
3112
3113                         if (timeout == 0) {
3114                                 index++;
3115                                 continue;
3116                         } else if (timeout < 0) {
3117                                 timeout = MAX_SCHEDULE_TIMEOUT;
3118                         } else {
3119                                 timeout = msecs_to_jiffies(timeout);
3120                         }
3121
3122                         switch (index++) {
3123                         case 0:
3124                                 adev->gfx_timeout = timeout;
3125                                 break;
3126                         case 1:
3127                                 adev->compute_timeout = timeout;
3128                                 break;
3129                         case 2:
3130                                 adev->sdma_timeout = timeout;
3131                                 break;
3132                         case 3:
3133                                 adev->video_timeout = timeout;
3134                                 break;
3135                         default:
3136                                 break;
3137                         }
3138                 }
3139                 /*
3140                  * There is only one value specified and
3141                  * it should apply to all non-compute jobs.
3142                  */
3143                 if (index == 1) {
3144                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3145                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3146                                 adev->compute_timeout = adev->gfx_timeout;
3147                 }
3148         }
3149
3150         return ret;
3151 }
3152
3153 static const struct attribute *amdgpu_dev_attributes[] = {
3154         &dev_attr_product_name.attr,
3155         &dev_attr_product_number.attr,
3156         &dev_attr_serial_number.attr,
3157         &dev_attr_pcie_replay_count.attr,
3158         NULL
3159 };
3160
3161
3162 /**
3163  * amdgpu_device_init - initialize the driver
3164  *
3165  * @adev: amdgpu_device pointer
3166  * @flags: driver flags
3167  *
3168  * Initializes the driver info and hw (all asics).
3169  * Returns 0 for success or an error on failure.
3170  * Called at driver startup.
3171  */
3172 int amdgpu_device_init(struct amdgpu_device *adev,
3173                        uint32_t flags)
3174 {
3175         struct drm_device *ddev = adev_to_drm(adev);
3176         struct pci_dev *pdev = adev->pdev;
3177         int r, i;
3178         bool boco = false;
3179         u32 max_MBps;
3180
3181         adev->shutdown = false;
3182         adev->flags = flags;
3183
3184         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3185                 adev->asic_type = amdgpu_force_asic_type;
3186         else
3187                 adev->asic_type = flags & AMD_ASIC_MASK;
3188
3189         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3190         if (amdgpu_emu_mode == 1)
3191                 adev->usec_timeout *= 10;
3192         adev->gmc.gart_size = 512 * 1024 * 1024;
3193         adev->accel_working = false;
3194         adev->num_rings = 0;
3195         adev->mman.buffer_funcs = NULL;
3196         adev->mman.buffer_funcs_ring = NULL;
3197         adev->vm_manager.vm_pte_funcs = NULL;
3198         adev->vm_manager.vm_pte_num_scheds = 0;
3199         adev->gmc.gmc_funcs = NULL;
3200         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3201         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3202
3203         adev->smc_rreg = &amdgpu_invalid_rreg;
3204         adev->smc_wreg = &amdgpu_invalid_wreg;
3205         adev->pcie_rreg = &amdgpu_invalid_rreg;
3206         adev->pcie_wreg = &amdgpu_invalid_wreg;
3207         adev->pciep_rreg = &amdgpu_invalid_rreg;
3208         adev->pciep_wreg = &amdgpu_invalid_wreg;
3209         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3210         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3211         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3212         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3213         adev->didt_rreg = &amdgpu_invalid_rreg;
3214         adev->didt_wreg = &amdgpu_invalid_wreg;
3215         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3216         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3217         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3218         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3219
3220         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3221                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3222                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3223
3224         /* mutex initialization are all done here so we
3225          * can recall function without having locking issues */
3226         atomic_set(&adev->irq.ih.lock, 0);
3227         mutex_init(&adev->firmware.mutex);
3228         mutex_init(&adev->pm.mutex);
3229         mutex_init(&adev->gfx.gpu_clock_mutex);
3230         mutex_init(&adev->srbm_mutex);
3231         mutex_init(&adev->gfx.pipe_reserve_mutex);
3232         mutex_init(&adev->gfx.gfx_off_mutex);
3233         mutex_init(&adev->grbm_idx_mutex);
3234         mutex_init(&adev->mn_lock);
3235         mutex_init(&adev->virt.vf_errors.lock);
3236         hash_init(adev->mn_hash);
3237         atomic_set(&adev->in_gpu_reset, 0);
3238         init_rwsem(&adev->reset_sem);
3239         mutex_init(&adev->psp.mutex);
3240         mutex_init(&adev->notifier_lock);
3241
3242         r = amdgpu_device_check_arguments(adev);
3243         if (r)
3244                 return r;
3245
3246         spin_lock_init(&adev->mmio_idx_lock);
3247         spin_lock_init(&adev->smc_idx_lock);
3248         spin_lock_init(&adev->pcie_idx_lock);
3249         spin_lock_init(&adev->uvd_ctx_idx_lock);
3250         spin_lock_init(&adev->didt_idx_lock);
3251         spin_lock_init(&adev->gc_cac_idx_lock);
3252         spin_lock_init(&adev->se_cac_idx_lock);
3253         spin_lock_init(&adev->audio_endpt_idx_lock);
3254         spin_lock_init(&adev->mm_stats.lock);
3255
3256         INIT_LIST_HEAD(&adev->shadow_list);
3257         mutex_init(&adev->shadow_list_lock);
3258
3259         INIT_DELAYED_WORK(&adev->delayed_init_work,
3260                           amdgpu_device_delayed_init_work_handler);
3261         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3262                           amdgpu_device_delay_enable_gfx_off);
3263
3264         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3265
3266         adev->gfx.gfx_off_req_count = 1;
3267         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3268
3269         atomic_set(&adev->throttling_logging_enabled, 1);
3270         /*
3271          * If throttling continues, logging will be performed every minute
3272          * to avoid log flooding. "-1" is subtracted since the thermal
3273          * throttling interrupt comes every second. Thus, the total logging
3274          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3275          * for throttling interrupt) = 60 seconds.
3276          */
3277         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3278         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3279
3280         /* Registers mapping */
3281         /* TODO: block userspace mapping of io register */
3282         if (adev->asic_type >= CHIP_BONAIRE) {
3283                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3284                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3285         } else {
3286                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3287                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3288         }
3289
3290         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3291         if (adev->rmmio == NULL) {
3292                 return -ENOMEM;
3293         }
3294         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3295         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3296
3297         /* io port mapping */
3298         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3299                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3300                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3301                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3302                         break;
3303                 }
3304         }
3305         if (adev->rio_mem == NULL)
3306                 DRM_INFO("PCI I/O BAR is not found.\n");
3307
3308         /* enable PCIE atomic ops */
3309         r = pci_enable_atomic_ops_to_root(adev->pdev,
3310                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3311                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3312         if (r) {
3313                 adev->have_atomics_support = false;
3314                 DRM_INFO("PCIE atomic ops is not supported\n");
3315         } else {
3316                 adev->have_atomics_support = true;
3317         }
3318
3319         amdgpu_device_get_pcie_info(adev);
3320
3321         if (amdgpu_mcbp)
3322                 DRM_INFO("MCBP is enabled\n");
3323
3324         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3325                 adev->enable_mes = true;
3326
3327         /* detect hw virtualization here */
3328         amdgpu_detect_virtualization(adev);
3329
3330         r = amdgpu_device_get_job_timeout_settings(adev);
3331         if (r) {
3332                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3333                 goto failed_unmap;
3334         }
3335
3336         /* early init functions */
3337         r = amdgpu_device_ip_early_init(adev);
3338         if (r)
3339                 goto failed_unmap;
3340
3341         /* doorbell bar mapping and doorbell index init*/
3342         amdgpu_device_doorbell_init(adev);
3343
3344         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3345         /* this will fail for cards that aren't VGA class devices, just
3346          * ignore it */
3347         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3348
3349         if (amdgpu_device_supports_boco(ddev))
3350                 boco = true;
3351         if (amdgpu_has_atpx() &&
3352             (amdgpu_is_atpx_hybrid() ||
3353              amdgpu_has_atpx_dgpu_power_cntl()) &&
3354             !pci_is_thunderbolt_attached(adev->pdev))
3355                 vga_switcheroo_register_client(adev->pdev,
3356                                                &amdgpu_switcheroo_ops, boco);
3357         if (boco)
3358                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3359
3360         if (amdgpu_emu_mode == 1) {
3361                 /* post the asic on emulation mode */
3362                 emu_soc_asic_init(adev);
3363                 goto fence_driver_init;
3364         }
3365
3366         /* detect if we are with an SRIOV vbios */
3367         amdgpu_device_detect_sriov_bios(adev);
3368
3369         /* check if we need to reset the asic
3370          *  E.g., driver was not cleanly unloaded previously, etc.
3371          */
3372         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3373                 r = amdgpu_asic_reset(adev);
3374                 if (r) {
3375                         dev_err(adev->dev, "asic reset on init failed\n");
3376                         goto failed;
3377                 }
3378         }
3379
3380         pci_enable_pcie_error_reporting(adev->ddev.pdev);
3381
3382         /* Post card if necessary */
3383         if (amdgpu_device_need_post(adev)) {
3384                 if (!adev->bios) {
3385                         dev_err(adev->dev, "no vBIOS found\n");
3386                         r = -EINVAL;
3387                         goto failed;
3388                 }
3389                 DRM_INFO("GPU posting now...\n");
3390                 r = amdgpu_device_asic_init(adev);
3391                 if (r) {
3392                         dev_err(adev->dev, "gpu post error!\n");
3393                         goto failed;
3394                 }
3395         }
3396
3397         if (adev->is_atom_fw) {
3398                 /* Initialize clocks */
3399                 r = amdgpu_atomfirmware_get_clock_info(adev);
3400                 if (r) {
3401                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3402                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3403                         goto failed;
3404                 }
3405         } else {
3406                 /* Initialize clocks */
3407                 r = amdgpu_atombios_get_clock_info(adev);
3408                 if (r) {
3409                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3410                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3411                         goto failed;
3412                 }
3413                 /* init i2c buses */
3414                 if (!amdgpu_device_has_dc_support(adev))
3415                         amdgpu_atombios_i2c_init(adev);
3416         }
3417
3418 fence_driver_init:
3419         /* Fence driver */
3420         r = amdgpu_fence_driver_init(adev);
3421         if (r) {
3422                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3423                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3424                 goto failed;
3425         }
3426
3427         /* init the mode config */
3428         drm_mode_config_init(adev_to_drm(adev));
3429
3430         r = amdgpu_device_ip_init(adev);
3431         if (r) {
3432                 /* failed in exclusive mode due to timeout */
3433                 if (amdgpu_sriov_vf(adev) &&
3434                     !amdgpu_sriov_runtime(adev) &&
3435                     amdgpu_virt_mmio_blocked(adev) &&
3436                     !amdgpu_virt_wait_reset(adev)) {
3437                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3438                         /* Don't send request since VF is inactive. */
3439                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3440                         adev->virt.ops = NULL;
3441                         r = -EAGAIN;
3442                         goto failed;
3443                 }
3444                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3445                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3446                 goto failed;
3447         }
3448
3449         dev_info(adev->dev,
3450                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3451                         adev->gfx.config.max_shader_engines,
3452                         adev->gfx.config.max_sh_per_se,
3453                         adev->gfx.config.max_cu_per_sh,
3454                         adev->gfx.cu_info.number);
3455
3456         adev->accel_working = true;
3457
3458         amdgpu_vm_check_compute_bug(adev);
3459
3460         /* Initialize the buffer migration limit. */
3461         if (amdgpu_moverate >= 0)
3462                 max_MBps = amdgpu_moverate;
3463         else
3464                 max_MBps = 8; /* Allow 8 MB/s. */
3465         /* Get a log2 for easy divisions. */
3466         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3467
3468         amdgpu_fbdev_init(adev);
3469
3470         r = amdgpu_pm_sysfs_init(adev);
3471         if (r) {
3472                 adev->pm_sysfs_en = false;
3473                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3474         } else
3475                 adev->pm_sysfs_en = true;
3476
3477         r = amdgpu_ucode_sysfs_init(adev);
3478         if (r) {
3479                 adev->ucode_sysfs_en = false;
3480                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3481         } else
3482                 adev->ucode_sysfs_en = true;
3483
3484         if ((amdgpu_testing & 1)) {
3485                 if (adev->accel_working)
3486                         amdgpu_test_moves(adev);
3487                 else
3488                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3489         }
3490         if (amdgpu_benchmarking) {
3491                 if (adev->accel_working)
3492                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3493                 else
3494                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3495         }
3496
3497         /*
3498          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3499          * Otherwise the mgpu fan boost feature will be skipped due to the
3500          * gpu instance is counted less.
3501          */
3502         amdgpu_register_gpu_instance(adev);
3503
3504         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3505          * explicit gating rather than handling it automatically.
3506          */
3507         r = amdgpu_device_ip_late_init(adev);
3508         if (r) {
3509                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3510                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3511                 goto failed;
3512         }
3513
3514         /* must succeed. */
3515         amdgpu_ras_resume(adev);
3516
3517         queue_delayed_work(system_wq, &adev->delayed_init_work,
3518                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3519
3520         if (amdgpu_sriov_vf(adev))
3521                 flush_delayed_work(&adev->delayed_init_work);
3522
3523         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3524         if (r)
3525                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3526
3527         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3528                 r = amdgpu_pmu_init(adev);
3529         if (r)
3530                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3531
3532         /* Have stored pci confspace at hand for restore in sudden PCI error */
3533         if (amdgpu_device_cache_pci_state(adev->pdev))
3534                 pci_restore_state(pdev);
3535
3536         return 0;
3537
3538 failed:
3539         amdgpu_vf_error_trans_all(adev);
3540         if (boco)
3541                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3542
3543 failed_unmap:
3544         iounmap(adev->rmmio);
3545         adev->rmmio = NULL;
3546
3547         return r;
3548 }
3549
3550 /**
3551  * amdgpu_device_fini - tear down the driver
3552  *
3553  * @adev: amdgpu_device pointer
3554  *
3555  * Tear down the driver info (all asics).
3556  * Called at driver shutdown.
3557  */
3558 void amdgpu_device_fini(struct amdgpu_device *adev)
3559 {
3560         dev_info(adev->dev, "amdgpu: finishing device.\n");
3561         flush_delayed_work(&adev->delayed_init_work);
3562         adev->shutdown = true;
3563
3564         kfree(adev->pci_state);
3565
3566         /* make sure IB test finished before entering exclusive mode
3567          * to avoid preemption on IB test
3568          * */
3569         if (amdgpu_sriov_vf(adev)) {
3570                 amdgpu_virt_request_full_gpu(adev, false);
3571                 amdgpu_virt_fini_data_exchange(adev);
3572         }
3573
3574         /* disable all interrupts */
3575         amdgpu_irq_disable_all(adev);
3576         if (adev->mode_info.mode_config_initialized){
3577                 if (!amdgpu_device_has_dc_support(adev))
3578                         drm_helper_force_disable_all(adev_to_drm(adev));
3579                 else
3580                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3581         }
3582         amdgpu_fence_driver_fini(adev);
3583         if (adev->pm_sysfs_en)
3584                 amdgpu_pm_sysfs_fini(adev);
3585         amdgpu_fbdev_fini(adev);
3586         amdgpu_device_ip_fini(adev);
3587         release_firmware(adev->firmware.gpu_info_fw);
3588         adev->firmware.gpu_info_fw = NULL;
3589         adev->accel_working = false;
3590         /* free i2c buses */
3591         if (!amdgpu_device_has_dc_support(adev))
3592                 amdgpu_i2c_fini(adev);
3593
3594         if (amdgpu_emu_mode != 1)
3595                 amdgpu_atombios_fini(adev);
3596
3597         kfree(adev->bios);
3598         adev->bios = NULL;
3599         if (amdgpu_has_atpx() &&
3600             (amdgpu_is_atpx_hybrid() ||
3601              amdgpu_has_atpx_dgpu_power_cntl()) &&
3602             !pci_is_thunderbolt_attached(adev->pdev))
3603                 vga_switcheroo_unregister_client(adev->pdev);
3604         if (amdgpu_device_supports_boco(adev_to_drm(adev)))
3605                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3606         vga_client_register(adev->pdev, NULL, NULL, NULL);
3607         if (adev->rio_mem)
3608                 pci_iounmap(adev->pdev, adev->rio_mem);
3609         adev->rio_mem = NULL;
3610         iounmap(adev->rmmio);
3611         adev->rmmio = NULL;
3612         amdgpu_device_doorbell_fini(adev);
3613
3614         if (adev->ucode_sysfs_en)
3615                 amdgpu_ucode_sysfs_fini(adev);
3616
3617         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3618         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3619                 amdgpu_pmu_fini(adev);
3620         if (adev->mman.discovery_bin)
3621                 amdgpu_discovery_fini(adev);
3622 }
3623
3624
3625 /*
3626  * Suspend & resume.
3627  */
3628 /**
3629  * amdgpu_device_suspend - initiate device suspend
3630  *
3631  * @dev: drm dev pointer
3632  * @fbcon : notify the fbdev of suspend
3633  *
3634  * Puts the hw in the suspend state (all asics).
3635  * Returns 0 for success or an error on failure.
3636  * Called at driver suspend.
3637  */
3638 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3639 {
3640         struct amdgpu_device *adev;
3641         struct drm_crtc *crtc;
3642         struct drm_connector *connector;
3643         struct drm_connector_list_iter iter;
3644         int r;
3645
3646         adev = drm_to_adev(dev);
3647
3648         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3649                 return 0;
3650
3651         adev->in_suspend = true;
3652         drm_kms_helper_poll_disable(dev);
3653
3654         if (fbcon)
3655                 amdgpu_fbdev_set_suspend(adev, 1);
3656
3657         cancel_delayed_work_sync(&adev->delayed_init_work);
3658
3659         if (!amdgpu_device_has_dc_support(adev)) {
3660                 /* turn off display hw */
3661                 drm_modeset_lock_all(dev);
3662                 drm_connector_list_iter_begin(dev, &iter);
3663                 drm_for_each_connector_iter(connector, &iter)
3664                         drm_helper_connector_dpms(connector,
3665                                                   DRM_MODE_DPMS_OFF);
3666                 drm_connector_list_iter_end(&iter);
3667                 drm_modeset_unlock_all(dev);
3668                         /* unpin the front buffers and cursors */
3669                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3670                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3671                         struct drm_framebuffer *fb = crtc->primary->fb;
3672                         struct amdgpu_bo *robj;
3673
3674                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3675                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3676                                 r = amdgpu_bo_reserve(aobj, true);
3677                                 if (r == 0) {
3678                                         amdgpu_bo_unpin(aobj);
3679                                         amdgpu_bo_unreserve(aobj);
3680                                 }
3681                         }
3682
3683                         if (fb == NULL || fb->obj[0] == NULL) {
3684                                 continue;
3685                         }
3686                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3687                         /* don't unpin kernel fb objects */
3688                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3689                                 r = amdgpu_bo_reserve(robj, true);
3690                                 if (r == 0) {
3691                                         amdgpu_bo_unpin(robj);
3692                                         amdgpu_bo_unreserve(robj);
3693                                 }
3694                         }
3695                 }
3696         }
3697
3698         amdgpu_ras_suspend(adev);
3699
3700         r = amdgpu_device_ip_suspend_phase1(adev);
3701
3702         amdgpu_amdkfd_suspend(adev, !fbcon);
3703
3704         /* evict vram memory */
3705         amdgpu_bo_evict_vram(adev);
3706
3707         amdgpu_fence_driver_suspend(adev);
3708
3709         r = amdgpu_device_ip_suspend_phase2(adev);
3710
3711         /* evict remaining vram memory
3712          * This second call to evict vram is to evict the gart page table
3713          * using the CPU.
3714          */
3715         amdgpu_bo_evict_vram(adev);
3716
3717         return 0;
3718 }
3719
3720 /**
3721  * amdgpu_device_resume - initiate device resume
3722  *
3723  * @dev: drm dev pointer
3724  * @fbcon : notify the fbdev of resume
3725  *
3726  * Bring the hw back to operating state (all asics).
3727  * Returns 0 for success or an error on failure.
3728  * Called at driver resume.
3729  */
3730 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3731 {
3732         struct drm_connector *connector;
3733         struct drm_connector_list_iter iter;
3734         struct amdgpu_device *adev = drm_to_adev(dev);
3735         struct drm_crtc *crtc;
3736         int r = 0;
3737
3738         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3739                 return 0;
3740
3741         /* post card */
3742         if (amdgpu_device_need_post(adev)) {
3743                 r = amdgpu_device_asic_init(adev);
3744                 if (r)
3745                         dev_err(adev->dev, "amdgpu asic init failed\n");
3746         }
3747
3748         r = amdgpu_device_ip_resume(adev);
3749         if (r) {
3750                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3751                 return r;
3752         }
3753         amdgpu_fence_driver_resume(adev);
3754
3755
3756         r = amdgpu_device_ip_late_init(adev);
3757         if (r)
3758                 return r;
3759
3760         queue_delayed_work(system_wq, &adev->delayed_init_work,
3761                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3762
3763         if (!amdgpu_device_has_dc_support(adev)) {
3764                 /* pin cursors */
3765                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3766                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3767
3768                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3769                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3770                                 r = amdgpu_bo_reserve(aobj, true);
3771                                 if (r == 0) {
3772                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3773                                         if (r != 0)
3774                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3775                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3776                                         amdgpu_bo_unreserve(aobj);
3777                                 }
3778                         }
3779                 }
3780         }
3781         r = amdgpu_amdkfd_resume(adev, !fbcon);
3782         if (r)
3783                 return r;
3784
3785         /* Make sure IB tests flushed */
3786         flush_delayed_work(&adev->delayed_init_work);
3787
3788         /* blat the mode back in */
3789         if (fbcon) {
3790                 if (!amdgpu_device_has_dc_support(adev)) {
3791                         /* pre DCE11 */
3792                         drm_helper_resume_force_mode(dev);
3793
3794                         /* turn on display hw */
3795                         drm_modeset_lock_all(dev);
3796
3797                         drm_connector_list_iter_begin(dev, &iter);
3798                         drm_for_each_connector_iter(connector, &iter)
3799                                 drm_helper_connector_dpms(connector,
3800                                                           DRM_MODE_DPMS_ON);
3801                         drm_connector_list_iter_end(&iter);
3802
3803                         drm_modeset_unlock_all(dev);
3804                 }
3805                 amdgpu_fbdev_set_suspend(adev, 0);
3806         }
3807
3808         drm_kms_helper_poll_enable(dev);
3809
3810         amdgpu_ras_resume(adev);
3811
3812         /*
3813          * Most of the connector probing functions try to acquire runtime pm
3814          * refs to ensure that the GPU is powered on when connector polling is
3815          * performed. Since we're calling this from a runtime PM callback,
3816          * trying to acquire rpm refs will cause us to deadlock.
3817          *
3818          * Since we're guaranteed to be holding the rpm lock, it's safe to
3819          * temporarily disable the rpm helpers so this doesn't deadlock us.
3820          */
3821 #ifdef CONFIG_PM
3822         dev->dev->power.disable_depth++;
3823 #endif
3824         if (!amdgpu_device_has_dc_support(adev))
3825                 drm_helper_hpd_irq_event(dev);
3826         else
3827                 drm_kms_helper_hotplug_event(dev);
3828 #ifdef CONFIG_PM
3829         dev->dev->power.disable_depth--;
3830 #endif
3831         adev->in_suspend = false;
3832
3833         return 0;
3834 }
3835
3836 /**
3837  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3838  *
3839  * @adev: amdgpu_device pointer
3840  *
3841  * The list of all the hardware IPs that make up the asic is walked and
3842  * the check_soft_reset callbacks are run.  check_soft_reset determines
3843  * if the asic is still hung or not.
3844  * Returns true if any of the IPs are still in a hung state, false if not.
3845  */
3846 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3847 {
3848         int i;
3849         bool asic_hang = false;
3850
3851         if (amdgpu_sriov_vf(adev))
3852                 return true;
3853
3854         if (amdgpu_asic_need_full_reset(adev))
3855                 return true;
3856
3857         for (i = 0; i < adev->num_ip_blocks; i++) {
3858                 if (!adev->ip_blocks[i].status.valid)
3859                         continue;
3860                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3861                         adev->ip_blocks[i].status.hang =
3862                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3863                 if (adev->ip_blocks[i].status.hang) {
3864                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3865                         asic_hang = true;
3866                 }
3867         }
3868         return asic_hang;
3869 }
3870
3871 /**
3872  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3873  *
3874  * @adev: amdgpu_device pointer
3875  *
3876  * The list of all the hardware IPs that make up the asic is walked and the
3877  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3878  * handles any IP specific hardware or software state changes that are
3879  * necessary for a soft reset to succeed.
3880  * Returns 0 on success, negative error code on failure.
3881  */
3882 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3883 {
3884         int i, r = 0;
3885
3886         for (i = 0; i < adev->num_ip_blocks; i++) {
3887                 if (!adev->ip_blocks[i].status.valid)
3888                         continue;
3889                 if (adev->ip_blocks[i].status.hang &&
3890                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3891                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3892                         if (r)
3893                                 return r;
3894                 }
3895         }
3896
3897         return 0;
3898 }
3899
3900 /**
3901  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3902  *
3903  * @adev: amdgpu_device pointer
3904  *
3905  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3906  * reset is necessary to recover.
3907  * Returns true if a full asic reset is required, false if not.
3908  */
3909 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3910 {
3911         int i;
3912
3913         if (amdgpu_asic_need_full_reset(adev))
3914                 return true;
3915
3916         for (i = 0; i < adev->num_ip_blocks; i++) {
3917                 if (!adev->ip_blocks[i].status.valid)
3918                         continue;
3919                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3920                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3921                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3922                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3923                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3924                         if (adev->ip_blocks[i].status.hang) {
3925                                 dev_info(adev->dev, "Some block need full reset!\n");
3926                                 return true;
3927                         }
3928                 }
3929         }
3930         return false;
3931 }
3932
3933 /**
3934  * amdgpu_device_ip_soft_reset - do a soft reset
3935  *
3936  * @adev: amdgpu_device pointer
3937  *
3938  * The list of all the hardware IPs that make up the asic is walked and the
3939  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3940  * IP specific hardware or software state changes that are necessary to soft
3941  * reset the IP.
3942  * Returns 0 on success, negative error code on failure.
3943  */
3944 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3945 {
3946         int i, r = 0;
3947
3948         for (i = 0; i < adev->num_ip_blocks; i++) {
3949                 if (!adev->ip_blocks[i].status.valid)
3950                         continue;
3951                 if (adev->ip_blocks[i].status.hang &&
3952                     adev->ip_blocks[i].version->funcs->soft_reset) {
3953                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3954                         if (r)
3955                                 return r;
3956                 }
3957         }
3958
3959         return 0;
3960 }
3961
3962 /**
3963  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3964  *
3965  * @adev: amdgpu_device pointer
3966  *
3967  * The list of all the hardware IPs that make up the asic is walked and the
3968  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3969  * handles any IP specific hardware or software state changes that are
3970  * necessary after the IP has been soft reset.
3971  * Returns 0 on success, negative error code on failure.
3972  */
3973 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3974 {
3975         int i, r = 0;
3976
3977         for (i = 0; i < adev->num_ip_blocks; i++) {
3978                 if (!adev->ip_blocks[i].status.valid)
3979                         continue;
3980                 if (adev->ip_blocks[i].status.hang &&
3981                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3982                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3983                 if (r)
3984                         return r;
3985         }
3986
3987         return 0;
3988 }
3989
3990 /**
3991  * amdgpu_device_recover_vram - Recover some VRAM contents
3992  *
3993  * @adev: amdgpu_device pointer
3994  *
3995  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3996  * restore things like GPUVM page tables after a GPU reset where
3997  * the contents of VRAM might be lost.
3998  *
3999  * Returns:
4000  * 0 on success, negative error code on failure.
4001  */
4002 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4003 {
4004         struct dma_fence *fence = NULL, *next = NULL;
4005         struct amdgpu_bo *shadow;
4006         long r = 1, tmo;
4007
4008         if (amdgpu_sriov_runtime(adev))
4009                 tmo = msecs_to_jiffies(8000);
4010         else
4011                 tmo = msecs_to_jiffies(100);
4012
4013         dev_info(adev->dev, "recover vram bo from shadow start\n");
4014         mutex_lock(&adev->shadow_list_lock);
4015         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4016
4017                 /* No need to recover an evicted BO */
4018                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4019                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4020                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4021                         continue;
4022
4023                 r = amdgpu_bo_restore_shadow(shadow, &next);
4024                 if (r)
4025                         break;
4026
4027                 if (fence) {
4028                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4029                         dma_fence_put(fence);
4030                         fence = next;
4031                         if (tmo == 0) {
4032                                 r = -ETIMEDOUT;
4033                                 break;
4034                         } else if (tmo < 0) {
4035                                 r = tmo;
4036                                 break;
4037                         }
4038                 } else {
4039                         fence = next;
4040                 }
4041         }
4042         mutex_unlock(&adev->shadow_list_lock);
4043
4044         if (fence)
4045                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4046         dma_fence_put(fence);
4047
4048         if (r < 0 || tmo <= 0) {
4049                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4050                 return -EIO;
4051         }
4052
4053         dev_info(adev->dev, "recover vram bo from shadow done\n");
4054         return 0;
4055 }
4056
4057
4058 /**
4059  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4060  *
4061  * @adev: amdgpu_device pointer
4062  * @from_hypervisor: request from hypervisor
4063  *
4064  * do VF FLR and reinitialize Asic
4065  * return 0 means succeeded otherwise failed
4066  */
4067 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4068                                      bool from_hypervisor)
4069 {
4070         int r;
4071
4072         if (from_hypervisor)
4073                 r = amdgpu_virt_request_full_gpu(adev, true);
4074         else
4075                 r = amdgpu_virt_reset_gpu(adev);
4076         if (r)
4077                 return r;
4078
4079         amdgpu_amdkfd_pre_reset(adev);
4080
4081         /* Resume IP prior to SMC */
4082         r = amdgpu_device_ip_reinit_early_sriov(adev);
4083         if (r)
4084                 goto error;
4085
4086         amdgpu_virt_init_data_exchange(adev);
4087         /* we need recover gart prior to run SMC/CP/SDMA resume */
4088         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4089
4090         r = amdgpu_device_fw_loading(adev);
4091         if (r)
4092                 return r;
4093
4094         /* now we are okay to resume SMC/CP/SDMA */
4095         r = amdgpu_device_ip_reinit_late_sriov(adev);
4096         if (r)
4097                 goto error;
4098
4099         amdgpu_irq_gpu_reset_resume_helper(adev);
4100         r = amdgpu_ib_ring_tests(adev);
4101         amdgpu_amdkfd_post_reset(adev);
4102
4103 error:
4104         amdgpu_virt_release_full_gpu(adev, true);
4105         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4106                 amdgpu_inc_vram_lost(adev);
4107                 r = amdgpu_device_recover_vram(adev);
4108         }
4109
4110         return r;
4111 }
4112
4113 /**
4114  * amdgpu_device_has_job_running - check if there is any job in mirror list
4115  *
4116  * @adev: amdgpu_device pointer
4117  *
4118  * check if there is any job in mirror list
4119  */
4120 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4121 {
4122         int i;
4123         struct drm_sched_job *job;
4124
4125         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4126                 struct amdgpu_ring *ring = adev->rings[i];
4127
4128                 if (!ring || !ring->sched.thread)
4129                         continue;
4130
4131                 spin_lock(&ring->sched.job_list_lock);
4132                 job = list_first_entry_or_null(&ring->sched.ring_mirror_list,
4133                                 struct drm_sched_job, node);
4134                 spin_unlock(&ring->sched.job_list_lock);
4135                 if (job)
4136                         return true;
4137         }
4138         return false;
4139 }
4140
4141 /**
4142  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4143  *
4144  * @adev: amdgpu_device pointer
4145  *
4146  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4147  * a hung GPU.
4148  */
4149 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4150 {
4151         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4152                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4153                 return false;
4154         }
4155
4156         if (amdgpu_gpu_recovery == 0)
4157                 goto disabled;
4158
4159         if (amdgpu_sriov_vf(adev))
4160                 return true;
4161
4162         if (amdgpu_gpu_recovery == -1) {
4163                 switch (adev->asic_type) {
4164                 case CHIP_BONAIRE:
4165                 case CHIP_HAWAII:
4166                 case CHIP_TOPAZ:
4167                 case CHIP_TONGA:
4168                 case CHIP_FIJI:
4169                 case CHIP_POLARIS10:
4170                 case CHIP_POLARIS11:
4171                 case CHIP_POLARIS12:
4172                 case CHIP_VEGAM:
4173                 case CHIP_VEGA20:
4174                 case CHIP_VEGA10:
4175                 case CHIP_VEGA12:
4176                 case CHIP_RAVEN:
4177                 case CHIP_ARCTURUS:
4178                 case CHIP_RENOIR:
4179                 case CHIP_NAVI10:
4180                 case CHIP_NAVI14:
4181                 case CHIP_NAVI12:
4182                 case CHIP_SIENNA_CICHLID:
4183                         break;
4184                 default:
4185                         goto disabled;
4186                 }
4187         }
4188
4189         return true;
4190
4191 disabled:
4192                 dev_info(adev->dev, "GPU recovery disabled.\n");
4193                 return false;
4194 }
4195
4196
4197 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4198                                         struct amdgpu_job *job,
4199                                         bool *need_full_reset_arg)
4200 {
4201         int i, r = 0;
4202         bool need_full_reset  = *need_full_reset_arg;
4203
4204         amdgpu_debugfs_wait_dump(adev);
4205
4206         if (amdgpu_sriov_vf(adev)) {
4207                 /* stop the data exchange thread */
4208                 amdgpu_virt_fini_data_exchange(adev);
4209         }
4210
4211         /* block all schedulers and reset given job's ring */
4212         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4213                 struct amdgpu_ring *ring = adev->rings[i];
4214
4215                 if (!ring || !ring->sched.thread)
4216                         continue;
4217
4218                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4219                 amdgpu_fence_driver_force_completion(ring);
4220         }
4221
4222         if(job)
4223                 drm_sched_increase_karma(&job->base);
4224
4225         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4226         if (!amdgpu_sriov_vf(adev)) {
4227
4228                 if (!need_full_reset)
4229                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4230
4231                 if (!need_full_reset) {
4232                         amdgpu_device_ip_pre_soft_reset(adev);
4233                         r = amdgpu_device_ip_soft_reset(adev);
4234                         amdgpu_device_ip_post_soft_reset(adev);
4235                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4236                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4237                                 need_full_reset = true;
4238                         }
4239                 }
4240
4241                 if (need_full_reset)
4242                         r = amdgpu_device_ip_suspend(adev);
4243
4244                 *need_full_reset_arg = need_full_reset;
4245         }
4246
4247         return r;
4248 }
4249
4250 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4251                                struct list_head *device_list_handle,
4252                                bool *need_full_reset_arg,
4253                                bool skip_hw_reset)
4254 {
4255         struct amdgpu_device *tmp_adev = NULL;
4256         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4257         int r = 0;
4258
4259         /*
4260          * ASIC reset has to be done on all HGMI hive nodes ASAP
4261          * to allow proper links negotiation in FW (within 1 sec)
4262          */
4263         if (!skip_hw_reset && need_full_reset) {
4264                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4265                         /* For XGMI run all resets in parallel to speed up the process */
4266                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4267                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4268                                         r = -EALREADY;
4269                         } else
4270                                 r = amdgpu_asic_reset(tmp_adev);
4271
4272                         if (r) {
4273                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4274                                          r, adev_to_drm(tmp_adev)->unique);
4275                                 break;
4276                         }
4277                 }
4278
4279                 /* For XGMI wait for all resets to complete before proceed */
4280                 if (!r) {
4281                         list_for_each_entry(tmp_adev, device_list_handle,
4282                                             gmc.xgmi.head) {
4283                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4284                                         flush_work(&tmp_adev->xgmi_reset_work);
4285                                         r = tmp_adev->asic_reset_res;
4286                                         if (r)
4287                                                 break;
4288                                 }
4289                         }
4290                 }
4291         }
4292
4293         if (!r && amdgpu_ras_intr_triggered()) {
4294                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4295                         if (tmp_adev->mmhub.funcs &&
4296                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4297                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4298                 }
4299
4300                 amdgpu_ras_intr_cleared();
4301         }
4302
4303         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4304                 if (need_full_reset) {
4305                         /* post card */
4306                         if (amdgpu_device_asic_init(tmp_adev))
4307                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4308
4309                         if (!r) {
4310                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4311                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4312                                 if (r)
4313                                         goto out;
4314
4315                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4316                                 if (vram_lost) {
4317                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4318                                         amdgpu_inc_vram_lost(tmp_adev);
4319                                 }
4320
4321                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4322                                 if (r)
4323                                         goto out;
4324
4325                                 r = amdgpu_device_fw_loading(tmp_adev);
4326                                 if (r)
4327                                         return r;
4328
4329                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4330                                 if (r)
4331                                         goto out;
4332
4333                                 if (vram_lost)
4334                                         amdgpu_device_fill_reset_magic(tmp_adev);
4335
4336                                 /*
4337                                  * Add this ASIC as tracked as reset was already
4338                                  * complete successfully.
4339                                  */
4340                                 amdgpu_register_gpu_instance(tmp_adev);
4341
4342                                 r = amdgpu_device_ip_late_init(tmp_adev);
4343                                 if (r)
4344                                         goto out;
4345
4346                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4347
4348                                 /*
4349                                  * The GPU enters bad state once faulty pages
4350                                  * by ECC has reached the threshold, and ras
4351                                  * recovery is scheduled next. So add one check
4352                                  * here to break recovery if it indeed exceeds
4353                                  * bad page threshold, and remind user to
4354                                  * retire this GPU or setting one bigger
4355                                  * bad_page_threshold value to fix this once
4356                                  * probing driver again.
4357                                  */
4358                                 if (!amdgpu_ras_check_err_threshold(tmp_adev)) {
4359                                         /* must succeed. */
4360                                         amdgpu_ras_resume(tmp_adev);
4361                                 } else {
4362                                         r = -EINVAL;
4363                                         goto out;
4364                                 }
4365
4366                                 /* Update PSP FW topology after reset */
4367                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4368                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4369                         }
4370                 }
4371
4372 out:
4373                 if (!r) {
4374                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4375                         r = amdgpu_ib_ring_tests(tmp_adev);
4376                         if (r) {
4377                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4378                                 r = amdgpu_device_ip_suspend(tmp_adev);
4379                                 need_full_reset = true;
4380                                 r = -EAGAIN;
4381                                 goto end;
4382                         }
4383                 }
4384
4385                 if (!r)
4386                         r = amdgpu_device_recover_vram(tmp_adev);
4387                 else
4388                         tmp_adev->asic_reset_res = r;
4389         }
4390
4391 end:
4392         *need_full_reset_arg = need_full_reset;
4393         return r;
4394 }
4395
4396 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4397                                 struct amdgpu_hive_info *hive)
4398 {
4399         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4400                 return false;
4401
4402         if (hive) {
4403                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4404         } else {
4405                 down_write(&adev->reset_sem);
4406         }
4407
4408         atomic_inc(&adev->gpu_reset_counter);
4409         switch (amdgpu_asic_reset_method(adev)) {
4410         case AMD_RESET_METHOD_MODE1:
4411                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4412                 break;
4413         case AMD_RESET_METHOD_MODE2:
4414                 adev->mp1_state = PP_MP1_STATE_RESET;
4415                 break;
4416         default:
4417                 adev->mp1_state = PP_MP1_STATE_NONE;
4418                 break;
4419         }
4420
4421         return true;
4422 }
4423
4424 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4425 {
4426         amdgpu_vf_error_trans_all(adev);
4427         adev->mp1_state = PP_MP1_STATE_NONE;
4428         atomic_set(&adev->in_gpu_reset, 0);
4429         up_write(&adev->reset_sem);
4430 }
4431
4432 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4433 {
4434         struct pci_dev *p = NULL;
4435
4436         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4437                         adev->pdev->bus->number, 1);
4438         if (p) {
4439                 pm_runtime_enable(&(p->dev));
4440                 pm_runtime_resume(&(p->dev));
4441         }
4442 }
4443
4444 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4445 {
4446         enum amd_reset_method reset_method;
4447         struct pci_dev *p = NULL;
4448         u64 expires;
4449
4450         /*
4451          * For now, only BACO and mode1 reset are confirmed
4452          * to suffer the audio issue without proper suspended.
4453          */
4454         reset_method = amdgpu_asic_reset_method(adev);
4455         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4456              (reset_method != AMD_RESET_METHOD_MODE1))
4457                 return -EINVAL;
4458
4459         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4460                         adev->pdev->bus->number, 1);
4461         if (!p)
4462                 return -ENODEV;
4463
4464         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4465         if (!expires)
4466                 /*
4467                  * If we cannot get the audio device autosuspend delay,
4468                  * a fixed 4S interval will be used. Considering 3S is
4469                  * the audio controller default autosuspend delay setting.
4470                  * 4S used here is guaranteed to cover that.
4471                  */
4472                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4473
4474         while (!pm_runtime_status_suspended(&(p->dev))) {
4475                 if (!pm_runtime_suspend(&(p->dev)))
4476                         break;
4477
4478                 if (expires < ktime_get_mono_fast_ns()) {
4479                         dev_warn(adev->dev, "failed to suspend display audio\n");
4480                         /* TODO: abort the succeeding gpu reset? */
4481                         return -ETIMEDOUT;
4482                 }
4483         }
4484
4485         pm_runtime_disable(&(p->dev));
4486
4487         return 0;
4488 }
4489
4490 /**
4491  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4492  *
4493  * @adev: amdgpu_device pointer
4494  * @job: which job trigger hang
4495  *
4496  * Attempt to reset the GPU if it has hung (all asics).
4497  * Attempt to do soft-reset or full-reset and reinitialize Asic
4498  * Returns 0 for success or an error on failure.
4499  */
4500
4501 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4502                               struct amdgpu_job *job)
4503 {
4504         struct list_head device_list, *device_list_handle =  NULL;
4505         bool need_full_reset = false;
4506         bool job_signaled = false;
4507         struct amdgpu_hive_info *hive = NULL;
4508         struct amdgpu_device *tmp_adev = NULL;
4509         int i, r = 0;
4510         bool need_emergency_restart = false;
4511         bool audio_suspended = false;
4512
4513         /*
4514          * Special case: RAS triggered and full reset isn't supported
4515          */
4516         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4517
4518         /*
4519          * Flush RAM to disk so that after reboot
4520          * the user can read log and see why the system rebooted.
4521          */
4522         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4523                 DRM_WARN("Emergency reboot.");
4524
4525                 ksys_sync_helper();
4526                 emergency_restart();
4527         }
4528
4529         dev_info(adev->dev, "GPU %s begin!\n",
4530                 need_emergency_restart ? "jobs stop":"reset");
4531
4532         /*
4533          * Here we trylock to avoid chain of resets executing from
4534          * either trigger by jobs on different adevs in XGMI hive or jobs on
4535          * different schedulers for same device while this TO handler is running.
4536          * We always reset all schedulers for device and all devices for XGMI
4537          * hive so that should take care of them too.
4538          */
4539         hive = amdgpu_get_xgmi_hive(adev);
4540         if (hive) {
4541                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4542                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4543                                 job ? job->base.id : -1, hive->hive_id);
4544                         amdgpu_put_xgmi_hive(hive);
4545                         return 0;
4546                 }
4547                 mutex_lock(&hive->hive_lock);
4548         }
4549
4550         /*
4551          * Build list of devices to reset.
4552          * In case we are in XGMI hive mode, resort the device list
4553          * to put adev in the 1st position.
4554          */
4555         INIT_LIST_HEAD(&device_list);
4556         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4557                 if (!hive)
4558                         return -ENODEV;
4559                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4560                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4561                 device_list_handle = &hive->device_list;
4562         } else {
4563                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4564                 device_list_handle = &device_list;
4565         }
4566
4567         /* block all schedulers and reset given job's ring */
4568         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4569                 if (!amdgpu_device_lock_adev(tmp_adev, hive)) {
4570                         dev_info(tmp_adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4571                                   job ? job->base.id : -1);
4572                         r = 0;
4573                         goto skip_recovery;
4574                 }
4575
4576                 /*
4577                  * Try to put the audio codec into suspend state
4578                  * before gpu reset started.
4579                  *
4580                  * Due to the power domain of the graphics device
4581                  * is shared with AZ power domain. Without this,
4582                  * we may change the audio hardware from behind
4583                  * the audio driver's back. That will trigger
4584                  * some audio codec errors.
4585                  */
4586                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4587                         audio_suspended = true;
4588
4589                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4590
4591                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4592
4593                 if (!amdgpu_sriov_vf(tmp_adev))
4594                         amdgpu_amdkfd_pre_reset(tmp_adev);
4595
4596                 /*
4597                  * Mark these ASICs to be reseted as untracked first
4598                  * And add them back after reset completed
4599                  */
4600                 amdgpu_unregister_gpu_instance(tmp_adev);
4601
4602                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4603
4604                 /* disable ras on ALL IPs */
4605                 if (!need_emergency_restart &&
4606                       amdgpu_device_ip_need_full_reset(tmp_adev))
4607                         amdgpu_ras_suspend(tmp_adev);
4608
4609                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4610                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4611
4612                         if (!ring || !ring->sched.thread)
4613                                 continue;
4614
4615                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4616
4617                         if (need_emergency_restart)
4618                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4619                 }
4620         }
4621
4622         if (need_emergency_restart)
4623                 goto skip_sched_resume;
4624
4625         /*
4626          * Must check guilty signal here since after this point all old
4627          * HW fences are force signaled.
4628          *
4629          * job->base holds a reference to parent fence
4630          */
4631         if (job && job->base.s_fence->parent &&
4632             dma_fence_is_signaled(job->base.s_fence->parent)) {
4633                 job_signaled = true;
4634                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4635                 goto skip_hw_reset;
4636         }
4637
4638 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4639         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4640                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4641                                                  (tmp_adev == adev) ? job : NULL,
4642                                                  &need_full_reset);
4643                 /*TODO Should we stop ?*/
4644                 if (r) {
4645                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4646                                   r, adev_to_drm(tmp_adev)->unique);
4647                         tmp_adev->asic_reset_res = r;
4648                 }
4649         }
4650
4651         /* Actual ASIC resets if needed.*/
4652         /* TODO Implement XGMI hive reset logic for SRIOV */
4653         if (amdgpu_sriov_vf(adev)) {
4654                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4655                 if (r)
4656                         adev->asic_reset_res = r;
4657         } else {
4658                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4659                 if (r && r == -EAGAIN)
4660                         goto retry;
4661         }
4662
4663 skip_hw_reset:
4664
4665         /* Post ASIC reset for all devs .*/
4666         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4667
4668                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4669                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4670
4671                         if (!ring || !ring->sched.thread)
4672                                 continue;
4673
4674                         /* No point to resubmit jobs if we didn't HW reset*/
4675                         if (!tmp_adev->asic_reset_res && !job_signaled)
4676                                 drm_sched_resubmit_jobs(&ring->sched);
4677
4678                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4679                 }
4680
4681                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4682                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4683                 }
4684
4685                 tmp_adev->asic_reset_res = 0;
4686
4687                 if (r) {
4688                         /* bad news, how to tell it to userspace ? */
4689                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4690                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4691                 } else {
4692                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4693                 }
4694         }
4695
4696 skip_sched_resume:
4697         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4698                 /*unlock kfd: SRIOV would do it separately */
4699                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4700                         amdgpu_amdkfd_post_reset(tmp_adev);
4701                 if (audio_suspended)
4702                         amdgpu_device_resume_display_audio(tmp_adev);
4703                 amdgpu_device_unlock_adev(tmp_adev);
4704         }
4705
4706 skip_recovery:
4707         if (hive) {
4708                 atomic_set(&hive->in_reset, 0);
4709                 mutex_unlock(&hive->hive_lock);
4710                 amdgpu_put_xgmi_hive(hive);
4711         }
4712
4713         if (r)
4714                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4715         return r;
4716 }
4717
4718 /**
4719  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4720  *
4721  * @adev: amdgpu_device pointer
4722  *
4723  * Fetches and stores in the driver the PCIE capabilities (gen speed
4724  * and lanes) of the slot the device is in. Handles APUs and
4725  * virtualized environments where PCIE config space may not be available.
4726  */
4727 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4728 {
4729         struct pci_dev *pdev;
4730         enum pci_bus_speed speed_cap, platform_speed_cap;
4731         enum pcie_link_width platform_link_width;
4732
4733         if (amdgpu_pcie_gen_cap)
4734                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4735
4736         if (amdgpu_pcie_lane_cap)
4737                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4738
4739         /* covers APUs as well */
4740         if (pci_is_root_bus(adev->pdev->bus)) {
4741                 if (adev->pm.pcie_gen_mask == 0)
4742                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4743                 if (adev->pm.pcie_mlw_mask == 0)
4744                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4745                 return;
4746         }
4747
4748         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4749                 return;
4750
4751         pcie_bandwidth_available(adev->pdev, NULL,
4752                                  &platform_speed_cap, &platform_link_width);
4753
4754         if (adev->pm.pcie_gen_mask == 0) {
4755                 /* asic caps */
4756                 pdev = adev->pdev;
4757                 speed_cap = pcie_get_speed_cap(pdev);
4758                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4759                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4760                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4761                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4762                 } else {
4763                         if (speed_cap == PCIE_SPEED_16_0GT)
4764                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4765                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4766                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4767                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4768                         else if (speed_cap == PCIE_SPEED_8_0GT)
4769                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4770                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4771                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4772                         else if (speed_cap == PCIE_SPEED_5_0GT)
4773                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4774                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4775                         else
4776                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4777                 }
4778                 /* platform caps */
4779                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4780                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4781                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4782                 } else {
4783                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4784                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4785                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4786                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4787                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4788                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4789                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4790                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4791                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4792                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4793                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4794                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4795                         else
4796                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4797
4798                 }
4799         }
4800         if (adev->pm.pcie_mlw_mask == 0) {
4801                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4802                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4803                 } else {
4804                         switch (platform_link_width) {
4805                         case PCIE_LNK_X32:
4806                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4807                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4808                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4809                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4810                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4811                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4812                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4813                                 break;
4814                         case PCIE_LNK_X16:
4815                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4816                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4817                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4818                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4819                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4820                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4821                                 break;
4822                         case PCIE_LNK_X12:
4823                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4824                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4825                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4826                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4827                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4828                                 break;
4829                         case PCIE_LNK_X8:
4830                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4831                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4832                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4833                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4834                                 break;
4835                         case PCIE_LNK_X4:
4836                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4837                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4838                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4839                                 break;
4840                         case PCIE_LNK_X2:
4841                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4842                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4843                                 break;
4844                         case PCIE_LNK_X1:
4845                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4846                                 break;
4847                         default:
4848                                 break;
4849                         }
4850                 }
4851         }
4852 }
4853
4854 int amdgpu_device_baco_enter(struct drm_device *dev)
4855 {
4856         struct amdgpu_device *adev = drm_to_adev(dev);
4857         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4858
4859         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4860                 return -ENOTSUPP;
4861
4862         if (ras && ras->supported)
4863                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4864
4865         return amdgpu_dpm_baco_enter(adev);
4866 }
4867
4868 int amdgpu_device_baco_exit(struct drm_device *dev)
4869 {
4870         struct amdgpu_device *adev = drm_to_adev(dev);
4871         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4872         int ret = 0;
4873
4874         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
4875                 return -ENOTSUPP;
4876
4877         ret = amdgpu_dpm_baco_exit(adev);
4878         if (ret)
4879                 return ret;
4880
4881         if (ras && ras->supported)
4882                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4883
4884         return 0;
4885 }
4886
4887 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
4888 {
4889         int i;
4890
4891         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4892                 struct amdgpu_ring *ring = adev->rings[i];
4893
4894                 if (!ring || !ring->sched.thread)
4895                         continue;
4896
4897                 cancel_delayed_work_sync(&ring->sched.work_tdr);
4898         }
4899 }
4900
4901 /**
4902  * amdgpu_pci_error_detected - Called when a PCI error is detected.
4903  * @pdev: PCI device struct
4904  * @state: PCI channel state
4905  *
4906  * Description: Called when a PCI error is detected.
4907  *
4908  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
4909  */
4910 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
4911 {
4912         struct drm_device *dev = pci_get_drvdata(pdev);
4913         struct amdgpu_device *adev = drm_to_adev(dev);
4914         int i;
4915
4916         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
4917
4918         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4919                 DRM_WARN("No support for XGMI hive yet...");
4920                 return PCI_ERS_RESULT_DISCONNECT;
4921         }
4922
4923         switch (state) {
4924         case pci_channel_io_normal:
4925                 return PCI_ERS_RESULT_CAN_RECOVER;
4926         /* Fatal error, prepare for slot reset */
4927         case pci_channel_io_frozen:
4928                 /*
4929                  * Cancel and wait for all TDRs in progress if failing to
4930                  * set  adev->in_gpu_reset in amdgpu_device_lock_adev
4931                  *
4932                  * Locking adev->reset_sem will prevent any external access
4933                  * to GPU during PCI error recovery
4934                  */
4935                 while (!amdgpu_device_lock_adev(adev, NULL))
4936                         amdgpu_cancel_all_tdr(adev);
4937
4938                 /*
4939                  * Block any work scheduling as we do for regular GPU reset
4940                  * for the duration of the recovery
4941                  */
4942                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4943                         struct amdgpu_ring *ring = adev->rings[i];
4944
4945                         if (!ring || !ring->sched.thread)
4946                                 continue;
4947
4948                         drm_sched_stop(&ring->sched, NULL);
4949                 }
4950                 return PCI_ERS_RESULT_NEED_RESET;
4951         case pci_channel_io_perm_failure:
4952                 /* Permanent error, prepare for device removal */
4953                 return PCI_ERS_RESULT_DISCONNECT;
4954         }
4955
4956         return PCI_ERS_RESULT_NEED_RESET;
4957 }
4958
4959 /**
4960  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
4961  * @pdev: pointer to PCI device
4962  */
4963 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
4964 {
4965
4966         DRM_INFO("PCI error: mmio enabled callback!!\n");
4967
4968         /* TODO - dump whatever for debugging purposes */
4969
4970         /* This called only if amdgpu_pci_error_detected returns
4971          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
4972          * works, no need to reset slot.
4973          */
4974
4975         return PCI_ERS_RESULT_RECOVERED;
4976 }
4977
4978 /**
4979  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
4980  * @pdev: PCI device struct
4981  *
4982  * Description: This routine is called by the pci error recovery
4983  * code after the PCI slot has been reset, just before we
4984  * should resume normal operations.
4985  */
4986 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
4987 {
4988         struct drm_device *dev = pci_get_drvdata(pdev);
4989         struct amdgpu_device *adev = drm_to_adev(dev);
4990         int r, i;
4991         bool need_full_reset = true;
4992         u32 memsize;
4993         struct list_head device_list;
4994
4995         DRM_INFO("PCI error: slot reset callback!!\n");
4996
4997         INIT_LIST_HEAD(&device_list);
4998         list_add_tail(&adev->gmc.xgmi.head, &device_list);
4999
5000         /* wait for asic to come out of reset */
5001         msleep(500);
5002
5003         /* Restore PCI confspace */
5004         amdgpu_device_load_pci_state(pdev);
5005
5006         /* confirm  ASIC came out of reset */
5007         for (i = 0; i < adev->usec_timeout; i++) {
5008                 memsize = amdgpu_asic_get_config_memsize(adev);
5009
5010                 if (memsize != 0xffffffff)
5011                         break;
5012                 udelay(1);
5013         }
5014         if (memsize == 0xffffffff) {
5015                 r = -ETIME;
5016                 goto out;
5017         }
5018
5019         adev->in_pci_err_recovery = true;
5020         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5021         adev->in_pci_err_recovery = false;
5022         if (r)
5023                 goto out;
5024
5025         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5026
5027 out:
5028         if (!r) {
5029                 if (amdgpu_device_cache_pci_state(adev->pdev))
5030                         pci_restore_state(adev->pdev);
5031
5032                 DRM_INFO("PCIe error recovery succeeded\n");
5033         } else {
5034                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5035                 amdgpu_device_unlock_adev(adev);
5036         }
5037
5038         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5039 }
5040
5041 /**
5042  * amdgpu_pci_resume() - resume normal ops after PCI reset
5043  * @pdev: pointer to PCI device
5044  *
5045  * Called when the error recovery driver tells us that its
5046  * OK to resume normal operation. Use completion to allow
5047  * halted scsi ops to resume.
5048  */
5049 void amdgpu_pci_resume(struct pci_dev *pdev)
5050 {
5051         struct drm_device *dev = pci_get_drvdata(pdev);
5052         struct amdgpu_device *adev = drm_to_adev(dev);
5053         int i;
5054
5055
5056         DRM_INFO("PCI error: resume callback!!\n");
5057
5058         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5059                 struct amdgpu_ring *ring = adev->rings[i];
5060
5061                 if (!ring || !ring->sched.thread)
5062                         continue;
5063
5064
5065                 drm_sched_resubmit_jobs(&ring->sched);
5066                 drm_sched_start(&ring->sched, true);
5067         }
5068
5069         amdgpu_device_unlock_adev(adev);
5070 }
5071
5072 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5073 {
5074         struct drm_device *dev = pci_get_drvdata(pdev);
5075         struct amdgpu_device *adev = drm_to_adev(dev);
5076         int r;
5077
5078         r = pci_save_state(pdev);
5079         if (!r) {
5080                 kfree(adev->pci_state);
5081
5082                 adev->pci_state = pci_store_saved_state(pdev);
5083
5084                 if (!adev->pci_state) {
5085                         DRM_ERROR("Failed to store PCI saved state");
5086                         return false;
5087                 }
5088         } else {
5089                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5090                 return false;
5091         }
5092
5093         return true;
5094 }
5095
5096 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5097 {
5098         struct drm_device *dev = pci_get_drvdata(pdev);
5099         struct amdgpu_device *adev = drm_to_adev(dev);
5100         int r;
5101
5102         if (!adev->pci_state)
5103                 return false;
5104
5105         r = pci_load_saved_state(pdev, adev->pci_state);
5106
5107         if (!r) {
5108                 pci_restore_state(pdev);
5109         } else {
5110                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5111                 return false;
5112         }
5113
5114         return true;
5115 }
5116
5117