1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/vangogh_gpu_info.bin");
84
85 #define AMDGPU_RESUME_MS                2000
86
87 const char *amdgpu_asic_name[] = {
88         "TAHITI",
89         "PITCAIRN",
90         "VERDE",
91         "OLAND",
92         "HAINAN",
93         "BONAIRE",
94         "KAVERI",
95         "KABINI",
96         "HAWAII",
97         "MULLINS",
98         "TOPAZ",
99         "TONGA",
100         "FIJI",
101         "CARRIZO",
102         "STONEY",
103         "POLARIS10",
104         "POLARIS11",
105         "POLARIS12",
106         "VEGAM",
107         "VEGA10",
108         "VEGA12",
109         "VEGA20",
110         "RAVEN",
111         "ARCTURUS",
112         "RENOIR",
113         "ALDEBARAN",
114         "NAVI10",
115         "NAVI14",
116         "NAVI12",
117         "SIENNA_CICHLID",
118         "NAVY_FLOUNDER",
119         "VANGOGH",
120         "DIMGREY_CAVEFISH",
121         "LAST",
122 };
123
124 /**
125  * DOC: pcie_replay_count
126  *
127  * The amdgpu driver provides a sysfs API for reporting the total number
128  * of PCIe replays (NAKs)
129  * The file pcie_replay_count is used for this and returns the total
130  * number of replays as a sum of the NAKs generated and NAKs received
131  */
132
133 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
134                 struct device_attribute *attr, char *buf)
135 {
136         struct drm_device *ddev = dev_get_drvdata(dev);
137         struct amdgpu_device *adev = drm_to_adev(ddev);
138         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
139
140         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
141 }
142
143 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
144                 amdgpu_device_get_pcie_replay_count, NULL);
145
146 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
147
148 /**
149  * DOC: product_name
150  *
151  * The amdgpu driver provides a sysfs API for reporting the product name
152  * for the device
153  * The file product_name is used for this and returns the product name
154  * as returned from the FRU.
155  * NOTE: This is only available for certain server cards
156  */
157
158 static ssize_t amdgpu_device_get_product_name(struct device *dev,
159                 struct device_attribute *attr, char *buf)
160 {
161         struct drm_device *ddev = dev_get_drvdata(dev);
162         struct amdgpu_device *adev = drm_to_adev(ddev);
163
164         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
165 }
166
167 static DEVICE_ATTR(product_name, S_IRUGO,
168                 amdgpu_device_get_product_name, NULL);
169
170 /**
171  * DOC: product_number
172  *
173  * The amdgpu driver provides a sysfs API for reporting the part number
174  * for the device
175  * The file product_number is used for this and returns the part number
176  * as returned from the FRU.
177  * NOTE: This is only available for certain server cards
178  */
179
180 static ssize_t amdgpu_device_get_product_number(struct device *dev,
181                 struct device_attribute *attr, char *buf)
182 {
183         struct drm_device *ddev = dev_get_drvdata(dev);
184         struct amdgpu_device *adev = drm_to_adev(ddev);
185
186         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
187 }
188
189 static DEVICE_ATTR(product_number, S_IRUGO,
190                 amdgpu_device_get_product_number, NULL);
191
192 /**
193  * DOC: serial_number
194  *
195  * The amdgpu driver provides a sysfs API for reporting the serial number
196  * for the device
197  * The file serial_number is used for this and returns the serial number
198  * as returned from the FRU.
199  * NOTE: This is only available for certain server cards
200  */
201
202 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
203                 struct device_attribute *attr, char *buf)
204 {
205         struct drm_device *ddev = dev_get_drvdata(dev);
206         struct amdgpu_device *adev = drm_to_adev(ddev);
207
208         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
209 }
210
211 static DEVICE_ATTR(serial_number, S_IRUGO,
212                 amdgpu_device_get_serial_number, NULL);
213
214 /**
215  * amdgpu_device_supports_atpx - Is the device a dGPU with HG/PX power control
216  *
217  * @dev: drm_device pointer
218  *
219  * Returns true if the device is a dGPU with HG/PX power control,
220  * otherwise return false.
221  */
222 bool amdgpu_device_supports_atpx(struct drm_device *dev)
223 {
224         struct amdgpu_device *adev = drm_to_adev(dev);
225
226         if (adev->flags & AMD_IS_PX)
227                 return true;
228         return false;
229 }
230
231 /**
232  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
233  *
234  * @dev: drm_device pointer
235  *
236  * Returns true if the device is a dGPU with ACPI power control,
237  * otherwise return false.
238  */
239 bool amdgpu_device_supports_boco(struct drm_device *dev)
240 {
241         struct amdgpu_device *adev = drm_to_adev(dev);
242
243         if (adev->has_pr3)
244                 return true;
245         return false;
246 }
247
248 /**
249  * amdgpu_device_supports_baco - Does the device support BACO
250  *
251  * @dev: drm_device pointer
252  *
253  * Returns true if the device supports BACO,
254  * otherwise return false.
255  */
256 bool amdgpu_device_supports_baco(struct drm_device *dev)
257 {
258         struct amdgpu_device *adev = drm_to_adev(dev);
259
260         return amdgpu_asic_supports_baco(adev);
261 }
262
263 /*
264  * VRAM access helper functions
265  */
266
267 /**
268  * amdgpu_device_vram_access - read/write a buffer in vram
269  *
270  * @adev: amdgpu_device pointer
271  * @pos: offset of the buffer in vram
272  * @buf: virtual address of the buffer in system memory
273  * @size: read/write size in bytes, sizeof(@buf) must be >= @size
274  * @write: true - write to vram, otherwise - read from vram
275  */
276 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
277                                uint32_t *buf, size_t size, bool write)
278 {
279         unsigned long flags;
280         uint32_t hi = ~0;
281         uint64_t last;
282
283
284 #ifdef CONFIG_64BIT
285         last = min(pos + size, adev->gmc.visible_vram_size);
286         if (last > pos) {
287                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
288                 size_t count = last - pos;
289
290                 if (write) {
291                         memcpy_toio(addr, buf, count);
292                         mb();
293                         amdgpu_asic_flush_hdp(adev, NULL);
294                 } else {
295                         amdgpu_asic_invalidate_hdp(adev, NULL);
296                         mb();
297                         memcpy_fromio(buf, addr, count);
298                 }
299
300                 if (count == size)
301                         return;
302
303                 pos += count;
304                 buf += count / 4;
305                 size -= count;
306         }
307 #endif
308
309         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
310         for (last = pos + size; pos < last; pos += 4) {
311                 uint32_t tmp = pos >> 31;
312
313                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
314                 if (tmp != hi) {
315                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
316                         hi = tmp;
317                 }
318                 if (write)
319                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
320                 else
321                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
322         }
323         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
324 }
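
/*
 * Illustrative sketch, not part of the original file: how a driver-internal
 * caller might use amdgpu_device_vram_access() above to read and patch a few
 * dwords of VRAM.  The offset and length are arbitrary demonstration values.
 */
static void __maybe_unused amdgpu_device_vram_access_example(struct amdgpu_device *adev)
{
        uint32_t data[4] = { 0 };

        /* read 16 bytes starting at VRAM offset 0 into data[] */
        amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);

        /* patch the first dword and write the block back */
        data[0] |= 0x1;
        amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);
}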
325
326 /*
327  * register access helper functions.
328  */
329 /**
330  * amdgpu_device_rreg - read a memory mapped IO or indirect register
331  *
332  * @adev: amdgpu_device pointer
333  * @reg: dword aligned register offset
334  * @acc_flags: access flags which require special behavior
335  *
336  * Returns the 32 bit value from the offset specified.
337  */
338 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
339                             uint32_t reg, uint32_t acc_flags)
340 {
341         uint32_t ret;
342
343         if (adev->in_pci_err_recovery)
344                 return 0;
345
346         if ((reg * 4) < adev->rmmio_size) {
347                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
348                     amdgpu_sriov_runtime(adev) &&
349                     down_read_trylock(&adev->reset_sem)) {
350                         ret = amdgpu_kiq_rreg(adev, reg);
351                         up_read(&adev->reset_sem);
352                 } else {
353                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
354                 }
355         } else {
356                 ret = adev->pcie_rreg(adev, reg * 4);
357         }
358
359         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
360
361         return ret;
362 }
363
364 /*
365  * MMIO register byte read helper function
366  * @offset: byte offset from MMIO start
367  *
368 */
369
370 /**
371  * amdgpu_mm_rreg8 - read a memory mapped IO register
372  *
373  * @adev: amdgpu_device pointer
374  * @offset: byte aligned register offset
375  *
376  * Returns the 8 bit value from the offset specified.
377  */
378 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
379 {
380         if (adev->in_pci_err_recovery)
381                 return 0;
382
383         if (offset < adev->rmmio_size)
384                 return (readb(adev->rmmio + offset));
385         BUG();
386 }
387
388 /*
389  * MMIO register byte write helper function
390  * @offset: byte offset from MMIO start
391  * @value: the value to be written to the register
392  *
393 */
394 /**
395  * amdgpu_mm_wreg8 - write a memory mapped IO register
396  *
397  * @adev: amdgpu_device pointer
398  * @offset: byte aligned register offset
399  * @value: 8 bit value to write
400  *
401  * Writes the value specified to the offset specified.
402  */
403 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
404 {
405         if (adev->in_pci_err_recovery)
406                 return;
407
408         if (offset < adev->rmmio_size)
409                 writeb(value, adev->rmmio + offset);
410         else
411                 BUG();
412 }
413
414 /**
415  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
416  *
417  * @adev: amdgpu_device pointer
418  * @reg: dword aligned register offset
419  * @v: 32 bit value to write to the register
420  * @acc_flags: access flags which require special behavior
421  *
422  * Writes the value specified to the offset specified.
423  */
424 void amdgpu_device_wreg(struct amdgpu_device *adev,
425                         uint32_t reg, uint32_t v,
426                         uint32_t acc_flags)
427 {
428         if (adev->in_pci_err_recovery)
429                 return;
430
431         if ((reg * 4) < adev->rmmio_size) {
432                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
433                     amdgpu_sriov_runtime(adev) &&
434                     down_read_trylock(&adev->reset_sem)) {
435                         amdgpu_kiq_wreg(adev, reg, v);
436                         up_read(&adev->reset_sem);
437                 } else {
438                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
439                 }
440         } else {
441                 adev->pcie_wreg(adev, reg * 4, v);
442         }
443
444         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
445 }
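
/*
 * Illustrative sketch, not part of the original file: a read-modify-write of a
 * dword register through the helpers above.  @reg is a hypothetical register
 * offset; most callers go through the RREG32()/WREG32() wrappers instead of
 * calling these helpers directly.
 */
static void __maybe_unused amdgpu_device_rmw_example(struct amdgpu_device *adev,
                                                     uint32_t reg)
{
        uint32_t v = amdgpu_device_rreg(adev, reg, AMDGPU_REGS_NO_KIQ);

        v |= 0x1;       /* set an arbitrary bit for demonstration */
        amdgpu_device_wreg(adev, reg, v, AMDGPU_REGS_NO_KIQ);
}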
446
447 /*
448  * amdgpu_mm_wreg_mmio_rlc - write a register either via MMIO or via the RLC path if it is in range
449  *
450  * this function is invoked only for debugfs register access
451  */
452 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
453                              uint32_t reg, uint32_t v)
454 {
455         if (adev->in_pci_err_recovery)
456                 return;
457
458         if (amdgpu_sriov_fullaccess(adev) &&
459             adev->gfx.rlc.funcs &&
460             adev->gfx.rlc.funcs->is_rlcg_access_range) {
461                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
462                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
463         } else {
464                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
465         }
466 }
467
468 /**
469  * amdgpu_io_rreg - read an IO register
470  *
471  * @adev: amdgpu_device pointer
472  * @reg: dword aligned register offset
473  *
474  * Returns the 32 bit value from the offset specified.
475  */
476 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
477 {
478         if (adev->in_pci_err_recovery)
479                 return 0;
480
481         if ((reg * 4) < adev->rio_mem_size)
482                 return ioread32(adev->rio_mem + (reg * 4));
483         else {
484                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
485                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
486         }
487 }
488
489 /**
490  * amdgpu_io_wreg - write to an IO register
491  *
492  * @adev: amdgpu_device pointer
493  * @reg: dword aligned register offset
494  * @v: 32 bit value to write to the register
495  *
496  * Writes the value specified to the offset specified.
497  */
498 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
499 {
500         if (adev->in_pci_err_recovery)
501                 return;
502
503         if ((reg * 4) < adev->rio_mem_size)
504                 iowrite32(v, adev->rio_mem + (reg * 4));
505         else {
506                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
507                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
508         }
509 }
510
511 /**
512  * amdgpu_mm_rdoorbell - read a doorbell dword
513  *
514  * @adev: amdgpu_device pointer
515  * @index: doorbell index
516  *
517  * Returns the value in the doorbell aperture at the
518  * requested doorbell index (CIK).
519  */
520 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
521 {
522         if (adev->in_pci_err_recovery)
523                 return 0;
524
525         if (index < adev->doorbell.num_doorbells) {
526                 return readl(adev->doorbell.ptr + index);
527         } else {
528                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
529                 return 0;
530         }
531 }
532
533 /**
534  * amdgpu_mm_wdoorbell - write a doorbell dword
535  *
536  * @adev: amdgpu_device pointer
537  * @index: doorbell index
538  * @v: value to write
539  *
540  * Writes @v to the doorbell aperture at the
541  * requested doorbell index (CIK).
542  */
543 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
544 {
545         if (adev->in_pci_err_recovery)
546                 return;
547
548         if (index < adev->doorbell.num_doorbells) {
549                 writel(v, adev->doorbell.ptr + index);
550         } else {
551                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
552         }
553 }
554
555 /**
556  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
557  *
558  * @adev: amdgpu_device pointer
559  * @index: doorbell index
560  *
561  * Returns the value in the doorbell aperture at the
562  * requested doorbell index (VEGA10+).
563  */
564 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
565 {
566         if (adev->in_pci_err_recovery)
567                 return 0;
568
569         if (index < adev->doorbell.num_doorbells) {
570                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
571         } else {
572                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
573                 return 0;
574         }
575 }
576
577 /**
578  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
579  *
580  * @adev: amdgpu_device pointer
581  * @index: doorbell index
582  * @v: value to write
583  *
584  * Writes @v to the doorbell aperture at the
585  * requested doorbell index (VEGA10+).
586  */
587 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
588 {
589         if (adev->in_pci_err_recovery)
590                 return;
591
592         if (index < adev->doorbell.num_doorbells) {
593                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
594         } else {
595                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
596         }
597 }
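
/*
 * Illustrative sketch, not part of the original file: ring code typically
 * kicks the hardware by writing the new write pointer to the doorbell index
 * assigned to the ring.  The index and pointer below are made-up values.
 */
static void __maybe_unused amdgpu_mm_doorbell_example(struct amdgpu_device *adev)
{
        u32 index = 0;          /* hypothetical doorbell index */
        u64 wptr = 0x40;        /* hypothetical 64-bit write pointer */

        if (index < adev->doorbell.num_doorbells)
                amdgpu_mm_wdoorbell64(adev, index, wptr);
}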
598
599 /**
600  * amdgpu_device_indirect_rreg - read an indirect register
601  *
602  * @adev: amdgpu_device pointer
603  * @pcie_index: mmio register offset
604  * @pcie_data: mmio register offset
605  * @reg_addr: indirect register address to read from
606  *
607  * Returns the value of indirect register @reg_addr
608  */
609 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
610                                 u32 pcie_index, u32 pcie_data,
611                                 u32 reg_addr)
612 {
613         unsigned long flags;
614         u32 r;
615         void __iomem *pcie_index_offset;
616         void __iomem *pcie_data_offset;
617
618         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
619         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
620         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
621
622         writel(reg_addr, pcie_index_offset);
623         readl(pcie_index_offset);
624         r = readl(pcie_data_offset);
625         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
626
627         return r;
628 }
629
630 /**
631  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
632  *
633  * @adev: amdgpu_device pointer
634  * @pcie_index: mmio register offset
635  * @pcie_data: mmio register offset
636  * @reg_addr: indirect register address to read from
637  *
638  * Returns the value of indirect register @reg_addr
639  */
640 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
641                                   u32 pcie_index, u32 pcie_data,
642                                   u32 reg_addr)
643 {
644         unsigned long flags;
645         u64 r;
646         void __iomem *pcie_index_offset;
647         void __iomem *pcie_data_offset;
648
649         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
650         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
651         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
652
653         /* read low 32 bits */
654         writel(reg_addr, pcie_index_offset);
655         readl(pcie_index_offset);
656         r = readl(pcie_data_offset);
657         /* read high 32 bits */
658         writel(reg_addr + 4, pcie_index_offset);
659         readl(pcie_index_offset);
660         r |= ((u64)readl(pcie_data_offset) << 32);
661         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
662
663         return r;
664 }
665
666 /**
667  * amdgpu_device_indirect_wreg - write an indirect register address
668  *
669  * @adev: amdgpu_device pointer
670  * @pcie_index: mmio register offset
671  * @pcie_data: mmio register offset
672  * @reg_addr: indirect register offset
673  * @reg_data: indirect register data
674  *
675  */
676 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
677                                  u32 pcie_index, u32 pcie_data,
678                                  u32 reg_addr, u32 reg_data)
679 {
680         unsigned long flags;
681         void __iomem *pcie_index_offset;
682         void __iomem *pcie_data_offset;
683
684         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
685         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
686         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
687
688         writel(reg_addr, pcie_index_offset);
689         readl(pcie_index_offset);
690         writel(reg_data, pcie_data_offset);
691         readl(pcie_data_offset);
692         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
693 }
694
695 /**
696  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
697  *
698  * @adev: amdgpu_device pointer
699  * @pcie_index: mmio register offset
700  * @pcie_data: mmio register offset
701  * @reg_addr: indirect register offset
702  * @reg_data: indirect register data
703  *
704  */
705 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
706                                    u32 pcie_index, u32 pcie_data,
707                                    u32 reg_addr, u64 reg_data)
708 {
709         unsigned long flags;
710         void __iomem *pcie_index_offset;
711         void __iomem *pcie_data_offset;
712
713         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
714         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
715         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
716
717         /* write low 32 bits */
718         writel(reg_addr, pcie_index_offset);
719         readl(pcie_index_offset);
720         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
721         readl(pcie_data_offset);
722         /* write high 32 bits */
723         writel(reg_addr + 4, pcie_index_offset);
724         readl(pcie_index_offset);
725         writel((u32)(reg_data >> 32), pcie_data_offset);
726         readl(pcie_data_offset);
727         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
728 }
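
/*
 * Illustrative sketch, not part of the original file: asic code usually wraps
 * the indirect helpers with the PCIE index/data register offsets reported by
 * the NBIO block, roughly as below (modelled on the soc15/nv pcie_rreg
 * callbacks).
 */
static u32 __maybe_unused amdgpu_device_pcie_rreg_example(struct amdgpu_device *adev,
                                                          u32 reg)
{
        u32 pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
        u32 pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);

        return amdgpu_device_indirect_rreg(adev, pcie_index, pcie_data, reg);
}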
729
730 /**
731  * amdgpu_invalid_rreg - dummy reg read function
732  *
733  * @adev: amdgpu_device pointer
734  * @reg: offset of register
735  *
736  * Dummy register read function.  Used for register blocks
737  * that certain asics don't have (all asics).
738  * Returns the value in the register.
739  */
740 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
741 {
742         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
743         BUG();
744         return 0;
745 }
746
747 /**
748  * amdgpu_invalid_wreg - dummy reg write function
749  *
750  * @adev: amdgpu_device pointer
751  * @reg: offset of register
752  * @v: value to write to the register
753  *
754  * Dummy register write function.  Used for register blocks
755  * that certain asics don't have (all asics).
756  */
757 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
758 {
759         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
760                   reg, v);
761         BUG();
762 }
763
764 /**
765  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
766  *
767  * @adev: amdgpu_device pointer
768  * @reg: offset of register
769  *
770  * Dummy register read function.  Used for register blocks
771  * that certain asics don't have (all asics).
772  * Returns the value in the register.
773  */
774 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
775 {
776         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
777         BUG();
778         return 0;
779 }
780
781 /**
782  * amdgpu_invalid_wreg64 - dummy reg write function
783  *
784  * @adev: amdgpu_device pointer
785  * @reg: offset of register
786  * @v: value to write to the register
787  *
788  * Dummy register write function.  Used for register blocks
789  * that certain asics don't have (all asics).
790  */
791 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
792 {
793         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
794                   reg, v);
795         BUG();
796 }
797
798 /**
799  * amdgpu_block_invalid_rreg - dummy reg read function
800  *
801  * @adev: amdgpu_device pointer
802  * @block: offset of instance
803  * @reg: offset of register
804  *
805  * Dummy register read function.  Used for register blocks
806  * that certain asics don't have (all asics).
807  * Returns the value in the register.
808  */
809 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
810                                           uint32_t block, uint32_t reg)
811 {
812         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
813                   reg, block);
814         BUG();
815         return 0;
816 }
817
818 /**
819  * amdgpu_block_invalid_wreg - dummy reg write function
820  *
821  * @adev: amdgpu_device pointer
822  * @block: offset of instance
823  * @reg: offset of register
824  * @v: value to write to the register
825  *
826  * Dummy register write function.  Used for register blocks
827  * that certain asics don't have (all asics).
828  */
829 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
830                                       uint32_t block,
831                                       uint32_t reg, uint32_t v)
832 {
833         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
834                   reg, block, v);
835         BUG();
836 }
837
838 /**
839  * amdgpu_device_asic_init - Wrapper for atom asic_init
840  *
841  * @adev: amdgpu_device pointer
842  *
843  * Does any asic specific work and then calls atom asic init.
844  */
845 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
846 {
847         amdgpu_asic_pre_asic_init(adev);
848
849         return amdgpu_atom_asic_init(adev->mode_info.atom_context);
850 }
851
852 /**
853  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
854  *
855  * @adev: amdgpu_device pointer
856  *
857  * Allocates a scratch page of VRAM for use by various things in the
858  * driver.
859  */
860 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
861 {
862         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
863                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
864                                        &adev->vram_scratch.robj,
865                                        &adev->vram_scratch.gpu_addr,
866                                        (void **)&adev->vram_scratch.ptr);
867 }
868
869 /**
870  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
871  *
872  * @adev: amdgpu_device pointer
873  *
874  * Frees the VRAM scratch page.
875  */
876 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
877 {
878         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
879 }
880
881 /**
882  * amdgpu_device_program_register_sequence - program an array of registers.
883  *
884  * @adev: amdgpu_device pointer
885  * @registers: pointer to the register array
886  * @array_size: size of the register array
887  *
888  * Programs an array of registers with AND and OR masks.
889  * This is a helper for setting golden registers.
890  */
891 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
892                                              const u32 *registers,
893                                              const u32 array_size)
894 {
895         u32 tmp, reg, and_mask, or_mask;
896         int i;
897
898         if (array_size % 3)
899                 return;
900
901         for (i = 0; i < array_size; i += 3) {
902                 reg = registers[i + 0];
903                 and_mask = registers[i + 1];
904                 or_mask = registers[i + 2];
905
906                 if (and_mask == 0xffffffff) {
907                         tmp = or_mask;
908                 } else {
909                         tmp = RREG32(reg);
910                         tmp &= ~and_mask;
911                         if (adev->family >= AMDGPU_FAMILY_AI)
912                                 tmp |= (or_mask & and_mask);
913                         else
914                                 tmp |= or_mask;
915                 }
916                 WREG32(reg, tmp);
917         }
918 }
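
/*
 * Illustrative sketch, not part of the original file: golden register tables
 * passed to amdgpu_device_program_register_sequence() are flat arrays of
 * {offset, and_mask, or_mask} triplets.  The offsets and masks below are
 * placeholders; real tables live in the asic-specific files.
 */
static const u32 example_golden_settings[] __maybe_unused = {
        /* reg offset   and_mask        or_mask */
        0x0000d030,     0xffffffff,     0x00000100,     /* whole register replaced */
        0x0000d02c,     0x0000000f,     0x00000002,     /* only the masked bits updated */
};

/*
 * It would then be programmed with:
 *      amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *                                              ARRAY_SIZE(example_golden_settings));
 */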
919
920 /**
921  * amdgpu_device_pci_config_reset - reset the GPU
922  *
923  * @adev: amdgpu_device pointer
924  *
925  * Resets the GPU using the pci config reset sequence.
926  * Only applicable to asics prior to vega10.
927  */
928 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
929 {
930         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
931 }
932
933 /**
934  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
935  *
936  * @adev: amdgpu_device pointer
937  *
938  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
939  */
940 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
941 {
942         return pci_reset_function(adev->pdev);
943 }
944
945 /*
946  * GPU doorbell aperture helpers function.
947  */
948 /**
949  * amdgpu_device_doorbell_init - Init doorbell driver information.
950  *
951  * @adev: amdgpu_device pointer
952  *
953  * Init doorbell driver information (CIK)
954  * Returns 0 on success, error on failure.
955  */
956 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
957 {
958
959         /* No doorbell on SI hardware generation */
960         if (adev->asic_type < CHIP_BONAIRE) {
961                 adev->doorbell.base = 0;
962                 adev->doorbell.size = 0;
963                 adev->doorbell.num_doorbells = 0;
964                 adev->doorbell.ptr = NULL;
965                 return 0;
966         }
967
968         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
969                 return -EINVAL;
970
971         amdgpu_asic_init_doorbell_index(adev);
972
973         /* doorbell bar mapping */
974         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
975         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
976
977         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
978                                              adev->doorbell_index.max_assignment+1);
979         if (adev->doorbell.num_doorbells == 0)
980                 return -EINVAL;
981
982         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
983          * paging queue doorbells use the second page. The
984          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
985          * doorbells are in the first page. So with paging queue enabled,
986          * the max num_doorbells needs one extra page (0x400 dwords).
987          */
988         if (adev->asic_type >= CHIP_VEGA10)
989                 adev->doorbell.num_doorbells += 0x400;
990
991         adev->doorbell.ptr = ioremap(adev->doorbell.base,
992                                      adev->doorbell.num_doorbells *
993                                      sizeof(u32));
994         if (adev->doorbell.ptr == NULL)
995                 return -ENOMEM;
996
997         return 0;
998 }
999
1000 /**
1001  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
1002  *
1003  * @adev: amdgpu_device pointer
1004  *
1005  * Tear down doorbell driver information (CIK)
1006  */
1007 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
1008 {
1009         iounmap(adev->doorbell.ptr);
1010         adev->doorbell.ptr = NULL;
1011 }
1012
1013
1014
1015 /*
1016  * amdgpu_device_wb_*()
1017  * Writeback is the method by which the GPU updates special pages in memory
1018  * with the status of certain GPU events (fences, ring pointers, etc.).
1019  */
1020
1021 /**
1022  * amdgpu_device_wb_fini - Disable Writeback and free memory
1023  *
1024  * @adev: amdgpu_device pointer
1025  *
1026  * Disables Writeback and frees the Writeback memory (all asics).
1027  * Used at driver shutdown.
1028  */
1029 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1030 {
1031         if (adev->wb.wb_obj) {
1032                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1033                                       &adev->wb.gpu_addr,
1034                                       (void **)&adev->wb.wb);
1035                 adev->wb.wb_obj = NULL;
1036         }
1037 }
1038
1039 /**
1040  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
1041  *
1042  * @adev: amdgpu_device pointer
1043  *
1044  * Initializes writeback and allocates writeback memory (all asics).
1045  * Used at driver startup.
1046  * Returns 0 on success or a negative error code on failure.
1047  */
1048 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1049 {
1050         int r;
1051
1052         if (adev->wb.wb_obj == NULL) {
1053                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1054                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1055                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1056                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1057                                             (void **)&adev->wb.wb);
1058                 if (r) {
1059                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1060                         return r;
1061                 }
1062
1063                 adev->wb.num_wb = AMDGPU_MAX_WB;
1064                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1065
1066                 /* clear wb memory */
1067                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1068         }
1069
1070         return 0;
1071 }
1072
1073 /**
1074  * amdgpu_device_wb_get - Allocate a wb entry
1075  *
1076  * @adev: amdgpu_device pointer
1077  * @wb: wb index
1078  *
1079  * Allocate a wb slot for use by the driver (all asics).
1080  * Returns 0 on success or -EINVAL on failure.
1081  */
1082 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1083 {
1084         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1085
1086         if (offset < adev->wb.num_wb) {
1087                 __set_bit(offset, adev->wb.used);
1088                 *wb = offset << 3; /* convert to dw offset */
1089                 return 0;
1090         } else {
1091                 return -EINVAL;
1092         }
1093 }
1094
1095 /**
1096  * amdgpu_device_wb_free - Free a wb entry
1097  *
1098  * @adev: amdgpu_device pointer
1099  * @wb: wb index
1100  *
1101  * Free a wb slot allocated for use by the driver (all asics)
1102  */
1103 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1104 {
1105         wb >>= 3;
1106         if (wb < adev->wb.num_wb)
1107                 __clear_bit(wb, adev->wb.used);
1108 }
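
/*
 * Illustrative sketch, not part of the original file: typical lifetime of a
 * writeback slot.  The caller keeps the dword offset returned by
 * amdgpu_device_wb_get() and derives CPU and GPU addresses from it.
 */
static int __maybe_unused amdgpu_device_wb_example(struct amdgpu_device *adev)
{
        u32 wb;
        int r;

        r = amdgpu_device_wb_get(adev, &wb);
        if (r)
                return r;

        /* CPU view of the slot; the GPU address is adev->wb.gpu_addr + wb * 4 */
        adev->wb.wb[wb] = 0;

        amdgpu_device_wb_free(adev, wb);
        return 0;
}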
1109
1110 /**
1111  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1112  *
1113  * @adev: amdgpu_device pointer
1114  *
1115  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1116  * to fail, but if any of the BARs is not accessible after the resize we abort
1117  * driver loading by returning -ENODEV.
1118  */
1119 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1120 {
1121         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1122         struct pci_bus *root;
1123         struct resource *res;
1124         unsigned i;
1125         u16 cmd;
1126         int r;
1127
1128         /* Bypass for VF */
1129         if (amdgpu_sriov_vf(adev))
1130                 return 0;
1131
1132         /* skip if the bios has already enabled large BAR */
1133         if (adev->gmc.real_vram_size &&
1134             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1135                 return 0;
1136
1137         /* Check if the root BUS has 64bit memory resources */
1138         root = adev->pdev->bus;
1139         while (root->parent)
1140                 root = root->parent;
1141
1142         pci_bus_for_each_resource(root, res, i) {
1143                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1144                     res->start > 0x100000000ull)
1145                         break;
1146         }
1147
1148         /* Trying to resize is pointless without a root hub window above 4GB */
1149         if (!res)
1150                 return 0;
1151
1152         /* Limit the BAR size to what is available */
1153         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1154                         rbar_size);
1155
1156         /* Disable memory decoding while we change the BAR addresses and size */
1157         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1158         pci_write_config_word(adev->pdev, PCI_COMMAND,
1159                               cmd & ~PCI_COMMAND_MEMORY);
1160
1161         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1162         amdgpu_device_doorbell_fini(adev);
1163         if (adev->asic_type >= CHIP_BONAIRE)
1164                 pci_release_resource(adev->pdev, 2);
1165
1166         pci_release_resource(adev->pdev, 0);
1167
1168         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1169         if (r == -ENOSPC)
1170                 DRM_INFO("Not enough PCI address space for a large BAR.");
1171         else if (r && r != -ENOTSUPP)
1172                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1173
1174         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1175
1176         /* When the doorbell or fb BAR isn't available we have no chance of
1177          * using the device.
1178          */
1179         r = amdgpu_device_doorbell_init(adev);
1180         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1181                 return -ENODEV;
1182
1183         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1184
1185         return 0;
1186 }
1187
1188 /*
1189  * GPU helpers function.
1190  */
1191 /**
1192  * amdgpu_device_need_post - check if the hw need post or not
1193  *
1194  * @adev: amdgpu_device pointer
1195  *
1196  * Check if the asic has been initialized (all asics) at driver startup,
1197  * or if post is needed because a hw reset was performed.
1198  * Returns true if post is needed, false if not.
1199  */
1200 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1201 {
1202         uint32_t reg;
1203
1204         if (amdgpu_sriov_vf(adev))
1205                 return false;
1206
1207         if (amdgpu_passthrough(adev)) {
1208                 /* for FIJI: in the whole-GPU pass-through virtualization case, after a VM reboot
1209                  * some old SMC firmware still needs the driver to do a vPost, otherwise the GPU
1210                  * hangs. SMC firmware versions above 22.15 don't have this flaw, so we force
1211                  * vPost to be executed for SMC versions below 22.15.
1212                  */
1213                 if (adev->asic_type == CHIP_FIJI) {
1214                         int err;
1215                         uint32_t fw_ver;
1216                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1217                         /* force vPost if an error occurred */
1218                         if (err)
1219                                 return true;
1220
1221                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1222                         if (fw_ver < 0x00160e00)
1223                                 return true;
1224                 }
1225         }
1226
1227         /* Don't post if we need to reset whole hive on init */
1228         if (adev->gmc.xgmi.pending_reset)
1229                 return false;
1230
1231         if (adev->has_hw_reset) {
1232                 adev->has_hw_reset = false;
1233                 return true;
1234         }
1235
1236         /* bios scratch used on CIK+ */
1237         if (adev->asic_type >= CHIP_BONAIRE)
1238                 return amdgpu_atombios_scratch_need_asic_init(adev);
1239
1240         /* check MEM_SIZE for older asics */
1241         reg = amdgpu_asic_get_config_memsize(adev);
1242
1243         if ((reg != 0) && (reg != 0xffffffff))
1244                 return false;
1245
1246         return true;
1247 }
1248
1249 /* if we get transitioned to only one device, take VGA back */
1250 /**
1251  * amdgpu_device_vga_set_decode - enable/disable vga decode
1252  *
1253  * @cookie: amdgpu_device pointer
1254  * @state: enable/disable vga decode
1255  *
1256  * Enable/disable vga decode (all asics).
1257  * Returns VGA resource flags.
1258  */
1259 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1260 {
1261         struct amdgpu_device *adev = cookie;
1262         amdgpu_asic_set_vga_state(adev, state);
1263         if (state)
1264                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1265                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1266         else
1267                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1268 }
1269
1270 /**
1271  * amdgpu_device_check_block_size - validate the vm block size
1272  *
1273  * @adev: amdgpu_device pointer
1274  *
1275  * Validates the vm block size specified via module parameter.
1276  * The vm block size defines number of bits in page table versus page directory,
1277  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1278  * page table and the remaining bits are in the page directory.
1279  */
1280 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1281 {
1282         /* defines number of bits in page table versus page directory,
1283          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1284          * page table and the remaining bits are in the page directory */
1285         if (amdgpu_vm_block_size == -1)
1286                 return;
1287
1288         if (amdgpu_vm_block_size < 9) {
1289                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1290                          amdgpu_vm_block_size);
1291                 amdgpu_vm_block_size = -1;
1292         }
1293 }
1294
1295 /**
1296  * amdgpu_device_check_vm_size - validate the vm size
1297  *
1298  * @adev: amdgpu_device pointer
1299  *
1300  * Validates the vm size in GB specified via module parameter.
1301  * The VM size is the size of the GPU virtual memory space in GB.
1302  */
1303 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1304 {
1305         /* no need to check the default value */
1306         if (amdgpu_vm_size == -1)
1307                 return;
1308
1309         if (amdgpu_vm_size < 1) {
1310                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1311                          amdgpu_vm_size);
1312                 amdgpu_vm_size = -1;
1313         }
1314 }
1315
1316 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1317 {
1318         struct sysinfo si;
1319         bool is_os_64 = (sizeof(void *) == 8);
1320         uint64_t total_memory;
1321         uint64_t dram_size_seven_GB = 0x1B8000000;
1322         uint64_t dram_size_three_GB = 0xB8000000;
1323
1324         if (amdgpu_smu_memory_pool_size == 0)
1325                 return;
1326
1327         if (!is_os_64) {
1328                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1329                 goto def_value;
1330         }
1331         si_meminfo(&si);
1332         total_memory = (uint64_t)si.totalram * si.mem_unit;
1333
1334         if ((amdgpu_smu_memory_pool_size == 1) ||
1335                 (amdgpu_smu_memory_pool_size == 2)) {
1336                 if (total_memory < dram_size_three_GB)
1337                         goto def_value1;
1338         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1339                 (amdgpu_smu_memory_pool_size == 8)) {
1340                 if (total_memory < dram_size_seven_GB)
1341                         goto def_value1;
1342         } else {
1343                 DRM_WARN("Smu memory pool size not supported\n");
1344                 goto def_value;
1345         }
1346         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1347
1348         return;
1349
1350 def_value1:
1351         DRM_WARN("Not enough system memory\n");
1352 def_value:
1353         adev->pm.smu_prv_buffer_size = 0;
1354 }
1355
1356 /**
1357  * amdgpu_device_check_arguments - validate module params
1358  *
1359  * @adev: amdgpu_device pointer
1360  *
1361  * Validates certain module parameters and updates
1362  * the associated values used by the driver (all asics).
1363  */
1364 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1365 {
1366         if (amdgpu_sched_jobs < 4) {
1367                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1368                          amdgpu_sched_jobs);
1369                 amdgpu_sched_jobs = 4;
1370         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1371                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1372                          amdgpu_sched_jobs);
1373                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1374         }
1375
1376         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1377                 /* gart size must be greater or equal to 32M */
1378                 dev_warn(adev->dev, "gart size (%d) too small\n",
1379                          amdgpu_gart_size);
1380                 amdgpu_gart_size = -1;
1381         }
1382
1383         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1384                 /* gtt size must be greater or equal to 32M */
1385                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1386                                  amdgpu_gtt_size);
1387                 amdgpu_gtt_size = -1;
1388         }
1389
1390         /* valid range is between 4 and 9 inclusive */
1391         if (amdgpu_vm_fragment_size != -1 &&
1392             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1393                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1394                 amdgpu_vm_fragment_size = -1;
1395         }
1396
1397         if (amdgpu_sched_hw_submission < 2) {
1398                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1399                          amdgpu_sched_hw_submission);
1400                 amdgpu_sched_hw_submission = 2;
1401         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1402                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1403                          amdgpu_sched_hw_submission);
1404                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1405         }
1406
1407         amdgpu_device_check_smu_prv_buffer_size(adev);
1408
1409         amdgpu_device_check_vm_size(adev);
1410
1411         amdgpu_device_check_block_size(adev);
1412
1413         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1414
1415         amdgpu_gmc_tmz_set(adev);
1416
1417         amdgpu_gmc_noretry_set(adev);
1418
1419         return 0;
1420 }
1421
1422 /**
1423  * amdgpu_switcheroo_set_state - set switcheroo state
1424  *
1425  * @pdev: pci dev pointer
1426  * @state: vga_switcheroo state
1427  *
1428  * Callback for the switcheroo driver.  Suspends or resumes
1429  * the asic before or after it is powered up using ACPI methods.
1430  */
1431 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1432                                         enum vga_switcheroo_state state)
1433 {
1434         struct drm_device *dev = pci_get_drvdata(pdev);
1435         int r;
1436
1437         if (amdgpu_device_supports_atpx(dev) && state == VGA_SWITCHEROO_OFF)
1438                 return;
1439
1440         if (state == VGA_SWITCHEROO_ON) {
1441                 pr_info("switched on\n");
1442                 /* don't suspend or resume card normally */
1443                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1444
1445                 pci_set_power_state(pdev, PCI_D0);
1446                 amdgpu_device_load_pci_state(pdev);
1447                 r = pci_enable_device(pdev);
1448                 if (r)
1449                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1450                 amdgpu_device_resume(dev, true);
1451
1452                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1453         } else {
1454                 pr_info("switched off\n");
1455                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1456                 amdgpu_device_suspend(dev, true);
1457                 amdgpu_device_cache_pci_state(pdev);
1458                 /* Shut down the device */
1459                 pci_disable_device(pdev);
1460                 pci_set_power_state(pdev, PCI_D3cold);
1461                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1462         }
1463 }
1464
1465 /**
1466  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1467  *
1468  * @pdev: pci dev pointer
1469  *
1470  * Callback for the switcheroo driver.  Check if the switcheroo
1471  * state can be changed.
1472  * Returns true if the state can be changed, false if not.
1473  */
1474 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1475 {
1476         struct drm_device *dev = pci_get_drvdata(pdev);
1477
1478         /*
1479         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1480         * locking inversion with the driver load path. And the access here is
1481         * completely racy anyway. So don't bother with locking for now.
1482         */
1483         return atomic_read(&dev->open_count) == 0;
1484 }
1485
1486 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1487         .set_gpu_state = amdgpu_switcheroo_set_state,
1488         .reprobe = NULL,
1489         .can_switch = amdgpu_switcheroo_can_switch,
1490 };
1491
1492 /**
1493  * amdgpu_device_ip_set_clockgating_state - set the CG state
1494  *
1495  * @dev: amdgpu_device pointer
1496  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1497  * @state: clockgating state (gate or ungate)
1498  *
1499  * Sets the requested clockgating state for all instances of
1500  * the hardware IP specified.
1501  * Returns the error code from the last instance.
1502  */
1503 int amdgpu_device_ip_set_clockgating_state(void *dev,
1504                                            enum amd_ip_block_type block_type,
1505                                            enum amd_clockgating_state state)
1506 {
1507         struct amdgpu_device *adev = dev;
1508         int i, r = 0;
1509
1510         for (i = 0; i < adev->num_ip_blocks; i++) {
1511                 if (!adev->ip_blocks[i].status.valid)
1512                         continue;
1513                 if (adev->ip_blocks[i].version->type != block_type)
1514                         continue;
1515                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1516                         continue;
1517                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1518                         (void *)adev, state);
1519                 if (r)
1520                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1521                                   adev->ip_blocks[i].version->funcs->name, r);
1522         }
1523         return r;
1524 }
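
/*
 * Illustrative sketch, not part of the original file: gating and ungating the
 * clocks of one IP type around an operation that needs the block fully
 * clocked.  GFX is used only as an example block type.
 */
static void __maybe_unused amdgpu_device_cg_example(struct amdgpu_device *adev)
{
        amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
                                               AMD_CG_STATE_UNGATE);

        /* ... work that requires the clocks to stay on ... */

        amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
                                               AMD_CG_STATE_GATE);
}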
1525
1526 /**
1527  * amdgpu_device_ip_set_powergating_state - set the PG state
1528  *
1529  * @dev: amdgpu_device pointer
1530  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1531  * @state: powergating state (gate or ungate)
1532  *
1533  * Sets the requested powergating state for all instances of
1534  * the hardware IP specified.
1535  * Returns the error code from the last instance.
1536  */
1537 int amdgpu_device_ip_set_powergating_state(void *dev,
1538                                            enum amd_ip_block_type block_type,
1539                                            enum amd_powergating_state state)
1540 {
1541         struct amdgpu_device *adev = dev;
1542         int i, r = 0;
1543
1544         for (i = 0; i < adev->num_ip_blocks; i++) {
1545                 if (!adev->ip_blocks[i].status.valid)
1546                         continue;
1547                 if (adev->ip_blocks[i].version->type != block_type)
1548                         continue;
1549                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1550                         continue;
1551                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1552                         (void *)adev, state);
1553                 if (r)
1554                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1555                                   adev->ip_blocks[i].version->funcs->name, r);
1556         }
1557         return r;
1558 }
1559
1560 /**
1561  * amdgpu_device_ip_get_clockgating_state - get the CG state
1562  *
1563  * @adev: amdgpu_device pointer
1564  * @flags: clockgating feature flags
1565  *
1566  * Walks the list of IPs on the device and updates the clockgating
1567  * flags for each IP.
1568  * Updates @flags with the feature flags for each hardware IP where
1569  * clockgating is enabled.
1570  */
1571 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1572                                             u32 *flags)
1573 {
1574         int i;
1575
1576         for (i = 0; i < adev->num_ip_blocks; i++) {
1577                 if (!adev->ip_blocks[i].status.valid)
1578                         continue;
1579                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1580                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1581         }
1582 }
1583
1584 /**
1585  * amdgpu_device_ip_wait_for_idle - wait for idle
1586  *
1587  * @adev: amdgpu_device pointer
1588  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1589  *
1590  * Waits for the requested hardware IP to be idle.
1591  * Returns 0 for success or a negative error code on failure.
1592  */
1593 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1594                                    enum amd_ip_block_type block_type)
1595 {
1596         int i, r;
1597
1598         for (i = 0; i < adev->num_ip_blocks; i++) {
1599                 if (!adev->ip_blocks[i].status.valid)
1600                         continue;
1601                 if (adev->ip_blocks[i].version->type == block_type) {
1602                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1603                         if (r)
1604                                 return r;
1605                         break;
1606                 }
1607         }
1608         return 0;
1609
1610 }
1611
1612 /**
1613  * amdgpu_device_ip_is_idle - is the hardware IP idle
1614  *
1615  * @adev: amdgpu_device pointer
1616  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1617  *
1618  * Check if the hardware IP is idle or not.
1619  * Returns true if the IP is idle, false if not.
1620  */
1621 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1622                               enum amd_ip_block_type block_type)
1623 {
1624         int i;
1625
1626         for (i = 0; i < adev->num_ip_blocks; i++) {
1627                 if (!adev->ip_blocks[i].status.valid)
1628                         continue;
1629                 if (adev->ip_blocks[i].version->type == block_type)
1630                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1631         }
1632         return true;
1633
1634 }
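
/*
 * Illustrative sketch (hypothetical flow, not taken from this file): a path
 * that needs the GFX block quiesced could combine the two helpers above,
 * first checking and then blocking until idle:
 *
 *	if (!amdgpu_device_ip_is_idle(adev, AMD_IP_BLOCK_TYPE_GFX)) {
 *		r = amdgpu_device_ip_wait_for_idle(adev, AMD_IP_BLOCK_TYPE_GFX);
 *		if (r)
 *			dev_err(adev->dev, "GFX block did not become idle (%d)\n", r);
 *	}
 */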
1635
1636 /**
1637  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1638  *
1639  * @adev: amdgpu_device pointer
1640  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1641  *
1642  * Returns a pointer to the hardware IP block structure
1643  * if it exists for the asic, otherwise NULL.
1644  */
1645 struct amdgpu_ip_block *
1646 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1647                               enum amd_ip_block_type type)
1648 {
1649         int i;
1650
1651         for (i = 0; i < adev->num_ip_blocks; i++)
1652                 if (adev->ip_blocks[i].version->type == type)
1653                         return &adev->ip_blocks[i];
1654
1655         return NULL;
1656 }
1657
1658 /**
1659  * amdgpu_device_ip_block_version_cmp
1660  *
1661  * @adev: amdgpu_device pointer
1662  * @type: enum amd_ip_block_type
1663  * @major: major version
1664  * @minor: minor version
1665  *
1666  * Returns 0 if the installed IP block version is equal to or greater than
1667  * the requested version, 1 if it is smaller or the ip_block doesn't exist.
1668  */
1669 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1670                                        enum amd_ip_block_type type,
1671                                        u32 major, u32 minor)
1672 {
1673         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1674
1675         if (ip_block && ((ip_block->version->major > major) ||
1676                         ((ip_block->version->major == major) &&
1677                         (ip_block->version->minor >= minor))))
1678                 return 0;
1679
1680         return 1;
1681 }
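
/*
 * Illustrative sketch: these two helpers let callers branch on which IP
 * version is present on the asic, e.g. (hypothetical check):
 *
 *	if (amdgpu_device_ip_get_ip_block(adev, AMD_IP_BLOCK_TYPE_GMC) &&
 *	    !amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GMC, 9, 0))
 *		dev_info(adev->dev, "GMC 9.0 or newer is present\n");
 */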
1682
1683 /**
1684  * amdgpu_device_ip_block_add
1685  *
1686  * @adev: amdgpu_device pointer
1687  * @ip_block_version: pointer to the IP to add
1688  *
1689  * Adds the IP block driver information to the collection of IPs
1690  * on the asic.
1691  */
1692 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1693                                const struct amdgpu_ip_block_version *ip_block_version)
1694 {
1695         if (!ip_block_version)
1696                 return -EINVAL;
1697
1698         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1699                   ip_block_version->funcs->name);
1700
1701         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1702
1703         return 0;
1704 }
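
/*
 * Illustrative sketch: SoC setup code (the *_set_ip_blocks() functions called
 * from amdgpu_device_ip_early_init() below) registers its IP blocks in
 * hardware init order.  With a hypothetical amdgpu_ip_block_version named
 * example_common_ip_block the pattern looks like:
 *
 *	r = amdgpu_device_ip_block_add(adev, &example_common_ip_block);
 *	if (r)
 *		return r;
 */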
1705
1706 /**
1707  * amdgpu_device_enable_virtual_display - enable virtual display feature
1708  *
1709  * @adev: amdgpu_device pointer
1710  *
1711  * Enables the virtual display feature if the user has enabled it via
1712  * the module parameter virtual_display.  This feature provides a virtual
1713  * display hardware on headless boards or in virtualized environments.
1714  * This function parses and validates the configuration string specified by
1715  * the user and configures the virtual display configuration (number of
1716  * virtual connectors, crtcs, etc.) specified.
1717  */
1718 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1719 {
1720         adev->enable_virtual_display = false;
1721
1722         if (amdgpu_virtual_display) {
1723                 const char *pci_address_name = pci_name(adev->pdev);
1724                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1725
1726                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1727                 pciaddstr_tmp = pciaddstr;
1728                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1729                         pciaddname = strsep(&pciaddname_tmp, ",");
1730                         if (!strcmp("all", pciaddname)
1731                             || !strcmp(pci_address_name, pciaddname)) {
1732                                 long num_crtc;
1733                                 int res = -1;
1734
1735                                 adev->enable_virtual_display = true;
1736
1737                                 if (pciaddname_tmp)
1738                                         res = kstrtol(pciaddname_tmp, 10,
1739                                                       &num_crtc);
1740
1741                                 if (!res) {
1742                                         if (num_crtc < 1)
1743                                                 num_crtc = 1;
1744                                         if (num_crtc > 6)
1745                                                 num_crtc = 6;
1746                                         adev->mode_info.num_crtc = num_crtc;
1747                                 } else {
1748                                         adev->mode_info.num_crtc = 1;
1749                                 }
1750                                 break;
1751                         }
1752                 }
1753
1754                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1755                          amdgpu_virtual_display, pci_address_name,
1756                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1757
1758                 kfree(pciaddstr);
1759         }
1760 }
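
/*
 * Illustrative examples of the virtual_display string parsed above (entries
 * are separated by ';', with an optional crtc count after ','); the PCI
 * address below is a placeholder:
 *
 *	amdgpu.virtual_display=all                enable on every device, 1 crtc
 *	amdgpu.virtual_display=0000:01:00.0,4     enable on that device, 4 crtcs
 *
 * The crtc count is clamped to the 1-6 range and defaults to 1 when omitted.
 */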
1761
1762 /**
1763  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1764  *
1765  * @adev: amdgpu_device pointer
1766  *
1767  * Parses the asic configuration parameters specified in the gpu info
1768  * firmware and makes them available to the driver for use in configuring
1769  * the asic.
1770  * Returns 0 on success, -EINVAL on failure.
1771  */
1772 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1773 {
1774         const char *chip_name;
1775         char fw_name[40];
1776         int err;
1777         const struct gpu_info_firmware_header_v1_0 *hdr;
1778
1779         adev->firmware.gpu_info_fw = NULL;
1780
1781         if (adev->mman.discovery_bin) {
1782                 amdgpu_discovery_get_gfx_info(adev);
1783
1784                 /*
1785                  * FIXME: The bounding box is still needed by Navi12, so
1786                  * temporarily read it from gpu_info firmware. Should be dropped
1787                  * when DAL no longer needs it.
1788                  */
1789                 if (adev->asic_type != CHIP_NAVI12)
1790                         return 0;
1791         }
1792
1793         switch (adev->asic_type) {
1794 #ifdef CONFIG_DRM_AMDGPU_SI
1795         case CHIP_VERDE:
1796         case CHIP_TAHITI:
1797         case CHIP_PITCAIRN:
1798         case CHIP_OLAND:
1799         case CHIP_HAINAN:
1800 #endif
1801 #ifdef CONFIG_DRM_AMDGPU_CIK
1802         case CHIP_BONAIRE:
1803         case CHIP_HAWAII:
1804         case CHIP_KAVERI:
1805         case CHIP_KABINI:
1806         case CHIP_MULLINS:
1807 #endif
1808         case CHIP_TOPAZ:
1809         case CHIP_TONGA:
1810         case CHIP_FIJI:
1811         case CHIP_POLARIS10:
1812         case CHIP_POLARIS11:
1813         case CHIP_POLARIS12:
1814         case CHIP_VEGAM:
1815         case CHIP_CARRIZO:
1816         case CHIP_STONEY:
1817         case CHIP_VEGA20:
1818         case CHIP_ALDEBARAN:
1819         case CHIP_SIENNA_CICHLID:
1820         case CHIP_NAVY_FLOUNDER:
1821         case CHIP_DIMGREY_CAVEFISH:
1822         default:
1823                 return 0;
1824         case CHIP_VEGA10:
1825                 chip_name = "vega10";
1826                 break;
1827         case CHIP_VEGA12:
1828                 chip_name = "vega12";
1829                 break;
1830         case CHIP_RAVEN:
1831                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1832                         chip_name = "raven2";
1833                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1834                         chip_name = "picasso";
1835                 else
1836                         chip_name = "raven";
1837                 break;
1838         case CHIP_ARCTURUS:
1839                 chip_name = "arcturus";
1840                 break;
1841         case CHIP_RENOIR:
1842                 if (adev->apu_flags & AMD_APU_IS_RENOIR)
1843                         chip_name = "renoir";
1844                 else
1845                         chip_name = "green_sardine";
1846                 break;
1847         case CHIP_NAVI10:
1848                 chip_name = "navi10";
1849                 break;
1850         case CHIP_NAVI14:
1851                 chip_name = "navi14";
1852                 break;
1853         case CHIP_NAVI12:
1854                 chip_name = "navi12";
1855                 break;
1856         case CHIP_VANGOGH:
1857                 chip_name = "vangogh";
1858                 break;
1859         }
1860
1861         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1862         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1863         if (err) {
1864                 dev_err(adev->dev,
1865                         "Failed to load gpu_info firmware \"%s\"\n",
1866                         fw_name);
1867                 goto out;
1868         }
1869         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1870         if (err) {
1871                 dev_err(adev->dev,
1872                         "Failed to validate gpu_info firmware \"%s\"\n",
1873                         fw_name);
1874                 goto out;
1875         }
1876
1877         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1878         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1879
1880         switch (hdr->version_major) {
1881         case 1:
1882         {
1883                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1884                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1885                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1886
1887                 /*
1888                  * Should be dropped when DAL no longer needs it.
1889                  */
1890                 if (adev->asic_type == CHIP_NAVI12)
1891                         goto parse_soc_bounding_box;
1892
1893                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1894                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1895                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1896                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1897                 adev->gfx.config.max_texture_channel_caches =
1898                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1899                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1900                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1901                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1902                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1903                 adev->gfx.config.double_offchip_lds_buf =
1904                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1905                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1906                 adev->gfx.cu_info.max_waves_per_simd =
1907                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1908                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1909                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1910                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1911                 if (hdr->version_minor >= 1) {
1912                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1913                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1914                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1915                         adev->gfx.config.num_sc_per_sh =
1916                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1917                         adev->gfx.config.num_packer_per_sc =
1918                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1919                 }
1920
1921 parse_soc_bounding_box:
1922                 /*
1923                  * soc bounding box info is not integrated in the discovery table,
1924                  * so we always need to parse it from the gpu_info firmware when needed.
1925                  */
1926                 if (hdr->version_minor == 2) {
1927                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1928                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1929                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1930                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1931                 }
1932                 break;
1933         }
1934         default:
1935                 dev_err(adev->dev,
1936                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1937                 err = -EINVAL;
1938                 goto out;
1939         }
1940 out:
1941         return err;
1942 }
1943
1944 /**
1945  * amdgpu_device_ip_early_init - run early init for hardware IPs
1946  *
1947  * @adev: amdgpu_device pointer
1948  *
1949  * Early initialization pass for hardware IPs.  The hardware IPs that make
1950  * up each asic are discovered and each IP's early_init callback is run.  This
1951  * is the first stage in initializing the asic.
1952  * Returns 0 on success, negative error code on failure.
1953  */
1954 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1955 {
1956         int i, r;
1957
1958         amdgpu_device_enable_virtual_display(adev);
1959
1960         if (amdgpu_sriov_vf(adev)) {
1961                 r = amdgpu_virt_request_full_gpu(adev, true);
1962                 if (r)
1963                         return r;
1964         }
1965
1966         switch (adev->asic_type) {
1967 #ifdef CONFIG_DRM_AMDGPU_SI
1968         case CHIP_VERDE:
1969         case CHIP_TAHITI:
1970         case CHIP_PITCAIRN:
1971         case CHIP_OLAND:
1972         case CHIP_HAINAN:
1973                 adev->family = AMDGPU_FAMILY_SI;
1974                 r = si_set_ip_blocks(adev);
1975                 if (r)
1976                         return r;
1977                 break;
1978 #endif
1979 #ifdef CONFIG_DRM_AMDGPU_CIK
1980         case CHIP_BONAIRE:
1981         case CHIP_HAWAII:
1982         case CHIP_KAVERI:
1983         case CHIP_KABINI:
1984         case CHIP_MULLINS:
1985                 if (adev->flags & AMD_IS_APU)
1986                         adev->family = AMDGPU_FAMILY_KV;
1987                 else
1988                         adev->family = AMDGPU_FAMILY_CI;
1989
1990                 r = cik_set_ip_blocks(adev);
1991                 if (r)
1992                         return r;
1993                 break;
1994 #endif
1995         case CHIP_TOPAZ:
1996         case CHIP_TONGA:
1997         case CHIP_FIJI:
1998         case CHIP_POLARIS10:
1999         case CHIP_POLARIS11:
2000         case CHIP_POLARIS12:
2001         case CHIP_VEGAM:
2002         case CHIP_CARRIZO:
2003         case CHIP_STONEY:
2004                 if (adev->flags & AMD_IS_APU)
2005                         adev->family = AMDGPU_FAMILY_CZ;
2006                 else
2007                         adev->family = AMDGPU_FAMILY_VI;
2008
2009                 r = vi_set_ip_blocks(adev);
2010                 if (r)
2011                         return r;
2012                 break;
2013         case CHIP_VEGA10:
2014         case CHIP_VEGA12:
2015         case CHIP_VEGA20:
2016         case CHIP_RAVEN:
2017         case CHIP_ARCTURUS:
2018         case CHIP_RENOIR:
2019         case CHIP_ALDEBARAN:
2020                 if (adev->flags & AMD_IS_APU)
2021                         adev->family = AMDGPU_FAMILY_RV;
2022                 else
2023                         adev->family = AMDGPU_FAMILY_AI;
2024
2025                 r = soc15_set_ip_blocks(adev);
2026                 if (r)
2027                         return r;
2028                 break;
2029         case  CHIP_NAVI10:
2030         case  CHIP_NAVI14:
2031         case  CHIP_NAVI12:
2032         case  CHIP_SIENNA_CICHLID:
2033         case  CHIP_NAVY_FLOUNDER:
2034         case  CHIP_DIMGREY_CAVEFISH:
2035         case CHIP_VANGOGH:
2036                 if (adev->asic_type == CHIP_VANGOGH)
2037                         adev->family = AMDGPU_FAMILY_VGH;
2038                 else
2039                         adev->family = AMDGPU_FAMILY_NV;
2040
2041                 r = nv_set_ip_blocks(adev);
2042                 if (r)
2043                         return r;
2044                 break;
2045         default:
2046                 /* FIXME: not supported yet */
2047                 return -EINVAL;
2048         }
2049
2050         amdgpu_amdkfd_device_probe(adev);
2051
2052         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2053         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2054                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2055         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2056                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2057
2058         for (i = 0; i < adev->num_ip_blocks; i++) {
2059                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2060                         DRM_ERROR("disabled ip block: %d <%s>\n",
2061                                   i, adev->ip_blocks[i].version->funcs->name);
2062                         adev->ip_blocks[i].status.valid = false;
2063                 } else {
2064                         if (adev->ip_blocks[i].version->funcs->early_init) {
2065                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2066                                 if (r == -ENOENT) {
2067                                         adev->ip_blocks[i].status.valid = false;
2068                                 } else if (r) {
2069                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2070                                                   adev->ip_blocks[i].version->funcs->name, r);
2071                                         return r;
2072                                 } else {
2073                                         adev->ip_blocks[i].status.valid = true;
2074                                 }
2075                         } else {
2076                                 adev->ip_blocks[i].status.valid = true;
2077                         }
2078                 }
2079                 /* get the vbios after the asic_funcs are set up */
2080                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2081                         r = amdgpu_device_parse_gpu_info_fw(adev);
2082                         if (r)
2083                                 return r;
2084
2085                         /* Read BIOS */
2086                         if (!amdgpu_get_bios(adev))
2087                                 return -EINVAL;
2088
2089                         r = amdgpu_atombios_init(adev);
2090                         if (r) {
2091                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2092                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2093                                 return r;
2094                         }
2095                 }
2096         }
2097
2098         adev->cg_flags &= amdgpu_cg_mask;
2099         adev->pg_flags &= amdgpu_pg_mask;
2100
2101         return 0;
2102 }
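
/*
 * Illustrative note on the ip_block_mask test above: individual IP blocks can
 * be disabled from the kernel command line.  If, hypothetically, the block to
 * be skipped sat at index 5 of adev->ip_blocks, booting with
 *
 *	amdgpu.ip_block_mask=0xffffffdf
 *
 * would clear bit 5, so (amdgpu_ip_block_mask & (1 << 5)) == 0 and that block
 * is marked invalid by the loop above.
 */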
2103
2104 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2105 {
2106         int i, r;
2107
2108         for (i = 0; i < adev->num_ip_blocks; i++) {
2109                 if (!adev->ip_blocks[i].status.sw)
2110                         continue;
2111                 if (adev->ip_blocks[i].status.hw)
2112                         continue;
2113                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2114                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2115                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2116                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2117                         if (r) {
2118                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2119                                           adev->ip_blocks[i].version->funcs->name, r);
2120                                 return r;
2121                         }
2122                         adev->ip_blocks[i].status.hw = true;
2123                 }
2124         }
2125
2126         return 0;
2127 }
2128
2129 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2130 {
2131         int i, r;
2132
2133         for (i = 0; i < adev->num_ip_blocks; i++) {
2134                 if (!adev->ip_blocks[i].status.sw)
2135                         continue;
2136                 if (adev->ip_blocks[i].status.hw)
2137                         continue;
2138                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2139                 if (r) {
2140                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2141                                   adev->ip_blocks[i].version->funcs->name, r);
2142                         return r;
2143                 }
2144                 adev->ip_blocks[i].status.hw = true;
2145         }
2146
2147         return 0;
2148 }
2149
2150 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2151 {
2152         int r = 0;
2153         int i;
2154         uint32_t smu_version;
2155
2156         if (adev->asic_type >= CHIP_VEGA10) {
2157                 for (i = 0; i < adev->num_ip_blocks; i++) {
2158                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2159                                 continue;
2160
2161                         if (!adev->ip_blocks[i].status.sw)
2162                                 continue;
2163
2164                         /* no need to do the fw loading again if already done */
2165                         if (adev->ip_blocks[i].status.hw == true)
2166                                 break;
2167
2168                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2169                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2170                                 if (r) {
2171                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2172                                                           adev->ip_blocks[i].version->funcs->name, r);
2173                                         return r;
2174                                 }
2175                         } else {
2176                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2177                                 if (r) {
2178                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2179                                                           adev->ip_blocks[i].version->funcs->name, r);
2180                                         return r;
2181                                 }
2182                         }
2183
2184                         adev->ip_blocks[i].status.hw = true;
2185                         break;
2186                 }
2187         }
2188
2189         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2190                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2191
2192         return r;
2193 }
2194
2195 /**
2196  * amdgpu_device_ip_init - run init for hardware IPs
2197  *
2198  * @adev: amdgpu_device pointer
2199  *
2200  * Main initialization pass for hardware IPs.  The list of all the hardware
2201  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2202  * are run.  sw_init initializes the software state associated with each IP
2203  * and hw_init initializes the hardware associated with each IP.
2204  * Returns 0 on success, negative error code on failure.
2205  */
2206 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2207 {
2208         int i, r;
2209
2210         r = amdgpu_ras_init(adev);
2211         if (r)
2212                 return r;
2213
2214         for (i = 0; i < adev->num_ip_blocks; i++) {
2215                 if (!adev->ip_blocks[i].status.valid)
2216                         continue;
2217                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2218                 if (r) {
2219                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2220                                   adev->ip_blocks[i].version->funcs->name, r);
2221                         goto init_failed;
2222                 }
2223                 adev->ip_blocks[i].status.sw = true;
2224
2225                 /* need to do gmc hw init early so we can allocate gpu mem */
2226                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2227                         r = amdgpu_device_vram_scratch_init(adev);
2228                         if (r) {
2229                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2230                                 goto init_failed;
2231                         }
2232                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2233                         if (r) {
2234                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2235                                 goto init_failed;
2236                         }
2237                         r = amdgpu_device_wb_init(adev);
2238                         if (r) {
2239                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2240                                 goto init_failed;
2241                         }
2242                         adev->ip_blocks[i].status.hw = true;
2243
2244                         /* right after GMC hw init, we create CSA */
2245                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2246                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2247                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2248                                                                 AMDGPU_CSA_SIZE);
2249                                 if (r) {
2250                                         DRM_ERROR("allocate CSA failed %d\n", r);
2251                                         goto init_failed;
2252                                 }
2253                         }
2254                 }
2255         }
2256
2257         if (amdgpu_sriov_vf(adev))
2258                 amdgpu_virt_init_data_exchange(adev);
2259
2260         r = amdgpu_ib_pool_init(adev);
2261         if (r) {
2262                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2263                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2264                 goto init_failed;
2265         }
2266
2267         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2268         if (r)
2269                 goto init_failed;
2270
2271         r = amdgpu_device_ip_hw_init_phase1(adev);
2272         if (r)
2273                 goto init_failed;
2274
2275         r = amdgpu_device_fw_loading(adev);
2276         if (r)
2277                 goto init_failed;
2278
2279         r = amdgpu_device_ip_hw_init_phase2(adev);
2280         if (r)
2281                 goto init_failed;
2282
2283         /*
2284          * Retired pages will be loaded from eeprom and reserved here.
2285          * This should be called after amdgpu_device_ip_hw_init_phase2 since
2286          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2287          * functional for I2C communication, which is only true at this point.
2288          *
2289          * amdgpu_ras_recovery_init may fail, but the upper layer only cares
2290          * about failures caused by a bad gpu situation and stops the amdgpu
2291          * init process accordingly. For other failures it still releases all
2292          * the resources and prints an error message, rather than returning a
2293          * negative value to the upper level.
2294          *
2295          * Note: theoretically, this should be called before all vram
2296          * allocations to protect retired pages from being abused.
2297          */
2298         r = amdgpu_ras_recovery_init(adev);
2299         if (r)
2300                 goto init_failed;
2301
2302         if (adev->gmc.xgmi.num_physical_nodes > 1)
2303                 amdgpu_xgmi_add_device(adev);
2304
2305         /* Don't init kfd if the whole hive needs to be reset during init */
2306         if (!adev->gmc.xgmi.pending_reset)
2307                 amdgpu_amdkfd_device_init(adev);
2308
2309         amdgpu_fru_get_product_info(adev);
2310
2311 init_failed:
2312         if (amdgpu_sriov_vf(adev))
2313                 amdgpu_virt_release_full_gpu(adev, true);
2314
2315         return r;
2316 }
2317
2318 /**
2319  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2320  *
2321  * @adev: amdgpu_device pointer
2322  *
2323  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2324  * this function before a GPU reset.  If the value is retained after a
2325  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2326  */
2327 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2328 {
2329         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2330 }
2331
2332 /**
2333  * amdgpu_device_check_vram_lost - check if vram is valid
2334  *
2335  * @adev: amdgpu_device pointer
2336  *
2337  * Checks the reset magic value written to the gart pointer in VRAM.
2338  * The driver calls this after a GPU reset to see if the contents of
2339  * VRAM have been lost or not.
2340  * Returns true if vram is lost, false if not.
2341  */
2342 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2343 {
2344         if (memcmp(adev->gart.ptr, adev->reset_magic,
2345                         AMDGPU_RESET_MAGIC_NUM))
2346                 return true;
2347
2348         if (!amdgpu_in_reset(adev))
2349                 return false;
2350
2351         /*
2352          * For all ASICs with baco/mode1 reset, the VRAM is
2353          * always assumed to be lost.
2354          */
2355         switch (amdgpu_asic_reset_method(adev)) {
2356         case AMD_RESET_METHOD_BACO:
2357         case AMD_RESET_METHOD_MODE1:
2358                 return true;
2359         default:
2360                 return false;
2361         }
2362 }
2363
2364 /**
2365  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2366  *
2367  * @adev: amdgpu_device pointer
2368  * @state: clockgating state (gate or ungate)
2369  *
2370  * The list of all the hardware IPs that make up the asic is walked and the
2371  * set_clockgating_state callbacks are run.
2372  * During late init this pass enables clockgating for the hardware IPs;
2373  * during fini or suspend it disables clockgating for them.
2374  * Returns 0 on success, negative error code on failure.
2375  */
2376
2377 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2378                                                 enum amd_clockgating_state state)
2379 {
2380         int i, j, r;
2381
2382         if (amdgpu_emu_mode == 1)
2383                 return 0;
2384
2385         for (j = 0; j < adev->num_ip_blocks; j++) {
2386                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2387                 if (!adev->ip_blocks[i].status.late_initialized)
2388                         continue;
2389                 /* skip CG for VCE/UVD, it's handled specially */
2390                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2391                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2392                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2393                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2394                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2395                         /* enable clockgating to save power */
2396                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2397                                                                                      state);
2398                         if (r) {
2399                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2400                                           adev->ip_blocks[i].version->funcs->name, r);
2401                                 return r;
2402                         }
2403                 }
2404         }
2405
2406         return 0;
2407 }
2408
2409 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2410 {
2411         int i, j, r;
2412
2413         if (amdgpu_emu_mode == 1)
2414                 return 0;
2415
2416         for (j = 0; j < adev->num_ip_blocks; j++) {
2417                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2418                 if (!adev->ip_blocks[i].status.late_initialized)
2419                         continue;
2420                 /* skip PG for VCE/UVD, it's handled specially */
2421                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2422                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2423                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2424                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2425                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2426                         /* enable powergating to save power */
2427                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2428                                                                                         state);
2429                         if (r) {
2430                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2431                                           adev->ip_blocks[i].version->funcs->name, r);
2432                                 return r;
2433                         }
2434                 }
2435         }
2436         return 0;
2437 }
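
/*
 * Worked example of the ordering used by the two helpers above, assuming a
 * hypothetical device with four IP blocks (indices 0-3): when gating
 * (AMD_CG_STATE_GATE / AMD_PG_STATE_GATE) the blocks are visited 0, 1, 2, 3;
 * when ungating, i = num_ip_blocks - j - 1 walks them in reverse, 3, 2, 1, 0,
 * so ungating mirrors the gating order.
 */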
2438
2439 static int amdgpu_device_enable_mgpu_fan_boost(void)
2440 {
2441         struct amdgpu_gpu_instance *gpu_ins;
2442         struct amdgpu_device *adev;
2443         int i, ret = 0;
2444
2445         mutex_lock(&mgpu_info.mutex);
2446
2447         /*
2448          * MGPU fan boost feature should be enabled
2449          * only when there are two or more dGPUs in
2450          * the system
2451          */
2452         if (mgpu_info.num_dgpu < 2)
2453                 goto out;
2454
2455         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2456                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2457                 adev = gpu_ins->adev;
2458                 if (!(adev->flags & AMD_IS_APU) &&
2459                     !gpu_ins->mgpu_fan_enabled) {
2460                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2461                         if (ret)
2462                                 break;
2463
2464                         gpu_ins->mgpu_fan_enabled = 1;
2465                 }
2466         }
2467
2468 out:
2469         mutex_unlock(&mgpu_info.mutex);
2470
2471         return ret;
2472 }
2473
2474 /**
2475  * amdgpu_device_ip_late_init - run late init for hardware IPs
2476  *
2477  * @adev: amdgpu_device pointer
2478  *
2479  * Late initialization pass for hardware IPs.  The list of all the hardware
2480  * IPs that make up the asic is walked and the late_init callbacks are run.
2481  * late_init covers any special initialization that an IP requires
2482  * after all of the IPs have been initialized or something that needs to happen
2483  * late in the init process.
2484  * Returns 0 on success, negative error code on failure.
2485  */
2486 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2487 {
2488         struct amdgpu_gpu_instance *gpu_instance;
2489         int i = 0, r;
2490
2491         for (i = 0; i < adev->num_ip_blocks; i++) {
2492                 if (!adev->ip_blocks[i].status.hw)
2493                         continue;
2494                 if (adev->ip_blocks[i].version->funcs->late_init) {
2495                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2496                         if (r) {
2497                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2498                                           adev->ip_blocks[i].version->funcs->name, r);
2499                                 return r;
2500                         }
2501                 }
2502                 adev->ip_blocks[i].status.late_initialized = true;
2503         }
2504
2505         amdgpu_ras_set_error_query_ready(adev, true);
2506
2507         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2508         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2509
2510         amdgpu_device_fill_reset_magic(adev);
2511
2512         r = amdgpu_device_enable_mgpu_fan_boost();
2513         if (r)
2514                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2515
2516
2517         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2518                 mutex_lock(&mgpu_info.mutex);
2519
2520                 /*
2521                  * Reset the device p-state to low, as it was booted at high.
2522                  *
2523                  * This should be performed only after all devices from the same
2524                  * hive have been initialized.
2525                  *
2526                  * However, the number of devices in a hive is not known in
2527                  * advance; it is counted one by one as each device initializes.
2528                  *
2529                  * So we wait until all XGMI-interlinked devices are initialized.
2530                  * This may add some delay, as those devices may come from
2531                  * different hives, but that should be OK.
2532                  */
2533                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2534                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2535                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2536                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2537                                         continue;
2538
2539                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2540                                                 AMDGPU_XGMI_PSTATE_MIN);
2541                                 if (r) {
2542                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2543                                         break;
2544                                 }
2545                         }
2546                 }
2547
2548                 mutex_unlock(&mgpu_info.mutex);
2549         }
2550
2551         return 0;
2552 }
2553
2554 /**
2555  * amdgpu_device_ip_fini - run fini for hardware IPs
2556  *
2557  * @adev: amdgpu_device pointer
2558  *
2559  * Main teardown pass for hardware IPs.  The list of all the hardware
2560  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2561  * are run.  hw_fini tears down the hardware associated with each IP
2562  * and sw_fini tears down any software state associated with each IP.
2563  * Returns 0 on success, negative error code on failure.
2564  */
2565 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2566 {
2567         int i, r;
2568
2569         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2570                 amdgpu_virt_release_ras_err_handler_data(adev);
2571
2572         amdgpu_ras_pre_fini(adev);
2573
2574         if (adev->gmc.xgmi.num_physical_nodes > 1)
2575                 amdgpu_xgmi_remove_device(adev);
2576
2577         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2578         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2579
2580         amdgpu_amdkfd_device_fini(adev);
2581
2582         /* need to disable SMC first */
2583         for (i = 0; i < adev->num_ip_blocks; i++) {
2584                 if (!adev->ip_blocks[i].status.hw)
2585                         continue;
2586                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2587                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2588                         /* XXX handle errors */
2589                         if (r) {
2590                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2591                                           adev->ip_blocks[i].version->funcs->name, r);
2592                         }
2593                         adev->ip_blocks[i].status.hw = false;
2594                         break;
2595                 }
2596         }
2597
2598         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2599                 if (!adev->ip_blocks[i].status.hw)
2600                         continue;
2601
2602                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2603                 /* XXX handle errors */
2604                 if (r) {
2605                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2606                                   adev->ip_blocks[i].version->funcs->name, r);
2607                 }
2608
2609                 adev->ip_blocks[i].status.hw = false;
2610         }
2611
2612
2613         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2614                 if (!adev->ip_blocks[i].status.sw)
2615                         continue;
2616
2617                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2618                         amdgpu_ucode_free_bo(adev);
2619                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2620                         amdgpu_device_wb_fini(adev);
2621                         amdgpu_device_vram_scratch_fini(adev);
2622                         amdgpu_ib_pool_fini(adev);
2623                 }
2624
2625                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2626                 /* XXX handle errors */
2627                 if (r) {
2628                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2629                                   adev->ip_blocks[i].version->funcs->name, r);
2630                 }
2631                 adev->ip_blocks[i].status.sw = false;
2632                 adev->ip_blocks[i].status.valid = false;
2633         }
2634
2635         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2636                 if (!adev->ip_blocks[i].status.late_initialized)
2637                         continue;
2638                 if (adev->ip_blocks[i].version->funcs->late_fini)
2639                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2640                 adev->ip_blocks[i].status.late_initialized = false;
2641         }
2642
2643         amdgpu_ras_fini(adev);
2644
2645         if (amdgpu_sriov_vf(adev))
2646                 if (amdgpu_virt_release_full_gpu(adev, false))
2647                         DRM_ERROR("failed to release exclusive mode on fini\n");
2648
2649         return 0;
2650 }
2651
2652 /**
2653  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2654  *
2655  * @work: work_struct.
2656  */
2657 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2658 {
2659         struct amdgpu_device *adev =
2660                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2661         int r;
2662
2663         r = amdgpu_ib_ring_tests(adev);
2664         if (r)
2665                 DRM_ERROR("ib ring test failed (%d).\n", r);
2666 }
2667
2668 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2669 {
2670         struct amdgpu_device *adev =
2671                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2672
2673         mutex_lock(&adev->gfx.gfx_off_mutex);
2674         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2675                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2676                         adev->gfx.gfx_off_state = true;
2677         }
2678         mutex_unlock(&adev->gfx.gfx_off_mutex);
2679 }
2680
2681 /**
2682  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2683  *
2684  * @adev: amdgpu_device pointer
2685  *
2686  * Main suspend function for hardware IPs.  The list of all the hardware
2687  * IPs that make up the asic is walked, clockgating is disabled and the
2688  * suspend callbacks are run.  suspend puts the hardware and software state
2689  * in each IP into a state suitable for suspend.
2690  * Returns 0 on success, negative error code on failure.
2691  */
2692 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2693 {
2694         int i, r;
2695
2696         if (adev->in_poweroff_reboot_com ||
2697             !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev)) {
2698                 amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2699                 amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2700         }
2701
2702         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2703                 if (!adev->ip_blocks[i].status.valid)
2704                         continue;
2705
2706                 /* displays are handled separately */
2707                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2708                         continue;
2709
2710                 /* XXX handle errors */
2711                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2712                 /* XXX handle errors */
2713                 if (r) {
2714                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2715                                   adev->ip_blocks[i].version->funcs->name, r);
2716                         return r;
2717                 }
2718
2719                 adev->ip_blocks[i].status.hw = false;
2720         }
2721
2722         return 0;
2723 }
2724
2725 /**
2726  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2727  *
2728  * @adev: amdgpu_device pointer
2729  *
2730  * Main suspend function for hardware IPs.  The list of all the hardware
2731  * IPs that make up the asic is walked, clockgating is disabled and the
2732  * suspend callbacks are run.  suspend puts the hardware and software state
2733  * in each IP into a state suitable for suspend.
2734  * Returns 0 on success, negative error code on failure.
2735  */
2736 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2737 {
2738         int i, r;
2739
2740         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2741                 if (!adev->ip_blocks[i].status.valid)
2742                         continue;
2743                 /* displays are handled in phase1 */
2744                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2745                         continue;
2746                 /* PSP lost connection when err_event_athub occurs */
2747                 if (amdgpu_ras_intr_triggered() &&
2748                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2749                         adev->ip_blocks[i].status.hw = false;
2750                         continue;
2751                 }
2752
2753                 /* skip unnecessary suspend if we have not initialized them yet */
2754                 if (adev->gmc.xgmi.pending_reset &&
2755                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2756                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
2757                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2758                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
2759                         adev->ip_blocks[i].status.hw = false;
2760                         continue;
2761                 }
2762                 /* XXX handle errors */
2763                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2764                 /* XXX handle errors */
2765                 if (r) {
2766                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2767                                   adev->ip_blocks[i].version->funcs->name, r);
2768                 }
2769                 adev->ip_blocks[i].status.hw = false;
2770                 /* handle putting the SMC in the appropriate state */
2771                 if (!amdgpu_sriov_vf(adev)) {
2772                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2773                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2774                                 if (r) {
2775                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2776                                                         adev->mp1_state, r);
2777                                         return r;
2778                                 }
2779                         }
2780                 }
2781                 adev->ip_blocks[i].status.hw = false;
2782         }
2783
2784         return 0;
2785 }
2786
2787 /**
2788  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2789  *
2790  * @adev: amdgpu_device pointer
2791  *
2792  * Main suspend function for hardware IPs.  The list of all the hardware
2793  * IPs that make up the asic is walked, clockgating is disabled and the
2794  * suspend callbacks are run.  suspend puts the hardware and software state
2795  * in each IP into a state suitable for suspend.
2796  * Returns 0 on success, negative error code on failure.
2797  */
2798 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2799 {
2800         int r;
2801
2802         if (amdgpu_sriov_vf(adev)) {
2803                 amdgpu_virt_fini_data_exchange(adev);
2804                 amdgpu_virt_request_full_gpu(adev, false);
2805         }
2806
2807         r = amdgpu_device_ip_suspend_phase1(adev);
2808         if (r)
2809                 return r;
2810         r = amdgpu_device_ip_suspend_phase2(adev);
2811
2812         if (amdgpu_sriov_vf(adev))
2813                 amdgpu_virt_release_full_gpu(adev, false);
2814
2815         return r;
2816 }
2817
2818 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2819 {
2820         int i, r;
2821
2822         static enum amd_ip_block_type ip_order[] = {
2823                 AMD_IP_BLOCK_TYPE_GMC,
2824                 AMD_IP_BLOCK_TYPE_COMMON,
2825                 AMD_IP_BLOCK_TYPE_PSP,
2826                 AMD_IP_BLOCK_TYPE_IH,
2827         };
2828
2829         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2830                 int j;
2831                 struct amdgpu_ip_block *block;
2832
2833                 block = &adev->ip_blocks[i];
2834                 block->status.hw = false;
2835
2836                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
2837
2838                         if (block->version->type != ip_order[j] ||
2839                                 !block->status.valid)
2840                                 continue;
2841
2842                         r = block->version->funcs->hw_init(adev);
2843                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2844                         if (r)
2845                                 return r;
2846                         block->status.hw = true;
2847                 }
2848         }
2849
2850         return 0;
2851 }
2852
2853 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2854 {
2855         int i, r;
2856
2857         static enum amd_ip_block_type ip_order[] = {
2858                 AMD_IP_BLOCK_TYPE_SMC,
2859                 AMD_IP_BLOCK_TYPE_DCE,
2860                 AMD_IP_BLOCK_TYPE_GFX,
2861                 AMD_IP_BLOCK_TYPE_SDMA,
2862                 AMD_IP_BLOCK_TYPE_UVD,
2863                 AMD_IP_BLOCK_TYPE_VCE,
2864                 AMD_IP_BLOCK_TYPE_VCN
2865         };
2866
2867         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2868                 int j;
2869                 struct amdgpu_ip_block *block;
2870
2871                 for (j = 0; j < adev->num_ip_blocks; j++) {
2872                         block = &adev->ip_blocks[j];
2873
2874                         if (block->version->type != ip_order[i] ||
2875                                 !block->status.valid ||
2876                                 block->status.hw)
2877                                 continue;
2878
2879                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2880                                 r = block->version->funcs->resume(adev);
2881                         else
2882                                 r = block->version->funcs->hw_init(adev);
2883
2884                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2885                         if (r)
2886                                 return r;
2887                         block->status.hw = true;
2888                 }
2889         }
2890
2891         return 0;
2892 }
2893
2894 /**
2895  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2896  *
2897  * @adev: amdgpu_device pointer
2898  *
2899  * First resume function for hardware IPs.  The list of all the hardware
2900  * IPs that make up the asic is walked and the resume callbacks are run for
2901  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2902  * after a suspend and updates the software state as necessary.  This
2903  * function is also used for restoring the GPU after a GPU reset.
2904  * Returns 0 on success, negative error code on failure.
2905  */
2906 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2907 {
2908         int i, r;
2909
2910         for (i = 0; i < adev->num_ip_blocks; i++) {
2911                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2912                         continue;
2913                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2914                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2915                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2916
2917                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2918                         if (r) {
2919                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2920                                           adev->ip_blocks[i].version->funcs->name, r);
2921                                 return r;
2922                         }
2923                         adev->ip_blocks[i].status.hw = true;
2924                 }
2925         }
2926
2927         return 0;
2928 }
2929
2930 /**
2931  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2932  *
2933  * @adev: amdgpu_device pointer
2934  *
2935  * Second resume function for hardware IPs.  The list of all the hardware
2936  * IPs that make up the asic is walked and the resume callbacks are run for
2937  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2938  * functional state after a suspend and updates the software state as
2939  * necessary.  This function is also used for restoring the GPU after a GPU
2940  * reset.
2941  * Returns 0 on success, negative error code on failure.
2942  */
2943 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2944 {
2945         int i, r;
2946
2947         for (i = 0; i < adev->num_ip_blocks; i++) {
2948                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2949                         continue;
2950                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2951                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2952                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2953                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2954                         continue;
2955                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2956                 if (r) {
2957                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2958                                   adev->ip_blocks[i].version->funcs->name, r);
2959                         return r;
2960                 }
2961                 adev->ip_blocks[i].status.hw = true;
2962         }
2963
2964         return 0;
2965 }
2966
2967 /**
2968  * amdgpu_device_ip_resume - run resume for hardware IPs
2969  *
2970  * @adev: amdgpu_device pointer
2971  *
2972  * Main resume function for hardware IPs.  The hardware IPs
2973  * are split into two resume functions because they are
2974  * also used in recovering from a GPU reset and some additional
2975  * steps need to be taken between them.  In this case (S3/S4) they are
2976  * run sequentially.
2977  * Returns 0 on success, negative error code on failure.
2978  */
2979 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2980 {
2981         int r;
2982
2983         r = amdgpu_device_ip_resume_phase1(adev);
2984         if (r)
2985                 return r;
2986
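        /*
         * Firmware loading (PSP bring-up and microcode load) sits between the
         * two phases so that the engine blocks resumed in phase 2 find their
         * firmware already in place.
         */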
2987         r = amdgpu_device_fw_loading(adev);
2988         if (r)
2989                 return r;
2990
2991         r = amdgpu_device_ip_resume_phase2(adev);
2992
2993         return r;
2994 }
2995
2996 /**
2997  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2998  *
2999  * @adev: amdgpu_device pointer
3000  *
3001  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3002  */
3003 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3004 {
3005         if (amdgpu_sriov_vf(adev)) {
3006                 if (adev->is_atom_fw) {
3007                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
3008                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3009                 } else {
3010                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3011                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3012                 }
3013
3014                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3015                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3016         }
3017 }
3018
3019 /**
3020  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3021  *
3022  * @asic_type: AMD asic type
3023  *
3024  * Check if there is DC (new modesetting infrastructure) support for an asic.
3025  * returns true if DC has support, false if not.
3026  */
3027 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3028 {
3029         switch (asic_type) {
3030 #if defined(CONFIG_DRM_AMD_DC)
3031 #if defined(CONFIG_DRM_AMD_DC_SI)
3032         case CHIP_TAHITI:
3033         case CHIP_PITCAIRN:
3034         case CHIP_VERDE:
3035         case CHIP_OLAND:
3036 #endif
3037         case CHIP_BONAIRE:
3038         case CHIP_KAVERI:
3039         case CHIP_KABINI:
3040         case CHIP_MULLINS:
3041                 /*
3042                  * We have systems in the wild with these ASICs that require
3043                  * LVDS and VGA support which is not supported with DC.
3044                  *
3045                  * Fallback to the non-DC driver here by default so as not to
3046                  * cause regressions.
3047                  */
3048                 return amdgpu_dc > 0;
3049         case CHIP_HAWAII:
3050         case CHIP_CARRIZO:
3051         case CHIP_STONEY:
3052         case CHIP_POLARIS10:
3053         case CHIP_POLARIS11:
3054         case CHIP_POLARIS12:
3055         case CHIP_VEGAM:
3056         case CHIP_TONGA:
3057         case CHIP_FIJI:
3058         case CHIP_VEGA10:
3059         case CHIP_VEGA12:
3060         case CHIP_VEGA20:
3061 #if defined(CONFIG_DRM_AMD_DC_DCN)
3062         case CHIP_RAVEN:
3063         case CHIP_NAVI10:
3064         case CHIP_NAVI14:
3065         case CHIP_NAVI12:
3066         case CHIP_RENOIR:
3067         case CHIP_SIENNA_CICHLID:
3068         case CHIP_NAVY_FLOUNDER:
3069         case CHIP_DIMGREY_CAVEFISH:
3070         case CHIP_VANGOGH:
3071 #endif
3072                 return amdgpu_dc != 0;
3073 #endif
3074         default:
3075                 if (amdgpu_dc > 0)
3076                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3077                                          "but isn't supported by ASIC, ignoring\n");
3078                 return false;
3079         }
3080 }
3081
3082 /**
3083  * amdgpu_device_has_dc_support - check if dc is supported
3084  *
3085  * @adev: amdgpu_device pointer
3086  *
3087  * Returns true for supported, false for not supported
3088  */
3089 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3090 {
3091         if (amdgpu_sriov_vf(adev) || adev->enable_virtual_display)
3092                 return false;
3093
3094         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3095 }
3096
3097
3098 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3099 {
3100         struct amdgpu_device *adev =
3101                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3102         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3103
3104         /* It's a bug to not have a hive within this function */
3105         if (WARN_ON(!hive))
3106                 return;
3107
3108         /*
3109          * Use task barrier to synchronize all xgmi reset works across the
3110          * hive. task_barrier_enter and task_barrier_exit will block
3111          * until all the threads running the xgmi reset works reach
3112          * those points. task_barrier_full will do both blocks.
3113          */
3114         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3115
3116                 task_barrier_enter(&hive->tb);
3117                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3118
3119                 if (adev->asic_reset_res)
3120                         goto fail;
3121
3122                 task_barrier_exit(&hive->tb);
3123                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3124
3125                 if (adev->asic_reset_res)
3126                         goto fail;
3127
3128                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
3129                         adev->mmhub.funcs->reset_ras_error_count(adev);
3130         } else {
3131
3132                 task_barrier_full(&hive->tb);
3133                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3134         }
3135
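        /* Reached on both the success and error paths; only warn when a reset error was recorded. */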
3136 fail:
3137         if (adev->asic_reset_res)
3138                 DRM_WARN("ASIC reset failed with error %d for drm dev %s",
3139                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3140         amdgpu_put_xgmi_hive(hive);
3141 }
3142
3143 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3144 {
3145         char *input = amdgpu_lockup_timeout;
3146         char *timeout_setting = NULL;
3147         int index = 0;
3148         long timeout;
3149         int ret = 0;
3150
3151         /*
3152          * By default the timeout for non-compute jobs is 10000 ms and
3153          * no timeout is enforced on compute jobs.  In SR-IOV or
3154          * passthrough mode, the default timeout for compute jobs is
3155          * 60000 ms.
3156          */
3157         adev->gfx_timeout = msecs_to_jiffies(10000);
3158         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3159         if (amdgpu_sriov_vf(adev))
3160                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3161                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3162         else if (amdgpu_passthrough(adev))
3163                 adev->compute_timeout =  msecs_to_jiffies(60000);
3164         else
3165                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
3166
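        /*
         * amdgpu.lockup_timeout takes up to four comma-separated values that
         * are applied in the order gfx, compute, sdma, video.  A value of 0
         * keeps the default for that engine and a negative value disables
         * the timeout (MAX_SCHEDULE_TIMEOUT).
         */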
3167         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3168                 while ((timeout_setting = strsep(&input, ",")) &&
3169                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3170                         ret = kstrtol(timeout_setting, 0, &timeout);
3171                         if (ret)
3172                                 return ret;
3173
3174                         if (timeout == 0) {
3175                                 index++;
3176                                 continue;
3177                         } else if (timeout < 0) {
3178                                 timeout = MAX_SCHEDULE_TIMEOUT;
3179                         } else {
3180                                 timeout = msecs_to_jiffies(timeout);
3181                         }
3182
3183                         switch (index++) {
3184                         case 0:
3185                                 adev->gfx_timeout = timeout;
3186                                 break;
3187                         case 1:
3188                                 adev->compute_timeout = timeout;
3189                                 break;
3190                         case 2:
3191                                 adev->sdma_timeout = timeout;
3192                                 break;
3193                         case 3:
3194                                 adev->video_timeout = timeout;
3195                                 break;
3196                         default:
3197                                 break;
3198                         }
3199                 }
3200                 /*
3201                  * There is only one value specified and
3202                  * it should apply to all non-compute jobs.
3203                  */
3204                 if (index == 1) {
3205                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3206                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3207                                 adev->compute_timeout = adev->gfx_timeout;
3208                 }
3209         }
3210
3211         return ret;
3212 }
3213
3214 static const struct attribute *amdgpu_dev_attributes[] = {
3215         &dev_attr_product_name.attr,
3216         &dev_attr_product_number.attr,
3217         &dev_attr_serial_number.attr,
3218         &dev_attr_pcie_replay_count.attr,
3219         NULL
3220 };
3221
3222
3223 /**
3224  * amdgpu_device_init - initialize the driver
3225  *
3226  * @adev: amdgpu_device pointer
3227  * @flags: driver flags
3228  *
3229  * Initializes the driver info and hw (all asics).
3230  * Returns 0 for success or an error on failure.
3231  * Called at driver startup.
3232  */
3233 int amdgpu_device_init(struct amdgpu_device *adev,
3234                        uint32_t flags)
3235 {
3236         struct drm_device *ddev = adev_to_drm(adev);
3237         struct pci_dev *pdev = adev->pdev;
3238         int r, i;
3239         bool atpx = false;
3240         u32 max_MBps;
3241
3242         adev->shutdown = false;
3243         adev->flags = flags;
3244
3245         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3246                 adev->asic_type = amdgpu_force_asic_type;
3247         else
3248                 adev->asic_type = flags & AMD_ASIC_MASK;
3249
3250         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3251         if (amdgpu_emu_mode == 1)
3252                 adev->usec_timeout *= 10;
3253         adev->gmc.gart_size = 512 * 1024 * 1024;
3254         adev->accel_working = false;
3255         adev->num_rings = 0;
3256         adev->mman.buffer_funcs = NULL;
3257         adev->mman.buffer_funcs_ring = NULL;
3258         adev->vm_manager.vm_pte_funcs = NULL;
3259         adev->vm_manager.vm_pte_num_scheds = 0;
3260         adev->gmc.gmc_funcs = NULL;
3261         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3262         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3263
3264         adev->smc_rreg = &amdgpu_invalid_rreg;
3265         adev->smc_wreg = &amdgpu_invalid_wreg;
3266         adev->pcie_rreg = &amdgpu_invalid_rreg;
3267         adev->pcie_wreg = &amdgpu_invalid_wreg;
3268         adev->pciep_rreg = &amdgpu_invalid_rreg;
3269         adev->pciep_wreg = &amdgpu_invalid_wreg;
3270         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3271         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3272         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3273         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3274         adev->didt_rreg = &amdgpu_invalid_rreg;
3275         adev->didt_wreg = &amdgpu_invalid_wreg;
3276         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3277         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3278         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3279         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3280
3281         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3282                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3283                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3284
3285         /* mutex initializations are all done here so we
3286          * can recall functions without having locking issues */
3287         atomic_set(&adev->irq.ih.lock, 0);
3288         mutex_init(&adev->firmware.mutex);
3289         mutex_init(&adev->pm.mutex);
3290         mutex_init(&adev->gfx.gpu_clock_mutex);
3291         mutex_init(&adev->srbm_mutex);
3292         mutex_init(&adev->gfx.pipe_reserve_mutex);
3293         mutex_init(&adev->gfx.gfx_off_mutex);
3294         mutex_init(&adev->grbm_idx_mutex);
3295         mutex_init(&adev->mn_lock);
3296         mutex_init(&adev->virt.vf_errors.lock);
3297         hash_init(adev->mn_hash);
3298         atomic_set(&adev->in_gpu_reset, 0);
3299         init_rwsem(&adev->reset_sem);
3300         mutex_init(&adev->psp.mutex);
3301         mutex_init(&adev->notifier_lock);
3302
3303         r = amdgpu_device_check_arguments(adev);
3304         if (r)
3305                 return r;
3306
3307         spin_lock_init(&adev->mmio_idx_lock);
3308         spin_lock_init(&adev->smc_idx_lock);
3309         spin_lock_init(&adev->pcie_idx_lock);
3310         spin_lock_init(&adev->uvd_ctx_idx_lock);
3311         spin_lock_init(&adev->didt_idx_lock);
3312         spin_lock_init(&adev->gc_cac_idx_lock);
3313         spin_lock_init(&adev->se_cac_idx_lock);
3314         spin_lock_init(&adev->audio_endpt_idx_lock);
3315         spin_lock_init(&adev->mm_stats.lock);
3316
3317         INIT_LIST_HEAD(&adev->shadow_list);
3318         mutex_init(&adev->shadow_list_lock);
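        /* The shadow list is walked by amdgpu_device_recover_vram() after a
         * reset to restore VRAM BOs from their GTT shadows. */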
3319
3320         INIT_LIST_HEAD(&adev->reset_list);
3321
3322         INIT_DELAYED_WORK(&adev->delayed_init_work,
3323                           amdgpu_device_delayed_init_work_handler);
3324         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3325                           amdgpu_device_delay_enable_gfx_off);
3326
3327         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3328
3329         adev->gfx.gfx_off_req_count = 1;
3330         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3331
3332         atomic_set(&adev->throttling_logging_enabled, 1);
3333         /*
3334          * If throttling continues, logging will be performed every minute
3335          * to avoid log flooding. "-1" is subtracted since the thermal
3336          * throttling interrupt comes every second. Thus, the total logging
3337          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3338          * for the throttling interrupt) = 60 seconds.
3339          */
3340         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3341         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3342
3343         /* Registers mapping */
3344         /* TODO: block userspace mapping of io register */
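        /* Bonaire and newer expose the MMIO register aperture in BAR 5; older (SI) parts use BAR 2. */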
3345         if (adev->asic_type >= CHIP_BONAIRE) {
3346                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3347                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3348         } else {
3349                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3350                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3351         }
3352
3353         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3354         if (adev->rmmio == NULL) {
3355                 return -ENOMEM;
3356         }
3357         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3358         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3359
3360         /* io port mapping */
3361         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3362                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3363                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3364                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3365                         break;
3366                 }
3367         }
3368         if (adev->rio_mem == NULL)
3369                 DRM_INFO("PCI I/O BAR is not found.\n");
3370
3371         /* enable PCIE atomic ops */
3372         r = pci_enable_atomic_ops_to_root(adev->pdev,
3373                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3374                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3375         if (r) {
3376                 adev->have_atomics_support = false;
3377                 DRM_INFO("PCIE atomic ops are not supported\n");
3378         } else {
3379                 adev->have_atomics_support = true;
3380         }
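        /*
         * have_atomics_support is consulted later by compute/KFD code paths
         * that depend on PCIe atomics.
         */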
3381
3382         amdgpu_device_get_pcie_info(adev);
3383
3384         if (amdgpu_mcbp)
3385                 DRM_INFO("MCBP is enabled\n");
3386
3387         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3388                 adev->enable_mes = true;
3389
3390         /* detect hw virtualization here */
3391         amdgpu_detect_virtualization(adev);
3392
3393         r = amdgpu_device_get_job_timeout_settings(adev);
3394         if (r) {
3395                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3396                 goto failed_unmap;
3397         }
3398
3399         /* early init functions */
3400         r = amdgpu_device_ip_early_init(adev);
3401         if (r)
3402                 goto failed_unmap;
3403
3404         /* doorbell bar mapping and doorbell index init*/
3405         amdgpu_device_doorbell_init(adev);
3406
3407         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3408         /* this will fail for cards that aren't VGA class devices, just
3409          * ignore it */
3410         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3411                 vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3412
3413         if (amdgpu_device_supports_atpx(ddev))
3414                 atpx = true;
3415         if (amdgpu_has_atpx() &&
3416             (amdgpu_is_atpx_hybrid() ||
3417              amdgpu_has_atpx_dgpu_power_cntl()) &&
3418             !pci_is_thunderbolt_attached(adev->pdev))
3419                 vga_switcheroo_register_client(adev->pdev,
3420                                                &amdgpu_switcheroo_ops, atpx);
3421         if (atpx)
3422                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3423
3424         if (amdgpu_emu_mode == 1) {
3425                 /* post the asic on emulation mode */
3426                 emu_soc_asic_init(adev);
3427                 goto fence_driver_init;
3428         }
3429
3430         /* detect if we are with an SRIOV vbios */
3431         amdgpu_device_detect_sriov_bios(adev);
3432
3433         /* check if we need to reset the asic
3434          *  E.g., driver was not cleanly unloaded previously, etc.
3435          */
3436         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3437                 if (adev->gmc.xgmi.num_physical_nodes) {
3438                         dev_info(adev->dev, "Pending hive reset.\n");
3439                         adev->gmc.xgmi.pending_reset = true;
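                        /*
                         * The actual hive reset is deferred; mgpu_info.delayed_reset_work
                         * is queued at the end of this function for that purpose.
                         */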
3440                         /* Only init the IP blocks the SMU needs to handle the reset */
3441                         for (i = 0; i < adev->num_ip_blocks; i++) {
3442                                 if (!adev->ip_blocks[i].status.valid)
3443                                         continue;
3444                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3445                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3446                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3447                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3448                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3449                                                 adev->ip_blocks[i].version->funcs->name);
3450                                         adev->ip_blocks[i].status.hw = true;
3451                                 }
3452                         }
3453                 } else {
3454                         r = amdgpu_asic_reset(adev);
3455                         if (r) {
3456                                 dev_err(adev->dev, "asic reset on init failed\n");
3457                                 goto failed;
3458                         }
3459                 }
3460         }
3461
3462         pci_enable_pcie_error_reporting(adev->pdev);
3463
3464         /* Post card if necessary */
3465         if (amdgpu_device_need_post(adev)) {
3466                 if (!adev->bios) {
3467                         dev_err(adev->dev, "no vBIOS found\n");
3468                         r = -EINVAL;
3469                         goto failed;
3470                 }
3471                 DRM_INFO("GPU posting now...\n");
3472                 r = amdgpu_device_asic_init(adev);
3473                 if (r) {
3474                         dev_err(adev->dev, "gpu post error!\n");
3475                         goto failed;
3476                 }
3477         }
3478
3479         if (adev->is_atom_fw) {
3480                 /* Initialize clocks */
3481                 r = amdgpu_atomfirmware_get_clock_info(adev);
3482                 if (r) {
3483                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3484                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3485                         goto failed;
3486                 }
3487         } else {
3488                 /* Initialize clocks */
3489                 r = amdgpu_atombios_get_clock_info(adev);
3490                 if (r) {
3491                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3492                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3493                         goto failed;
3494                 }
3495                 /* init i2c buses */
3496                 if (!amdgpu_device_has_dc_support(adev))
3497                         amdgpu_atombios_i2c_init(adev);
3498         }
3499
3500 fence_driver_init:
3501         /* Fence driver */
3502         r = amdgpu_fence_driver_init(adev);
3503         if (r) {
3504                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3505                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3506                 goto failed;
3507         }
3508
3509         /* init the mode config */
3510         drm_mode_config_init(adev_to_drm(adev));
3511
3512         r = amdgpu_device_ip_init(adev);
3513         if (r) {
3514                 /* failed in exclusive mode due to timeout */
3515                 if (amdgpu_sriov_vf(adev) &&
3516                     !amdgpu_sriov_runtime(adev) &&
3517                     amdgpu_virt_mmio_blocked(adev) &&
3518                     !amdgpu_virt_wait_reset(adev)) {
3519                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3520                         /* Don't send request since VF is inactive. */
3521                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3522                         adev->virt.ops = NULL;
3523                         r = -EAGAIN;
3524                         goto failed;
3525                 }
3526                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3527                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3528                 goto failed;
3529         }
3530
3531         dev_info(adev->dev,
3532                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3533                         adev->gfx.config.max_shader_engines,
3534                         adev->gfx.config.max_sh_per_se,
3535                         adev->gfx.config.max_cu_per_sh,
3536                         adev->gfx.cu_info.number);
3537
3538         adev->accel_working = true;
3539
3540         amdgpu_vm_check_compute_bug(adev);
3541
3542         /* Initialize the buffer migration limit. */
3543         if (amdgpu_moverate >= 0)
3544                 max_MBps = amdgpu_moverate;
3545         else
3546                 max_MBps = 8; /* Allow 8 MB/s. */
3547         /* Get a log2 for easy divisions. */
3548         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
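        /* The CS move-throttling code uses this shift to convert between
         * bytes moved and time without doing divisions. */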
3549
3550         amdgpu_fbdev_init(adev);
3551
3552         r = amdgpu_pm_sysfs_init(adev);
3553         if (r) {
3554                 adev->pm_sysfs_en = false;
3555                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3556         } else
3557                 adev->pm_sysfs_en = true;
3558
3559         r = amdgpu_ucode_sysfs_init(adev);
3560         if (r) {
3561                 adev->ucode_sysfs_en = false;
3562                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3563         } else
3564                 adev->ucode_sysfs_en = true;
3565
3566         if ((amdgpu_testing & 1)) {
3567                 if (adev->accel_working)
3568                         amdgpu_test_moves(adev);
3569                 else
3570                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3571         }
3572         if (amdgpu_benchmarking) {
3573                 if (adev->accel_working)
3574                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3575                 else
3576                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3577         }
3578
3579         /*
3580          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3581          * Otherwise the mgpu fan boost feature will be skipped because the
3582          * gpu instance count would be too low.
3583          */
3584         amdgpu_register_gpu_instance(adev);
3585
3586         /* enable clockgating, etc. after the IB tests, since some blocks require
3587          * explicit gating rather than handling it automatically.
3588          */
3589         if (!adev->gmc.xgmi.pending_reset) {
3590                 r = amdgpu_device_ip_late_init(adev);
3591                 if (r) {
3592                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3593                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3594                         goto failed;
3595                 }
3596                 /* must succeed. */
3597                 amdgpu_ras_resume(adev);
3598                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3599                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3600         }
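        /*
         * delayed_init_work runs the deferred IB ring tests; under SR-IOV it
         * is flushed here so the tests do not run concurrently with the rest
         * of the init sequence.
         */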
3601
3602         if (amdgpu_sriov_vf(adev))
3603                 flush_delayed_work(&adev->delayed_init_work);
3604
3605         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3606         if (r)
3607                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3608
3609         if (IS_ENABLED(CONFIG_PERF_EVENTS)) {
3610                 r = amdgpu_pmu_init(adev);
3611                 if (r)
3612                         dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3613         }

3614         /* Keep the stored PCI config space at hand for restore after a sudden PCI error */
3615         if (amdgpu_device_cache_pci_state(adev->pdev))
3616                 pci_restore_state(pdev);
3617
3618         if (adev->gmc.xgmi.pending_reset)
3619                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3620                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3621
3622         return 0;
3623
3624 failed:
3625         amdgpu_vf_error_trans_all(adev);
3626         if (atpx)
3627                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3628
3629 failed_unmap:
3630         iounmap(adev->rmmio);
3631         adev->rmmio = NULL;
3632
3633         return r;
3634 }
3635
3636 /**
3637  * amdgpu_device_fini - tear down the driver
3638  *
3639  * @adev: amdgpu_device pointer
3640  *
3641  * Tear down the driver info (all asics).
3642  * Called at driver shutdown.
3643  */
3644 void amdgpu_device_fini(struct amdgpu_device *adev)
3645 {
3646         dev_info(adev->dev, "amdgpu: finishing device.\n");
3647         flush_delayed_work(&adev->delayed_init_work);
3648         ttm_bo_lock_delayed_workqueue(&adev->mman.bdev);
3649         adev->shutdown = true;
3650
3651         kfree(adev->pci_state);
3652
3653         /* make sure the IB tests have finished before entering exclusive mode
3654          * to avoid preemption on the IB tests
3655          */
3656         if (amdgpu_sriov_vf(adev)) {
3657                 amdgpu_virt_request_full_gpu(adev, false);
3658                 amdgpu_virt_fini_data_exchange(adev);
3659         }
3660
3661         /* disable all interrupts */
3662         amdgpu_irq_disable_all(adev);
3663         if (adev->mode_info.mode_config_initialized) {
3664                 if (!amdgpu_device_has_dc_support(adev))
3665                         drm_helper_force_disable_all(adev_to_drm(adev));
3666                 else
3667                         drm_atomic_helper_shutdown(adev_to_drm(adev));
3668         }
3669         amdgpu_fence_driver_fini(adev);
3670         if (adev->pm_sysfs_en)
3671                 amdgpu_pm_sysfs_fini(adev);
3672         amdgpu_fbdev_fini(adev);
3673         amdgpu_device_ip_fini(adev);
3674         release_firmware(adev->firmware.gpu_info_fw);
3675         adev->firmware.gpu_info_fw = NULL;
3676         adev->accel_working = false;
3677         /* free i2c buses */
3678         if (!amdgpu_device_has_dc_support(adev))
3679                 amdgpu_i2c_fini(adev);
3680
3681         if (amdgpu_emu_mode != 1)
3682                 amdgpu_atombios_fini(adev);
3683
3684         kfree(adev->bios);
3685         adev->bios = NULL;
3686         if (amdgpu_has_atpx() &&
3687             (amdgpu_is_atpx_hybrid() ||
3688              amdgpu_has_atpx_dgpu_power_cntl()) &&
3689             !pci_is_thunderbolt_attached(adev->pdev))
3690                 vga_switcheroo_unregister_client(adev->pdev);
3691         if (amdgpu_device_supports_atpx(adev_to_drm(adev)))
3692                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3693         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3694                 vga_client_register(adev->pdev, NULL, NULL, NULL);
3695         if (adev->rio_mem)
3696                 pci_iounmap(adev->pdev, adev->rio_mem);
3697         adev->rio_mem = NULL;
3698         iounmap(adev->rmmio);
3699         adev->rmmio = NULL;
3700         amdgpu_device_doorbell_fini(adev);
3701
3702         if (adev->ucode_sysfs_en)
3703                 amdgpu_ucode_sysfs_fini(adev);
3704
3705         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3706         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3707                 amdgpu_pmu_fini(adev);
3708         if (adev->mman.discovery_bin)
3709                 amdgpu_discovery_fini(adev);
3710 }
3711
3712
3713 /*
3714  * Suspend & resume.
3715  */
3716 /**
3717  * amdgpu_device_suspend - initiate device suspend
3718  *
3719  * @dev: drm dev pointer
3720  * @fbcon : notify the fbdev of suspend
3721  *
3722  * Puts the hw in the suspend state (all asics).
3723  * Returns 0 for success or an error on failure.
3724  * Called at driver suspend.
3725  */
3726 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3727 {
3728         struct amdgpu_device *adev;
3729         struct drm_crtc *crtc;
3730         struct drm_connector *connector;
3731         struct drm_connector_list_iter iter;
3732         int r;
3733
3734         adev = drm_to_adev(dev);
3735
3736         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3737                 return 0;
3738
3739         adev->in_suspend = true;
3740         drm_kms_helper_poll_disable(dev);
3741
3742         if (fbcon)
3743                 amdgpu_fbdev_set_suspend(adev, 1);
3744
3745         cancel_delayed_work_sync(&adev->delayed_init_work);
3746
3747         if (!amdgpu_device_has_dc_support(adev)) {
3748                 /* turn off display hw */
3749                 drm_modeset_lock_all(dev);
3750                 drm_connector_list_iter_begin(dev, &iter);
3751                 drm_for_each_connector_iter(connector, &iter)
3752                         drm_helper_connector_dpms(connector,
3753                                                   DRM_MODE_DPMS_OFF);
3754                 drm_connector_list_iter_end(&iter);
3755                 drm_modeset_unlock_all(dev);
3756                 /* unpin the front buffers and cursors */
3757                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3758                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3759                         struct drm_framebuffer *fb = crtc->primary->fb;
3760                         struct amdgpu_bo *robj;
3761
3762                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3763                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3764                                 r = amdgpu_bo_reserve(aobj, true);
3765                                 if (r == 0) {
3766                                         amdgpu_bo_unpin(aobj);
3767                                         amdgpu_bo_unreserve(aobj);
3768                                 }
3769                         }
3770
3771                         if (fb == NULL || fb->obj[0] == NULL) {
3772                                 continue;
3773                         }
3774                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3775                         /* don't unpin kernel fb objects */
3776                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3777                                 r = amdgpu_bo_reserve(robj, true);
3778                                 if (r == 0) {
3779                                         amdgpu_bo_unpin(robj);
3780                                         amdgpu_bo_unreserve(robj);
3781                                 }
3782                         }
3783                 }
3784         }
3785
3786         amdgpu_ras_suspend(adev);
3787
3788         r = amdgpu_device_ip_suspend_phase1(adev);
3789
3790         amdgpu_amdkfd_suspend(adev, adev->in_runpm);
3791
3792         /* evict vram memory */
3793         amdgpu_bo_evict_vram(adev);
3794
3795         amdgpu_fence_driver_suspend(adev);
3796
3797         if (adev->in_poweroff_reboot_com ||
3798             !amdgpu_acpi_is_s0ix_supported(adev) || amdgpu_in_reset(adev))
3799                 r = amdgpu_device_ip_suspend_phase2(adev);
3800         else
3801                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D3Entry);
3802         /* evict remaining vram memory
3803          * This second call to evict vram is to evict the gart page table
3804          * using the CPU.
3805          */
3806         amdgpu_bo_evict_vram(adev);
3807
3808         return 0;
3809 }
3810
3811 /**
3812  * amdgpu_device_resume - initiate device resume
3813  *
3814  * @dev: drm dev pointer
3815  * @fbcon : notify the fbdev of resume
3816  *
3817  * Bring the hw back to operating state (all asics).
3818  * Returns 0 for success or an error on failure.
3819  * Called at driver resume.
3820  */
3821 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3822 {
3823         struct drm_connector *connector;
3824         struct drm_connector_list_iter iter;
3825         struct amdgpu_device *adev = drm_to_adev(dev);
3826         struct drm_crtc *crtc;
3827         int r = 0;
3828
3829         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3830                 return 0;
3831
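        /*
         * On S0ix-capable platforms suspend skipped the phase-2 IP suspend
         * and only sent a D3Entry notification to the SMU, so send the
         * matching D0Entry notification here.
         */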
3832         if (amdgpu_acpi_is_s0ix_supported(adev))
3833                 amdgpu_gfx_state_change_set(adev, sGpuChangeState_D0Entry);
3834
3835         /* post card */
3836         if (amdgpu_device_need_post(adev)) {
3837                 r = amdgpu_device_asic_init(adev);
3838                 if (r)
3839                         dev_err(adev->dev, "amdgpu asic init failed\n");
3840         }
3841
3842         r = amdgpu_device_ip_resume(adev);
3843         if (r) {
3844                 dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
3845                 return r;
3846         }
3847         amdgpu_fence_driver_resume(adev);
3848
3849
3850         r = amdgpu_device_ip_late_init(adev);
3851         if (r)
3852                 return r;
3853
3854         queue_delayed_work(system_wq, &adev->delayed_init_work,
3855                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3856
3857         if (!amdgpu_device_has_dc_support(adev)) {
3858                 /* pin cursors */
3859                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3860                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3861
3862                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3863                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3864                                 r = amdgpu_bo_reserve(aobj, true);
3865                                 if (r == 0) {
3866                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3867                                         if (r != 0)
3868                                                 dev_err(adev->dev, "Failed to pin cursor BO (%d)\n", r);
3869                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3870                                         amdgpu_bo_unreserve(aobj);
3871                                 }
3872                         }
3873                 }
3874         }
3875         r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
3876         if (r)
3877                 return r;
3878
3879         /* Make sure IB tests flushed */
3880         flush_delayed_work(&adev->delayed_init_work);
3881
3882         /* blat the mode back in */
3883         if (fbcon) {
3884                 if (!amdgpu_device_has_dc_support(adev)) {
3885                         /* pre DCE11 */
3886                         drm_helper_resume_force_mode(dev);
3887
3888                         /* turn on display hw */
3889                         drm_modeset_lock_all(dev);
3890
3891                         drm_connector_list_iter_begin(dev, &iter);
3892                         drm_for_each_connector_iter(connector, &iter)
3893                                 drm_helper_connector_dpms(connector,
3894                                                           DRM_MODE_DPMS_ON);
3895                         drm_connector_list_iter_end(&iter);
3896
3897                         drm_modeset_unlock_all(dev);
3898                 }
3899                 amdgpu_fbdev_set_suspend(adev, 0);
3900         }
3901
3902         drm_kms_helper_poll_enable(dev);
3903
3904         amdgpu_ras_resume(adev);
3905
3906         /*
3907          * Most of the connector probing functions try to acquire runtime pm
3908          * refs to ensure that the GPU is powered on when connector polling is
3909          * performed. Since we're calling this from a runtime PM callback,
3910          * trying to acquire rpm refs will cause us to deadlock.
3911          *
3912          * Since we're guaranteed to be holding the rpm lock, it's safe to
3913          * temporarily disable the rpm helpers so this doesn't deadlock us.
3914          */
3915 #ifdef CONFIG_PM
3916         dev->dev->power.disable_depth++;
3917 #endif
3918         if (!amdgpu_device_has_dc_support(adev))
3919                 drm_helper_hpd_irq_event(dev);
3920         else
3921                 drm_kms_helper_hotplug_event(dev);
3922 #ifdef CONFIG_PM
3923         dev->dev->power.disable_depth--;
3924 #endif
3925         adev->in_suspend = false;
3926
3927         return 0;
3928 }
3929
3930 /**
3931  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3932  *
3933  * @adev: amdgpu_device pointer
3934  *
3935  * The list of all the hardware IPs that make up the asic is walked and
3936  * the check_soft_reset callbacks are run.  check_soft_reset determines
3937  * if the asic is still hung or not.
3938  * Returns true if any of the IPs are still in a hung state, false if not.
3939  */
3940 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3941 {
3942         int i;
3943         bool asic_hang = false;
3944
3945         if (amdgpu_sriov_vf(adev))
3946                 return true;
3947
3948         if (amdgpu_asic_need_full_reset(adev))
3949                 return true;
3950
3951         for (i = 0; i < adev->num_ip_blocks; i++) {
3952                 if (!adev->ip_blocks[i].status.valid)
3953                         continue;
3954                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3955                         adev->ip_blocks[i].status.hang =
3956                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3957                 if (adev->ip_blocks[i].status.hang) {
3958                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3959                         asic_hang = true;
3960                 }
3961         }
3962         return asic_hang;
3963 }
3964
3965 /**
3966  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3967  *
3968  * @adev: amdgpu_device pointer
3969  *
3970  * The list of all the hardware IPs that make up the asic is walked and the
3971  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3972  * handles any IP specific hardware or software state changes that are
3973  * necessary for a soft reset to succeed.
3974  * Returns 0 on success, negative error code on failure.
3975  */
3976 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3977 {
3978         int i, r = 0;
3979
3980         for (i = 0; i < adev->num_ip_blocks; i++) {
3981                 if (!adev->ip_blocks[i].status.valid)
3982                         continue;
3983                 if (adev->ip_blocks[i].status.hang &&
3984                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3985                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3986                         if (r)
3987                                 return r;
3988                 }
3989         }
3990
3991         return 0;
3992 }
3993
3994 /**
3995  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3996  *
3997  * @adev: amdgpu_device pointer
3998  *
3999  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4000  * reset is necessary to recover.
4001  * Returns true if a full asic reset is required, false if not.
4002  */
4003 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4004 {
4005         int i;
4006
4007         if (amdgpu_asic_need_full_reset(adev))
4008                 return true;
4009
4010         for (i = 0; i < adev->num_ip_blocks; i++) {
4011                 if (!adev->ip_blocks[i].status.valid)
4012                         continue;
4013                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4014                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4015                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4016                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4017                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4018                         if (adev->ip_blocks[i].status.hang) {
4019                                 dev_info(adev->dev, "Some blocks need a full reset!\n");
4020                                 return true;
4021                         }
4022                 }
4023         }
4024         return false;
4025 }
4026
4027 /**
4028  * amdgpu_device_ip_soft_reset - do a soft reset
4029  *
4030  * @adev: amdgpu_device pointer
4031  *
4032  * The list of all the hardware IPs that make up the asic is walked and the
4033  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4034  * IP specific hardware or software state changes that are necessary to soft
4035  * reset the IP.
4036  * Returns 0 on success, negative error code on failure.
4037  */
4038 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4039 {
4040         int i, r = 0;
4041
4042         for (i = 0; i < adev->num_ip_blocks; i++) {
4043                 if (!adev->ip_blocks[i].status.valid)
4044                         continue;
4045                 if (adev->ip_blocks[i].status.hang &&
4046                     adev->ip_blocks[i].version->funcs->soft_reset) {
4047                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4048                         if (r)
4049                                 return r;
4050                 }
4051         }
4052
4053         return 0;
4054 }
4055
4056 /**
4057  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4058  *
4059  * @adev: amdgpu_device pointer
4060  *
4061  * The list of all the hardware IPs that make up the asic is walked and the
4062  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4063  * handles any IP specific hardware or software state changes that are
4064  * necessary after the IP has been soft reset.
4065  * Returns 0 on success, negative error code on failure.
4066  */
4067 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4068 {
4069         int i, r = 0;
4070
4071         for (i = 0; i < adev->num_ip_blocks; i++) {
4072                 if (!adev->ip_blocks[i].status.valid)
4073                         continue;
4074                 if (adev->ip_blocks[i].status.hang &&
4075                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4076                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4077                 if (r)
4078                         return r;
4079         }
4080
4081         return 0;
4082 }
4083
4084 /**
4085  * amdgpu_device_recover_vram - Recover some VRAM contents
4086  *
4087  * @adev: amdgpu_device pointer
4088  *
4089  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
4090  * restore things like GPUVM page tables after a GPU reset where
4091  * the contents of VRAM might be lost.
4092  *
4093  * Returns:
4094  * 0 on success, negative error code on failure.
4095  */
4096 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
4097 {
4098         struct dma_fence *fence = NULL, *next = NULL;
4099         struct amdgpu_bo *shadow;
4100         long r = 1, tmo;
4101
4102         if (amdgpu_sriov_runtime(adev))
4103                 tmo = msecs_to_jiffies(8000);
4104         else
4105                 tmo = msecs_to_jiffies(100);
4106
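        /*
         * The restores are pipelined: each loop iteration kicks off the next
         * shadow->VRAM copy and then waits on the fence from the previous
         * one, so copies and waits overlap.
         */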
4107         dev_info(adev->dev, "recover vram bo from shadow start\n");
4108         mutex_lock(&adev->shadow_list_lock);
4109         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
4110
4111                 /* No need to recover an evicted BO */
4112                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
4113                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
4114                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
4115                         continue;
4116
4117                 r = amdgpu_bo_restore_shadow(shadow, &next);
4118                 if (r)
4119                         break;
4120
4121                 if (fence) {
4122                         tmo = dma_fence_wait_timeout(fence, false, tmo);
4123                         dma_fence_put(fence);
4124                         fence = next;
4125                         if (tmo == 0) {
4126                                 r = -ETIMEDOUT;
4127                                 break;
4128                         } else if (tmo < 0) {
4129                                 r = tmo;
4130                                 break;
4131                         }
4132                 } else {
4133                         fence = next;
4134                 }
4135         }
4136         mutex_unlock(&adev->shadow_list_lock);
4137
4138         if (fence)
4139                 tmo = dma_fence_wait_timeout(fence, false, tmo);
4140         dma_fence_put(fence);
4141
4142         if (r < 0 || tmo <= 0) {
4143                 dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
4144                 return -EIO;
4145         }
4146
4147         dev_info(adev->dev, "recover vram bo from shadow done\n");
4148         return 0;
4149 }
4150
4151
4152 /**
4153  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4154  *
4155  * @adev: amdgpu_device pointer
4156  * @from_hypervisor: request from hypervisor
4157  *
4158  * Do a VF FLR and reinitialize the ASIC.
4159  * Returns 0 on success, negative error code on failure.
4160  */
4161 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4162                                      bool from_hypervisor)
4163 {
4164         int r;
4165
4166         if (from_hypervisor)
4167                 r = amdgpu_virt_request_full_gpu(adev, true);
4168         else
4169                 r = amdgpu_virt_reset_gpu(adev);
4170         if (r)
4171                 return r;
4172
4173         amdgpu_amdkfd_pre_reset(adev);
4174
4175         /* Resume IP prior to SMC */
4176         r = amdgpu_device_ip_reinit_early_sriov(adev);
4177         if (r)
4178                 goto error;
4179
4180         amdgpu_virt_init_data_exchange(adev);
4181         /* we need to recover the GART before running the SMC/CP/SDMA resume */
4182         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
4183
4184         r = amdgpu_device_fw_loading(adev);
4185         if (r)
4186                 return r;
4187
4188         /* now we are okay to resume SMC/CP/SDMA */
4189         r = amdgpu_device_ip_reinit_late_sriov(adev);
4190         if (r)
4191                 goto error;
4192
4193         amdgpu_irq_gpu_reset_resume_helper(adev);
4194         r = amdgpu_ib_ring_tests(adev);
4195         amdgpu_amdkfd_post_reset(adev);
4196
4197 error:
4198         amdgpu_virt_release_full_gpu(adev, true);
4199         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4200                 amdgpu_inc_vram_lost(adev);
4201                 r = amdgpu_device_recover_vram(adev);
4202         }
4203
4204         return r;
4205 }
4206
4207 /**
4208  * amdgpu_device_has_job_running - check if there is any job in the pending list
4209  *
4210  * @adev: amdgpu_device pointer
4211  *
4212  * check if there is any job in the pending list
4213  */
4214 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4215 {
4216         int i;
4217         struct drm_sched_job *job;
4218
4219         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4220                 struct amdgpu_ring *ring = adev->rings[i];
4221
4222                 if (!ring || !ring->sched.thread)
4223                         continue;
4224
4225                 spin_lock(&ring->sched.job_list_lock);
4226                 job = list_first_entry_or_null(&ring->sched.pending_list,
4227                                                struct drm_sched_job, list);
4228                 spin_unlock(&ring->sched.job_list_lock);
4229                 if (job)
4230                         return true;
4231         }
4232         return false;
4233 }
4234
4235 /**
4236  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4237  *
4238  * @adev: amdgpu_device pointer
4239  *
4240  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4241  * a hung GPU.
4242  */
4243 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4244 {
4245         if (!amdgpu_device_ip_check_soft_reset(adev)) {
4246                 dev_info(adev->dev, "Timeout, but no hardware hang detected.\n");
4247                 return false;
4248         }
4249
4250         if (amdgpu_gpu_recovery == 0)
4251                 goto disabled;
4252
4253         if (amdgpu_sriov_vf(adev))
4254                 return true;
4255
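        /*
         * amdgpu.gpu_recovery: 0 disables recovery, -1 (auto) enables it only
         * for the ASICs listed below, and any other value forces it on.
         * SR-IOV VFs always attempt recovery.
         */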
4256         if (amdgpu_gpu_recovery == -1) {
4257                 switch (adev->asic_type) {
4258                 case CHIP_BONAIRE:
4259                 case CHIP_HAWAII:
4260                 case CHIP_TOPAZ:
4261                 case CHIP_TONGA:
4262                 case CHIP_FIJI:
4263                 case CHIP_POLARIS10:
4264                 case CHIP_POLARIS11:
4265                 case CHIP_POLARIS12:
4266                 case CHIP_VEGAM:
4267                 case CHIP_VEGA20:
4268                 case CHIP_VEGA10:
4269                 case CHIP_VEGA12:
4270                 case CHIP_RAVEN:
4271                 case CHIP_ARCTURUS:
4272                 case CHIP_RENOIR:
4273                 case CHIP_NAVI10:
4274                 case CHIP_NAVI14:
4275                 case CHIP_NAVI12:
4276                 case CHIP_SIENNA_CICHLID:
4277                 case CHIP_NAVY_FLOUNDER:
4278                 case CHIP_DIMGREY_CAVEFISH:
4279                         break;
4280                 default:
4281                         goto disabled;
4282                 }
4283         }
4284
4285         return true;
4286
4287 disabled:
4288         dev_info(adev->dev, "GPU recovery disabled.\n");
4289         return false;
4290 }
4291
4292 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4293 {
4294         u32 i;
4295         int ret = 0;
4296
4297         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4298
4299         dev_info(adev->dev, "GPU mode1 reset\n");
4300
4301         /* disable BM */
4302         pci_clear_master(adev->pdev);
4303
4304         amdgpu_device_cache_pci_state(adev->pdev);
4305
4306         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4307                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4308                 ret = amdgpu_dpm_mode1_reset(adev);
4309         } else {
4310                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4311                 ret = psp_gpu_reset(adev);
4312         }
4313
4314         if (ret)
4315                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4316
4317         amdgpu_device_load_pci_state(adev->pdev);
4318
4319         /* wait for asic to come out of reset */
4320         for (i = 0; i < adev->usec_timeout; i++) {
4321                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4322
4323                 if (memsize != 0xffffffff)
4324                         break;
4325                 udelay(1);
4326         }
4327
4328         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4329         return ret;
4330 }
4331
4332 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4333                                   struct amdgpu_job *job,
4334                                   bool *need_full_reset_arg)
4335 {
4336         int i, r = 0;
4337         bool need_full_reset = *need_full_reset_arg;
4338
4339         /* no need to dump if the device is not in a good state during the probe period */
4340         if (!adev->gmc.xgmi.pending_reset)
4341                 amdgpu_debugfs_wait_dump(adev);
4342
4343         if (amdgpu_sriov_vf(adev)) {
4344                 /* stop the data exchange thread */
4345                 amdgpu_virt_fini_data_exchange(adev);
4346         }
4347
4348         /* block all schedulers and reset given job's ring */
4349         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4350                 struct amdgpu_ring *ring = adev->rings[i];
4351
4352                 if (!ring || !ring->sched.thread)
4353                         continue;
4354
4355                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4356                 amdgpu_fence_driver_force_completion(ring);
4357         }
4358
4359         if (job)
4360                 drm_sched_increase_karma(&job->base);
4361
4362         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4363         if (!amdgpu_sriov_vf(adev)) {
4364
4365                 if (!need_full_reset)
4366                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4367
4368                 if (!need_full_reset) {
4369                         amdgpu_device_ip_pre_soft_reset(adev);
4370                         r = amdgpu_device_ip_soft_reset(adev);
4371                         amdgpu_device_ip_post_soft_reset(adev);
4372                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4373                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4374                                 need_full_reset = true;
4375                         }
4376                 }
4377
4378                 if (need_full_reset)
4379                         r = amdgpu_device_ip_suspend(adev);
4380
4381                 *need_full_reset_arg = need_full_reset;
4382         }
4383
4384         return r;
4385 }
4386
4387 int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4388                           struct list_head *device_list_handle,
4389                           bool *need_full_reset_arg,
4390                           bool skip_hw_reset)
4391 {
4392         struct amdgpu_device *tmp_adev = NULL;
4393         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4394         int r = 0;
4395
4396         /*
4397          * ASIC reset has to be done on all XGMI hive nodes ASAP
4398          * to allow proper link negotiation in FW (within 1 sec)
4399          */
4400         if (!skip_hw_reset && need_full_reset) {
4401                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4402                         /* For XGMI run all resets in parallel to speed up the process */
4403                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4404                                 tmp_adev->gmc.xgmi.pending_reset = false;
4405                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4406                                         r = -EALREADY;
4407                         } else
4408                                 r = amdgpu_asic_reset(tmp_adev);
4409
4410                         if (r) {
4411                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4412                                          r, adev_to_drm(tmp_adev)->unique);
4413                                 break;
4414                         }
4415                 }
4416
4417                 /* For XGMI wait for all resets to complete before proceeding */
4418                 if (!r) {
4419                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4420                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4421                                         flush_work(&tmp_adev->xgmi_reset_work);
4422                                         r = tmp_adev->asic_reset_res;
4423                                         if (r)
4424                                                 break;
4425                                 }
4426                         }
4427                 }
4428         }
4429
4430         if (!r && amdgpu_ras_intr_triggered()) {
4431                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4432                         if (tmp_adev->mmhub.funcs &&
4433                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4434                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4435                 }
4436
4437                 amdgpu_ras_intr_cleared();
4438         }
4439
4440         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4441                 if (need_full_reset) {
4442                         /* post card */
4443                         r = amdgpu_device_asic_init(tmp_adev);
4444                         if (r) {
4445                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4446                         } else {
4447                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4448                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4449                                 if (r)
4450                                         goto out;
4451
4452                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4453                                 if (vram_lost) {
4454                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4455                                         amdgpu_inc_vram_lost(tmp_adev);
4456                                 }
4457
4458                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4459                                 if (r)
4460                                         goto out;
4461
4462                                 r = amdgpu_device_fw_loading(tmp_adev);
4463                                 if (r)
4464                                         return r;
4465
4466                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4467                                 if (r)
4468                                         goto out;
4469
4470                                 if (vram_lost)
4471                                         amdgpu_device_fill_reset_magic(tmp_adev);
4472
4473                                 /*
4474                                  * Add this ASIC back as tracked since the reset
4475                                  * has already completed successfully.
4476                                  */
4477                                 amdgpu_register_gpu_instance(tmp_adev);
4478
4479                                 if (!hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4480                                         amdgpu_xgmi_add_device(tmp_adev);
4481
4482                                 r = amdgpu_device_ip_late_init(tmp_adev);
4483                                 if (r)
4484                                         goto out;
4485
4486                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4487
4488                                 /*
4489                                  * The GPU enters a bad state once the number of
4490                                  * faulty pages reported by ECC reaches the
4491                                  * threshold, and RAS recovery is scheduled next.
4492                                  * So check here and break recovery if the bad-page
4493                                  * threshold has indeed been exceeded, and remind
4494                                  * the user to either retire this GPU or set a
4495                                  * bigger bad_page_threshold value the next time
4496                                  * the driver is probed.
4497                                  */
4498                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
4499                                         /* must succeed. */
4500                                         amdgpu_ras_resume(tmp_adev);
4501                                 } else {
4502                                         r = -EINVAL;
4503                                         goto out;
4504                                 }
4505
4506                                 /* Update PSP FW topology after reset */
4507                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4508                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4509                         }
4510                 }
4511
4512 out:
4513                 if (!r) {
4514                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4515                         r = amdgpu_ib_ring_tests(tmp_adev);
4516                         if (r) {
4517                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4518                                 r = amdgpu_device_ip_suspend(tmp_adev);
4519                                 need_full_reset = true;
4520                                 r = -EAGAIN;
4521                                 goto end;
4522                         }
4523                 }
4524
4525                 if (!r)
4526                         r = amdgpu_device_recover_vram(tmp_adev);
4527                 else
4528                         tmp_adev->asic_reset_res = r;
4529         }
4530
4531 end:
4532         *need_full_reset_arg = need_full_reset;
4533         return r;
4534 }
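
/*
 * Minimal single-device sketch pairing amdgpu_device_pre_asic_reset()
 * and amdgpu_do_asic_reset(), mirroring what amdgpu_pci_slot_reset()
 * does further down; the wrapper name is hypothetical and error paths
 * are reduced to returning the first failure.
 */
static int amdgpu_example_single_device_reset(struct amdgpu_device *adev)
{
        struct list_head device_list;
        bool need_full_reset = true;
        int r;

        INIT_LIST_HEAD(&device_list);
        list_add_tail(&adev->reset_list, &device_list);

        r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
        if (r)
                return r;

        /* skip_hw_reset = false: actually reset the hardware */
        return amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, false);
}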
4535
4536 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev,
4537                                 struct amdgpu_hive_info *hive)
4538 {
4539         if (atomic_cmpxchg(&adev->in_gpu_reset, 0, 1) != 0)
4540                 return false;
4541
4542         if (hive) {
4543                 down_write_nest_lock(&adev->reset_sem, &hive->hive_lock);
4544         } else {
4545                 down_write(&adev->reset_sem);
4546         }
4547
4548         switch (amdgpu_asic_reset_method(adev)) {
4549         case AMD_RESET_METHOD_MODE1:
4550                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4551                 break;
4552         case AMD_RESET_METHOD_MODE2:
4553                 adev->mp1_state = PP_MP1_STATE_RESET;
4554                 break;
4555         default:
4556                 adev->mp1_state = PP_MP1_STATE_NONE;
4557                 break;
4558         }
4559
4560         return true;
4561 }
4562
4563 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4564 {
4565         amdgpu_vf_error_trans_all(adev);
4566         adev->mp1_state = PP_MP1_STATE_NONE;
4567         atomic_set(&adev->in_gpu_reset, 0);
4568         up_write(&adev->reset_sem);
4569 }
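
/*
 * Sketch of the expected pairing of the lock/unlock helpers above for a
 * single (non-hive) device; the critical-section body is a placeholder
 * and the wrapper name is hypothetical.
 */
static int amdgpu_example_locked_reset(struct amdgpu_device *adev)
{
        if (!amdgpu_device_lock_adev(adev, NULL))
                return -EAGAIN;         /* another reset is already in flight */

        /* ... do reset work while holding adev->reset_sem ... */

        amdgpu_device_unlock_adev(adev);
        return 0;
}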
4570
4571 /*
4572  * Lock a list of amdgpu devices in a hive safely. If the device is not
4573  * part of a multi-node hive, this behaves like amdgpu_device_lock_adev.
4574  *
4575  * Unlocking does not require any rollback.
4576  */
4577 static int amdgpu_device_lock_hive_adev(struct amdgpu_device *adev, struct amdgpu_hive_info *hive)
4578 {
4579         struct amdgpu_device *tmp_adev = NULL;
4580
4581         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4582                 if (!hive) {
4583                         dev_err(adev->dev, "Hive is NULL while device has multiple xgmi nodes");
4584                         return -ENODEV;
4585                 }
4586                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4587                         if (!amdgpu_device_lock_adev(tmp_adev, hive))
4588                                 goto roll_back;
4589                 }
4590         } else if (!amdgpu_device_lock_adev(adev, hive))
4591                 return -EAGAIN;
4592
4593         return 0;
4594 roll_back:
4595         if (!list_is_first(&tmp_adev->gmc.xgmi.head, &hive->device_list)) {
4596                 /*
4597                  * If the locking iteration broke off in the middle of a hive,
4598                  * there may be a race, or a hive device may have locked up
4599                  * independently. We may or may not be in trouble, so try to
4600                  * roll back the locks we already took and give out a
4601                  * warning.
4602                  */
4603                 dev_warn(tmp_adev->dev, "Hive lock iteration broke in the middle. Rolling back to unlock");
4604                 list_for_each_entry_continue_reverse(tmp_adev, &hive->device_list, gmc.xgmi.head) {
4605                         amdgpu_device_unlock_adev(tmp_adev);
4606                 }
4607         }
4608         return -EAGAIN;
4609 }
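
/*
 * Sketch of hive-wide locking with the helper above, assuming adev is
 * part of a multi-node XGMI hive: lock every node, do the cross-node
 * work, then unlock each node individually.  The wrapper name is
 * hypothetical.
 */
static int amdgpu_example_hive_locked_op(struct amdgpu_device *adev,
                                         struct amdgpu_hive_info *hive)
{
        struct amdgpu_device *tmp_adev;
        int r;

        r = amdgpu_device_lock_hive_adev(adev, hive);
        if (r)
                return r;

        /* ... operate on every device in the hive ... */

        list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
                amdgpu_device_unlock_adev(tmp_adev);

        return 0;
}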
4610
4611 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4612 {
4613         struct pci_dev *p = NULL;
4614
4615         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4616                         adev->pdev->bus->number, 1);
4617         if (p) {
4618                 pm_runtime_enable(&(p->dev));
4619                 pm_runtime_resume(&(p->dev));
4620         }
4621 }
4622
4623 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4624 {
4625         enum amd_reset_method reset_method;
4626         struct pci_dev *p = NULL;
4627         u64 expires;
4628
4629         /*
4630          * For now, only BACO and mode1 reset are confirmed to hit the
4631          * audio issue if the audio device is not properly suspended first.
4632          */
4633         reset_method = amdgpu_asic_reset_method(adev);
4634         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4635              (reset_method != AMD_RESET_METHOD_MODE1))
4636                 return -EINVAL;
4637
4638         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4639                         adev->pdev->bus->number, 1);
4640         if (!p)
4641                 return -ENODEV;
4642
4643         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4644         if (!expires)
4645                 /*
4646                  * If we cannot get the audio device's autosuspend delay,
4647                  * fall back to a fixed 4 s interval. The audio controller's
4648                  * default autosuspend delay is 3 s, so 4 s is guaranteed
4649                  * to cover it.
4650                  */
4651                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4652
4653         while (!pm_runtime_status_suspended(&(p->dev))) {
4654                 if (!pm_runtime_suspend(&(p->dev)))
4655                         break;
4656
4657                 if (expires < ktime_get_mono_fast_ns()) {
4658                         dev_warn(adev->dev, "failed to suspend display audio\n");
4659                         /* TODO: abort the succeeding gpu reset? */
4660                         return -ETIMEDOUT;
4661                 }
4662         }
4663
4664         pm_runtime_disable(&(p->dev));
4665
4666         return 0;
4667 }
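
/*
 * Sketch of the suspend/resume bracketing of the audio function around a
 * reset, as amdgpu_device_gpu_recover() below does: suspend the audio
 * device before touching the GPU and resume it only if the suspend
 * actually succeeded.  The wrapper name is hypothetical.
 */
static void amdgpu_example_reset_with_audio(struct amdgpu_device *adev)
{
        bool audio_suspended = !amdgpu_device_suspend_display_audio(adev);

        /* ... the actual hardware reset happens here ... */

        if (audio_suspended)
                amdgpu_device_resume_display_audio(adev);
}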
4668
4669 /**
4670  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4671  *
4672  * @adev: amdgpu_device pointer
4673  * @job: the job that triggered the hang, or NULL
4674  *
4675  * Attempt to reset the GPU if it has hung (all asics).
4676  * Attempts a soft reset or full reset and reinitializes the ASIC.
4677  * Returns 0 for success or an error on failure.
4678  */
4679
4680 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4681                               struct amdgpu_job *job)
4682 {
4683         struct list_head device_list, *device_list_handle =  NULL;
4684         bool need_full_reset = false;
4685         bool job_signaled = false;
4686         struct amdgpu_hive_info *hive = NULL;
4687         struct amdgpu_device *tmp_adev = NULL;
4688         int i, r = 0;
4689         bool need_emergency_restart = false;
4690         bool audio_suspended = false;
4691
4692         /*
4693          * Special case: RAS triggered and full reset isn't supported
4694          */
4695         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4696
4697         /*
4698          * Flush RAM to disk so that after reboot
4699          * the user can read the log and see why the system rebooted.
4700          */
4701         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4702                 DRM_WARN("Emergency reboot.");
4703
4704                 ksys_sync_helper();
4705                 emergency_restart();
4706         }
4707
4708         dev_info(adev->dev, "GPU %s begin!\n",
4709                 need_emergency_restart ? "jobs stop":"reset");
4710
4711         /*
4712          * Here we trylock to avoid a chain of resets executing from
4713          * jobs triggered on different adevs in an XGMI hive, or from jobs on
4714          * different schedulers for the same device, while this TO handler runs.
4715          * We always reset all schedulers for a device and all devices in an
4716          * XGMI hive, so that should take care of them too.
4717          */
4718         hive = amdgpu_get_xgmi_hive(adev);
4719         if (hive) {
4720                 if (atomic_cmpxchg(&hive->in_reset, 0, 1) != 0) {
4721                         DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4722                                 job ? job->base.id : -1, hive->hive_id);
4723                         amdgpu_put_xgmi_hive(hive);
4724                         if (job)
4725                                 drm_sched_increase_karma(&job->base);
4726                         return 0;
4727                 }
4728                 mutex_lock(&hive->hive_lock);
4729         }
4730
4731         /*
4732          * Lock the device before we try to operate on the linked list;
4733          * if we did not get the device lock, do not touch the linked list
4734          * since others may be iterating over it.
4735          */
4736         r = amdgpu_device_lock_hive_adev(adev, hive);
4737         if (r) {
4738                 dev_info(adev->dev, "Bailing on TDR for s_job:%llx, as another already in progress",
4739                                         job ? job->base.id : -1);
4740
4741                 /* even though we skipped this reset, we still need to mark the job guilty */
4742                 if (job)
4743                         drm_sched_increase_karma(&job->base);
4744                 goto skip_recovery;
4745         }
4746
4747         /*
4748          * Build list of devices to reset.
4749          * In case we are in XGMI hive mode, resort the device list
4750          * to put adev in the 1st position.
4751          */
4752         INIT_LIST_HEAD(&device_list);
4753         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4754                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head)
4755                         list_add_tail(&tmp_adev->reset_list, &device_list);
4756                 if (!list_is_first(&adev->reset_list, &device_list))
4757                         list_rotate_to_front(&adev->reset_list, &device_list);
4758                 device_list_handle = &device_list;
4759         } else {
4760                 list_add_tail(&adev->reset_list, &device_list);
4761                 device_list_handle = &device_list;
4762         }
4763
4764         /* block all schedulers and reset given job's ring */
4765         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4766                 /*
4767                  * Try to put the audio codec into suspend state
4768                  * before the GPU reset starts.
4769                  *
4770                  * The power domain of the graphics device is
4771                  * shared with the AZ power domain. Without this,
4772                  * we may change the audio hardware from behind
4773                  * the audio driver's back, which will trigger
4774                  * audio codec errors.
4775                  */
4776                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4777                         audio_suspended = true;
4778
4779                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4780
4781                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4782
4783                 if (!amdgpu_sriov_vf(tmp_adev))
4784                         amdgpu_amdkfd_pre_reset(tmp_adev);
4785
4786                 /*
4787                  * Mark the ASICs to be reset as untracked first,
4788                  * and add them back after the reset completes.
4789                  */
4790                 amdgpu_unregister_gpu_instance(tmp_adev);
4791
4792                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4793
4794                 /* disable ras on ALL IPs */
4795                 if (!need_emergency_restart &&
4796                       amdgpu_device_ip_need_full_reset(tmp_adev))
4797                         amdgpu_ras_suspend(tmp_adev);
4798
4799                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4800                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4801
4802                         if (!ring || !ring->sched.thread)
4803                                 continue;
4804
4805                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4806
4807                         if (need_emergency_restart)
4808                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4809                 }
4810                 atomic_inc(&tmp_adev->gpu_reset_counter);
4811         }
4812
4813         if (need_emergency_restart)
4814                 goto skip_sched_resume;
4815
4816         /*
4817          * Must check guilty signal here since after this point all old
4818          * HW fences are force signaled.
4819          *
4820          * job->base holds a reference to parent fence
4821          */
4822         if (job && job->base.s_fence->parent &&
4823             dma_fence_is_signaled(job->base.s_fence->parent)) {
4824                 job_signaled = true;
4825                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4826                 goto skip_hw_reset;
4827         }
4828
4829 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4830         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4831                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4832                                                  (tmp_adev == adev) ? job : NULL,
4833                                                  &need_full_reset);
4834                 /* TODO: should we stop? */
4835                 if (r) {
4836                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
4837                                   r, adev_to_drm(tmp_adev)->unique);
4838                         tmp_adev->asic_reset_res = r;
4839                 }
4840         }
4841
4842         /* Actual ASIC resets if needed.*/
4843         /* TODO Implement XGMI hive reset logic for SRIOV */
4844         if (amdgpu_sriov_vf(adev)) {
4845                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4846                 if (r)
4847                         adev->asic_reset_res = r;
4848         } else {
4849                 r = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset, false);
4850                 if (r == -EAGAIN)
4851                         goto retry;
4852         }
4853
4854 skip_hw_reset:
4855
4856         /* Post ASIC reset for all devs. */
4857         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4858
4859                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4860                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4861
4862                         if (!ring || !ring->sched.thread)
4863                                 continue;
4864
4865                         /* No point in resubmitting jobs if we didn't HW reset */
4866                         if (!tmp_adev->asic_reset_res && !job_signaled)
4867                                 drm_sched_resubmit_jobs(&ring->sched);
4868
4869                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4870                 }
4871
4872                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4873                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
4874                 }
4875
4876                 tmp_adev->asic_reset_res = 0;
4877
4878                 if (r) {
4879                         /* bad news, how to tell it to userspace ? */
4880                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4881                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4882                 } else {
4883                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4884                 }
4885         }
4886
4887 skip_sched_resume:
4888         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4889                 /* unlock kfd: SRIOV would do it separately */
4890                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4891                         amdgpu_amdkfd_post_reset(tmp_adev);
4892
4893                 /* kfd_post_reset will do nothing if the kfd device is not initialized,
4894                  * so bring up kfd here if it was not initialized before
4895                  */
4896                 if (!tmp_adev->kfd.init_complete)
4897                         amdgpu_amdkfd_device_init(tmp_adev);
4898
4899                 if (audio_suspended)
4900                         amdgpu_device_resume_display_audio(tmp_adev);
4901                 amdgpu_device_unlock_adev(tmp_adev);
4902         }
4903
4904 skip_recovery:
4905         if (hive) {
4906                 atomic_set(&hive->in_reset, 0);
4907                 mutex_unlock(&hive->hive_lock);
4908                 amdgpu_put_xgmi_hive(hive);
4909         }
4910
4911         if (r && r != -EAGAIN)
4912                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4913         return r;
4914 }
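
/*
 * Sketch of a manual (e.g. debugfs-style) trigger: the recover path is
 * simply invoked with no guilty job.  The wrapper name is hypothetical.
 */
static int amdgpu_example_manual_reset(struct amdgpu_device *adev)
{
        return amdgpu_device_gpu_recover(adev, NULL);
}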
4915
4916 /**
4917  * amdgpu_device_get_pcie_info - fetch PCIe info about the PCIe slot
4918  *
4919  * @adev: amdgpu_device pointer
4920  *
4921  * Fetches and stores in the driver the PCIe capabilities (gen speed
4922  * and lanes) of the slot the device is in. Handles APUs and
4923  * virtualized environments where PCIE config space may not be available.
4924  */
4925 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4926 {
4927         struct pci_dev *pdev;
4928         enum pci_bus_speed speed_cap, platform_speed_cap;
4929         enum pcie_link_width platform_link_width;
4930
4931         if (amdgpu_pcie_gen_cap)
4932                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4933
4934         if (amdgpu_pcie_lane_cap)
4935                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4936
4937         /* covers APUs as well */
4938         if (pci_is_root_bus(adev->pdev->bus)) {
4939                 if (adev->pm.pcie_gen_mask == 0)
4940                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4941                 if (adev->pm.pcie_mlw_mask == 0)
4942                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4943                 return;
4944         }
4945
4946         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4947                 return;
4948
4949         pcie_bandwidth_available(adev->pdev, NULL,
4950                                  &platform_speed_cap, &platform_link_width);
4951
4952         if (adev->pm.pcie_gen_mask == 0) {
4953                 /* asic caps */
4954                 pdev = adev->pdev;
4955                 speed_cap = pcie_get_speed_cap(pdev);
4956                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4957                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4958                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4959                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4960                 } else {
4961                         if (speed_cap == PCIE_SPEED_32_0GT)
4962                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4963                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4964                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4965                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
4966                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
4967                         else if (speed_cap == PCIE_SPEED_16_0GT)
4968                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4969                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4970                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4971                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4972                         else if (speed_cap == PCIE_SPEED_8_0GT)
4973                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4974                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4975                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4976                         else if (speed_cap == PCIE_SPEED_5_0GT)
4977                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4978                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4979                         else
4980                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4981                 }
4982                 /* platform caps */
4983                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4984                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4985                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4986                 } else {
4987                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
4988                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4989                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4990                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4991                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
4992                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
4993                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
4994                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4995                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4996                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4997                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4998                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4999                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5000                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5001                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5002                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5003                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5004                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5005                         else
5006                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5007
5008                 }
5009         }
5010         if (adev->pm.pcie_mlw_mask == 0) {
5011                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5012                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5013                 } else {
5014                         switch (platform_link_width) {
5015                         case PCIE_LNK_X32:
5016                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5017                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5018                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5019                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5020                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5021                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5022                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5023                                 break;
5024                         case PCIE_LNK_X16:
5025                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5026                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5027                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5028                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5029                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5030                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5031                                 break;
5032                         case PCIE_LNK_X12:
5033                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5034                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5035                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5036                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5037                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5038                                 break;
5039                         case PCIE_LNK_X8:
5040                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5041                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5042                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5043                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5044                                 break;
5045                         case PCIE_LNK_X4:
5046                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5047                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5048                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5049                                 break;
5050                         case PCIE_LNK_X2:
5051                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5052                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5053                                 break;
5054                         case PCIE_LNK_X1:
5055                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5056                                 break;
5057                         default:
5058                                 break;
5059                         }
5060                 }
5061         }
5062 }
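
/*
 * Sketch of consuming the masks filled in above: test one of the CAIL
 * bits used in this file to see whether a given link speed may be
 * requested.  The helper name is hypothetical.
 */
static bool amdgpu_example_pcie_gen3_supported(struct amdgpu_device *adev)
{
        return !!(adev->pm.pcie_gen_mask & CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
}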
5063
5064 int amdgpu_device_baco_enter(struct drm_device *dev)
5065 {
5066         struct amdgpu_device *adev = drm_to_adev(dev);
5067         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5068
5069         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5070                 return -ENOTSUPP;
5071
5072         if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5073                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5074
5075         return amdgpu_dpm_baco_enter(adev);
5076 }
5077
5078 int amdgpu_device_baco_exit(struct drm_device *dev)
5079 {
5080         struct amdgpu_device *adev = drm_to_adev(dev);
5081         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5082         int ret = 0;
5083
5084         if (!amdgpu_device_supports_baco(adev_to_drm(adev)))
5085                 return -ENOTSUPP;
5086
5087         ret = amdgpu_dpm_baco_exit(adev);
5088         if (ret)
5089                 return ret;
5090
5091         if (ras && ras->supported && adev->nbio.funcs->enable_doorbell_interrupt)
5092                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5093
5094         return 0;
5095 }
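
/*
 * Sketch of the expected enter/exit pairing of the BACO helpers above,
 * e.g. around a power-down window; error handling is reduced to passing
 * the first failure back, and the wrapper name is hypothetical.
 */
static int amdgpu_example_baco_cycle(struct drm_device *dev)
{
        int r;

        r = amdgpu_device_baco_enter(dev);
        if (r)
                return r;

        /* ... the device sits in BACO here ... */

        return amdgpu_device_baco_exit(dev);
}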
5096
5097 static void amdgpu_cancel_all_tdr(struct amdgpu_device *adev)
5098 {
5099         int i;
5100
5101         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5102                 struct amdgpu_ring *ring = adev->rings[i];
5103
5104                 if (!ring || !ring->sched.thread)
5105                         continue;
5106
5107                 cancel_delayed_work_sync(&ring->sched.work_tdr);
5108         }
5109 }
5110
5111 /**
5112  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5113  * @pdev: PCI device struct
5114  * @state: PCI channel state
5115  *
5116  * Description: Called when a PCI error is detected.
5117  *
5118  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5119  */
5120 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5121 {
5122         struct drm_device *dev = pci_get_drvdata(pdev);
5123         struct amdgpu_device *adev = drm_to_adev(dev);
5124         int i;
5125
5126         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5127
5128         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5129                 DRM_WARN("No support for XGMI hive yet...");
5130                 return PCI_ERS_RESULT_DISCONNECT;
5131         }
5132
5133         switch (state) {
5134         case pci_channel_io_normal:
5135                 return PCI_ERS_RESULT_CAN_RECOVER;
5136         /* Fatal error, prepare for slot reset */
5137         case pci_channel_io_frozen:
5138                 /*
5139                  * Cancel and wait for all TDRs in progress if we fail to
5140                  * set adev->in_gpu_reset in amdgpu_device_lock_adev()
5141                  *
5142                  * Locking adev->reset_sem will prevent any external access
5143                  * to GPU during PCI error recovery
5144                  */
5145                 while (!amdgpu_device_lock_adev(adev, NULL))
5146                         amdgpu_cancel_all_tdr(adev);
5147
5148                 /*
5149                  * Block any work scheduling as we do for regular GPU reset
5150                  * for the duration of the recovery
5151                  */
5152                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5153                         struct amdgpu_ring *ring = adev->rings[i];
5154
5155                         if (!ring || !ring->sched.thread)
5156                                 continue;
5157
5158                         drm_sched_stop(&ring->sched, NULL);
5159                 }
5160                 atomic_inc(&adev->gpu_reset_counter);
5161                 return PCI_ERS_RESULT_NEED_RESET;
5162         case pci_channel_io_perm_failure:
5163                 /* Permanent error, prepare for device removal */
5164                 return PCI_ERS_RESULT_DISCONNECT;
5165         }
5166
5167         return PCI_ERS_RESULT_NEED_RESET;
5168 }
5169
5170 /**
5171  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5172  * @pdev: pointer to PCI device
5173  */
5174 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5175 {
5176
5177         DRM_INFO("PCI error: mmio enabled callback!!\n");
5178
5179         /* TODO - dump whatever for debugging purposes */
5180
5181         /* This is called only if amdgpu_pci_error_detected() returns
5182          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5183          * works, so there is no need to reset the slot.
5184          */
5185
5186         return PCI_ERS_RESULT_RECOVERED;
5187 }
5188
5189 /**
5190  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5191  * @pdev: PCI device struct
5192  *
5193  * Description: This routine is called by the pci error recovery
5194  * code after the PCI slot has been reset, just before we
5195  * should resume normal operations.
5196  */
5197 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5198 {
5199         struct drm_device *dev = pci_get_drvdata(pdev);
5200         struct amdgpu_device *adev = drm_to_adev(dev);
5201         int r, i;
5202         bool need_full_reset = true;
5203         u32 memsize;
5204         struct list_head device_list;
5205
5206         DRM_INFO("PCI error: slot reset callback!!\n");
5207
5208         INIT_LIST_HEAD(&device_list);
5209         list_add_tail(&adev->reset_list, &device_list);
5210
5211         /* wait for asic to come out of reset */
5212         msleep(500);
5213
5214         /* Restore PCI config space */
5215         amdgpu_device_load_pci_state(pdev);
5216
5217         /* confirm ASIC came out of reset */
5218         for (i = 0; i < adev->usec_timeout; i++) {
5219                 memsize = amdgpu_asic_get_config_memsize(adev);
5220
5221                 if (memsize != 0xffffffff)
5222                         break;
5223                 udelay(1);
5224         }
5225         if (memsize == 0xffffffff) {
5226                 r = -ETIME;
5227                 goto out;
5228         }
5229
5230         adev->in_pci_err_recovery = true;
5231         r = amdgpu_device_pre_asic_reset(adev, NULL, &need_full_reset);
5232         adev->in_pci_err_recovery = false;
5233         if (r)
5234                 goto out;
5235
5236         r = amdgpu_do_asic_reset(NULL, &device_list, &need_full_reset, true);
5237
5238 out:
5239         if (!r) {
5240                 if (amdgpu_device_cache_pci_state(adev->pdev))
5241                         pci_restore_state(adev->pdev);
5242
5243                 DRM_INFO("PCIe error recovery succeeded\n");
5244         } else {
5245                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5246                 amdgpu_device_unlock_adev(adev);
5247         }
5248
5249         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5250 }
5251
5252 /**
5253  * amdgpu_pci_resume() - resume normal ops after PCI reset
5254  * @pdev: pointer to PCI device
5255  *
5256  * Called when the error recovery driver tells us that it's
5257  * OK to resume normal operation.
5258  */
5259 void amdgpu_pci_resume(struct pci_dev *pdev)
5260 {
5261         struct drm_device *dev = pci_get_drvdata(pdev);
5262         struct amdgpu_device *adev = drm_to_adev(dev);
5263         int i;
5264
5265
5266         DRM_INFO("PCI error: resume callback!!\n");
5267
5268         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5269                 struct amdgpu_ring *ring = adev->rings[i];
5270
5271                 if (!ring || !ring->sched.thread)
5272                         continue;
5273
5274
5275                 drm_sched_resubmit_jobs(&ring->sched);
5276                 drm_sched_start(&ring->sched, true);
5277         }
5278
5279         amdgpu_device_unlock_adev(adev);
5280 }
5281
5282 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5283 {
5284         struct drm_device *dev = pci_get_drvdata(pdev);
5285         struct amdgpu_device *adev = drm_to_adev(dev);
5286         int r;
5287
5288         r = pci_save_state(pdev);
5289         if (!r) {
5290                 kfree(adev->pci_state);
5291
5292                 adev->pci_state = pci_store_saved_state(pdev);
5293
5294                 if (!adev->pci_state) {
5295                         DRM_ERROR("Failed to store PCI saved state");
5296                         return false;
5297                 }
5298         } else {
5299                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5300                 return false;
5301         }
5302
5303         return true;
5304 }
5305
5306 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5307 {
5308         struct drm_device *dev = pci_get_drvdata(pdev);
5309         struct amdgpu_device *adev = drm_to_adev(dev);
5310         int r;
5311
5312         if (!adev->pci_state)
5313                 return false;
5314
5315         r = pci_load_saved_state(pdev, adev->pci_state);
5316
5317         if (!r) {
5318                 pci_restore_state(pdev);
5319         } else {
5320                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5321                 return false;
5322         }
5323
5324         return true;
5325 }
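
/*
 * Sketch of the cache/restore pairing of the two PCI-state helpers
 * above, mirroring their use in amdgpu_device_mode1_reset(); the
 * wrapper name is hypothetical.
 */
static void amdgpu_example_pci_state_roundtrip(struct amdgpu_device *adev)
{
        if (!amdgpu_device_cache_pci_state(adev->pdev))
                return;

        /* ... reset the ASIC; config space may get clobbered ... */

        amdgpu_device_load_pci_state(adev->pdev);
}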
5326
5327