drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67
/* gpu_info firmware images named by this file, declared via MODULE_FIRMWARE(). */
MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");

/* NOTE(review): by its name a resume-related duration in milliseconds; users are outside this view - confirm. */
#define AMDGPU_RESUME_MS                2000
  78
  79 static const char *amdgpu_asic_name[] = {
  80         "TAHITI",
  81         "PITCAIRN",
  82         "VERDE",
  83         "OLAND",
  84         "HAINAN",
  85         "BONAIRE",
  86         "KAVERI",
  87         "KABINI",
  88         "HAWAII",
  89         "MULLINS",
  90         "TOPAZ",
  91         "TONGA",
  92         "FIJI",
  93         "CARRIZO",
  94         "STONEY",
  95         "POLARIS10",
  96         "POLARIS11",
  97         "POLARIS12",
  98         "VEGAM",
  99         "VEGA10",
 100         "VEGA12",
 101         "VEGA20",
 102         "RAVEN",
 103         "ARCTURUS",
 104         "NAVI10",
 105         "NAVI14",
 106         "LAST",
 107 };
 108
 109 /**
 110  * DOC: pcie_replay_count
 111  *
 112  * The amdgpu driver provides a sysfs API for reporting the total number
 113  * of PCIe replays (NAKs)
 114  * The file pcie_replay_count is used for this and returns the total
 115  * number of replays as a sum of the NAKs generated and NAKs received
 116  */
 117
 118 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 119                 struct device_attribute *attr, char *buf)
 120 {
 121         struct drm_device *ddev = dev_get_drvdata(dev);
 122         struct amdgpu_device *adev = ddev->dev_private;
 123         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 124
 125         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 126 }
 127
 128 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 129                 amdgpu_device_get_pcie_replay_count, NULL);
 130
 131 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 132
 133 /**
 134  * amdgpu_device_is_px - Is the device is a dGPU with HG/PX power control
 135  *
 136  * @dev: drm_device pointer
 137  *
 138  * Returns true if the device is a dGPU with HG/PX power control,
 139  * otherwise return false.
 140  */
 141 bool amdgpu_device_is_px(struct drm_device *dev)
 142 {
 143         struct amdgpu_device *adev = dev->dev_private;
 144
 145         if (adev->flags & AMD_IS_PX)
 146                 return true;
 147         return false;
 148 }
 149
 150 /*
 151  * MMIO register access helper functions.
 152  */
 153 /**
 154  * amdgpu_mm_rreg - read a memory mapped IO register
 155  *
 156  * @adev: amdgpu_device pointer
 157  * @reg: dword aligned register offset
 158  * @acc_flags: access flags which require special behavior
 159  *
 160  * Returns the 32 bit value from the offset specified.
 161  */
 162 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 163                         uint32_t acc_flags)
 164 {
 165         uint32_t ret;
 166
 167         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 168                 return amdgpu_virt_kiq_rreg(adev, reg);
 169
 170         if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
 171                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 172         else {
 173                 unsigned long flags;
 174
 175                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 176                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
 177                 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 178                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 179         }
 180         trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
 181         return ret;
 182 }
 183
 184 /*
 185  * MMIO register read with bytes helper functions
 186  * @offset:bytes offset from MMIO start
 187  *
 188 */
 189
 190 /**
 191  * amdgpu_mm_rreg8 - read a memory mapped IO register
 192  *
 193  * @adev: amdgpu_device pointer
 194  * @offset: byte aligned register offset
 195  *
 196  * Returns the 8 bit value from the offset specified.
 197  */
 198 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
 199         if (offset < adev->rmmio_size)
 200                 return (readb(adev->rmmio + offset));
 201         BUG();
 202 }
 203
 204 /*
 205  * MMIO register write with bytes helper functions
 206  * @offset:bytes offset from MMIO start
 207  * @value: the value want to be written to the register
 208  *
 209 */
 210 /**
 211  * amdgpu_mm_wreg8 - read a memory mapped IO register
 212  *
 213  * @adev: amdgpu_device pointer
 214  * @offset: byte aligned register offset
 215  * @value: 8 bit value to write
 216  *
 217  * Writes the value specified to the offset specified.
 218  */
 219 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
 220         if (offset < adev->rmmio_size)
 221                 writeb(value, adev->rmmio + offset);
 222         else
 223                 BUG();
 224 }
 225
 226 /**
 227  * amdgpu_mm_wreg - write to a memory mapped IO register
 228  *
 229  * @adev: amdgpu_device pointer
 230  * @reg: dword aligned register offset
 231  * @v: 32 bit value to write to the register
 232  * @acc_flags: access flags which require special behavior
 233  *
 234  * Writes the value specified to the offset specified.
 235  */
 236 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 237                     uint32_t acc_flags)
 238 {
 239         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 240
 241         if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
 242                 adev->last_mm_index = v;
 243         }
 244
 245         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 246                 return amdgpu_virt_kiq_wreg(adev, reg, v);
 247
 248         if ((reg * 4) < adev->rmmio_size && !(acc_flags & AMDGPU_REGS_IDX))
 249                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 250         else {
 251                 unsigned long flags;
 252
 253                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 254                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
 255                 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 256                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 257         }
 258
 259         if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
 260                 udelay(500);
 261         }
 262 }
 263
 264 /**
 265  * amdgpu_mm_rreg64 - read a 64 bit memory mapped IO register
 266  *
 267  * @adev: amdgpu_device pointer
 268  * @reg: dword aligned register offset
 269  *
 270  * Returns the 64 bit value from the offset specified.
 271  */
 272 uint64_t amdgpu_mm_rreg64(struct amdgpu_device *adev, uint32_t reg)
 273 {
 274         uint64_t ret;
 275
 276         if ((reg * 4) < adev->rmmio_size)
 277                 ret = readq(((void __iomem *)adev->rmmio) + (reg * 4));
 278         else
 279                 BUG();
 280
 281         return ret;
 282 }
 283
 284 /**
 285  * amdgpu_mm_wreg64 - write to a 64 bit memory mapped IO register
 286  *
 287  * @adev: amdgpu_device pointer
 288  * @reg: dword aligned register offset
 289  * @v: 64 bit value to write to the register
 290  *
 291  * Writes the value specified to the offset specified.
 292  */
 293 void amdgpu_mm_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 294 {
 295         if ((reg * 4) < adev->rmmio_size)
 296                 writeq(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 297         else
 298                 BUG();
 299 }
 300
 301 /**
 302  * amdgpu_io_rreg - read an IO register
 303  *
 304  * @adev: amdgpu_device pointer
 305  * @reg: dword aligned register offset
 306  *
 307  * Returns the 32 bit value from the offset specified.
 308  */
 309 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 310 {
 311         if ((reg * 4) < adev->rio_mem_size)
 312                 return ioread32(adev->rio_mem + (reg * 4));
 313         else {
 314                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 315                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 316         }
 317 }
 318
 319 /**
 320  * amdgpu_io_wreg - write to an IO register
 321  *
 322  * @adev: amdgpu_device pointer
 323  * @reg: dword aligned register offset
 324  * @v: 32 bit value to write to the register
 325  *
 326  * Writes the value specified to the offset specified.
 327  */
 328 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 329 {
 330         if (adev->asic_type >= CHIP_VEGA10 && reg == 0) {
 331                 adev->last_mm_index = v;
 332         }
 333
 334         if ((reg * 4) < adev->rio_mem_size)
 335                 iowrite32(v, adev->rio_mem + (reg * 4));
 336         else {
 337                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 338                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 339         }
 340
 341         if (adev->asic_type >= CHIP_VEGA10 && reg == 1 && adev->last_mm_index == 0x5702C) {
 342                 udelay(500);
 343         }
 344 }
 345
 346 /**
 347  * amdgpu_mm_rdoorbell - read a doorbell dword
 348  *
 349  * @adev: amdgpu_device pointer
 350  * @index: doorbell index
 351  *
 352  * Returns the value in the doorbell aperture at the
 353  * requested doorbell index (CIK).
 354  */
 355 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 356 {
 357         if (index < adev->doorbell.num_doorbells) {
 358                 return readl(adev->doorbell.ptr + index);
 359         } else {
 360                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 361                 return 0;
 362         }
 363 }
 364
 365 /**
 366  * amdgpu_mm_wdoorbell - write a doorbell dword
 367  *
 368  * @adev: amdgpu_device pointer
 369  * @index: doorbell index
 370  * @v: value to write
 371  *
 372  * Writes @v to the doorbell aperture at the
 373  * requested doorbell index (CIK).
 374  */
 375 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 376 {
 377         if (index < adev->doorbell.num_doorbells) {
 378                 writel(v, adev->doorbell.ptr + index);
 379         } else {
 380                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 381         }
 382 }
 383
 384 /**
 385  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 386  *
 387  * @adev: amdgpu_device pointer
 388  * @index: doorbell index
 389  *
 390  * Returns the value in the doorbell aperture at the
 391  * requested doorbell index (VEGA10+).
 392  */
 393 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 394 {
 395         if (index < adev->doorbell.num_doorbells) {
 396                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 397         } else {
 398                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 399                 return 0;
 400         }
 401 }
 402
 403 /**
 404  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 405  *
 406  * @adev: amdgpu_device pointer
 407  * @index: doorbell index
 408  * @v: value to write
 409  *
 410  * Writes @v to the doorbell aperture at the
 411  * requested doorbell index (VEGA10+).
 412  */
 413 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 414 {
 415         if (index < adev->doorbell.num_doorbells) {
 416                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 417         } else {
 418                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 419         }
 420 }
 421
 422 /**
 423  * amdgpu_invalid_rreg - dummy reg read function
 424  *
 425  * @adev: amdgpu device pointer
 426  * @reg: offset of register
 427  *
 428  * Dummy register read function.  Used for register blocks
 429  * that certain asics don't have (all asics).
 430  * Returns the value in the register.
 431  */
 432 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 433 {
 434         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 435         BUG();
 436         return 0;
 437 }
 438
 439 /**
 440  * amdgpu_invalid_wreg - dummy reg write function
 441  *
 442  * @adev: amdgpu device pointer
 443  * @reg: offset of register
 444  * @v: value to write to the register
 445  *
 446  * Dummy register read function.  Used for register blocks
 447  * that certain asics don't have (all asics).
 448  */
 449 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 450 {
 451         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 452                   reg, v);
 453         BUG();
 454 }
 455
 456 /**
 457  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 458  *
 459  * @adev: amdgpu device pointer
 460  * @reg: offset of register
 461  *
 462  * Dummy register read function.  Used for register blocks
 463  * that certain asics don't have (all asics).
 464  * Returns the value in the register.
 465  */
 466 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 467 {
 468         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 469         BUG();
 470         return 0;
 471 }
 472
 473 /**
 474  * amdgpu_invalid_wreg64 - dummy reg write function
 475  *
 476  * @adev: amdgpu device pointer
 477  * @reg: offset of register
 478  * @v: value to write to the register
 479  *
 480  * Dummy register read function.  Used for register blocks
 481  * that certain asics don't have (all asics).
 482  */
 483 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 484 {
 485         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 486                   reg, v);
 487         BUG();
 488 }
 489
 490 /**
 491  * amdgpu_block_invalid_rreg - dummy reg read function
 492  *
 493  * @adev: amdgpu device pointer
 494  * @block: offset of instance
 495  * @reg: offset of register
 496  *
 497  * Dummy register read function.  Used for register blocks
 498  * that certain asics don't have (all asics).
 499  * Returns the value in the register.
 500  */
 501 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 502                                           uint32_t block, uint32_t reg)
 503 {
 504         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 505                   reg, block);
 506         BUG();
 507         return 0;
 508 }
 509
 510 /**
 511  * amdgpu_block_invalid_wreg - dummy reg write function
 512  *
 513  * @adev: amdgpu device pointer
 514  * @block: offset of instance
 515  * @reg: offset of register
 516  * @v: value to write to the register
 517  *
 518  * Dummy register read function.  Used for register blocks
 519  * that certain asics don't have (all asics).
 520  */
 521 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 522                                       uint32_t block,
 523                                       uint32_t reg, uint32_t v)
 524 {
 525         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 526                   reg, block, v);
 527         BUG();
 528 }
 529
 530 /**
 531  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 532  *
 533  * @adev: amdgpu device pointer
 534  *
 535  * Allocates a scratch page of VRAM for use by various things in the
 536  * driver.
 537  */
 538 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 539 {
 540         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 541                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 542                                        &adev->vram_scratch.robj,
 543                                        &adev->vram_scratch.gpu_addr,
 544                                        (void **)&adev->vram_scratch.ptr);
 545 }
 546
 547 /**
 548  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 549  *
 550  * @adev: amdgpu device pointer
 551  *
 552  * Frees the VRAM scratch page.
 553  */
 554 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 555 {
 556         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 557 }
 558
 559 /**
 560  * amdgpu_device_program_register_sequence - program an array of registers.
 561  *
 562  * @adev: amdgpu_device pointer
 563  * @registers: pointer to the register array
 564  * @array_size: size of the register array
 565  *
 566  * Programs an array or registers with and and or masks.
 567  * This is a helper for setting golden registers.
 568  */
 569 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 570                                              const u32 *registers,
 571                                              const u32 array_size)
 572 {
 573         u32 tmp, reg, and_mask, or_mask;
 574         int i;
 575
 576         if (array_size % 3)
 577                 return;
 578
 579         for (i = 0; i < array_size; i +=3) {
 580                 reg = registers[i + 0];
 581                 and_mask = registers[i + 1];
 582                 or_mask = registers[i + 2];
 583
 584                 if (and_mask == 0xffffffff) {
 585                         tmp = or_mask;
 586                 } else {
 587                         tmp = RREG32(reg);
 588                         tmp &= ~and_mask;
 589                         if (adev->family >= AMDGPU_FAMILY_AI)
 590                                 tmp |= (or_mask & and_mask);
 591                         else
 592                                 tmp |= or_mask;
 593                 }
 594                 WREG32(reg, tmp);
 595         }
 596 }
 597
 598 /**
 599  * amdgpu_device_pci_config_reset - reset the GPU
 600  *
 601  * @adev: amdgpu_device pointer
 602  *
 603  * Resets the GPU using the pci config reset sequence.
 604  * Only applicable to asics prior to vega10.
 605  */
 606 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 607 {
 608         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 609 }
 610
 611 /*
 612  * GPU doorbell aperture helpers function.
 613  */
 614 /**
 615  * amdgpu_device_doorbell_init - Init doorbell driver information.
 616  *
 617  * @adev: amdgpu_device pointer
 618  *
 619  * Init doorbell driver information (CIK)
 620  * Returns 0 on success, error on failure.
 621  */
 622 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 623 {
 624
 625         /* No doorbell on SI hardware generation */
 626         if (adev->asic_type < CHIP_BONAIRE) {
 627                 adev->doorbell.base = 0;
 628                 adev->doorbell.size = 0;
 629                 adev->doorbell.num_doorbells = 0;
 630                 adev->doorbell.ptr = NULL;
 631                 return 0;
 632         }
 633
 634         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 635                 return -EINVAL;
 636
 637         amdgpu_asic_init_doorbell_index(adev);
 638
 639         /* doorbell bar mapping */
 640         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 641         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 642
 643         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 644                                              adev->doorbell_index.max_assignment+1);
 645         if (adev->doorbell.num_doorbells == 0)
 646                 return -EINVAL;
 647
 648         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 649          * paging queue doorbell use the second page. The
 650          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 651          * doorbells are in the first page. So with paging queue enabled,
 652          * the max num_doorbells should + 1 page (0x400 in dword)
 653          */
 654         if (adev->asic_type >= CHIP_VEGA10)
 655                 adev->doorbell.num_doorbells += 0x400;
 656
 657         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 658                                      adev->doorbell.num_doorbells *
 659                                      sizeof(u32));
 660         if (adev->doorbell.ptr == NULL)
 661                 return -ENOMEM;
 662
 663         return 0;
 664 }
 665
 666 /**
 667  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 668  *
 669  * @adev: amdgpu_device pointer
 670  *
 671  * Tear down doorbell driver information (CIK)
 672  */
 673 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 674 {
 675         iounmap(adev->doorbell.ptr);
 676         adev->doorbell.ptr = NULL;
 677 }
 678
 679
 680
 681 /*
 682  * amdgpu_device_wb_*()
 683  * Writeback is the method by which the GPU updates special pages in memory
 684  * with the status of certain GPU events (fences, ring pointers,etc.).
 685  */
 686
 687 /**
 688  * amdgpu_device_wb_fini - Disable Writeback and free memory
 689  *
 690  * @adev: amdgpu_device pointer
 691  *
 692  * Disables Writeback and frees the Writeback memory (all asics).
 693  * Used at driver shutdown.
 694  */
 695 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 696 {
 697         if (adev->wb.wb_obj) {
 698                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
 699                                       &adev->wb.gpu_addr,
 700                                       (void **)&adev->wb.wb);
 701                 adev->wb.wb_obj = NULL;
 702         }
 703 }
 704
 705 /**
 706  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
 707  *
 708  * @adev: amdgpu_device pointer
 709  *
 710  * Initializes writeback and allocates writeback memory (all asics).
 711  * Used at driver startup.
 712  * Returns 0 on success or an -error on failure.
 713  */
 714 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
 715 {
 716         int r;
 717
 718         if (adev->wb.wb_obj == NULL) {
 719                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
 720                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
 721                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
 722                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
 723                                             (void **)&adev->wb.wb);
 724                 if (r) {
 725                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
 726                         return r;
 727                 }
 728
 729                 adev->wb.num_wb = AMDGPU_MAX_WB;
 730                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
 731
 732                 /* clear wb memory */
 733                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
 734         }
 735
 736         return 0;
 737 }
 738
 739 /**
 740  * amdgpu_device_wb_get - Allocate a wb entry
 741  *
 742  * @adev: amdgpu_device pointer
 743  * @wb: wb index
 744  *
 745  * Allocate a wb slot for use by the driver (all asics).
 746  * Returns 0 on success or -EINVAL on failure.
 747  */
 748 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
 749 {
 750         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
 751
 752         if (offset < adev->wb.num_wb) {
 753                 __set_bit(offset, adev->wb.used);
 754                 *wb = offset << 3; /* convert to dw offset */
 755                 return 0;
 756         } else {
 757                 return -EINVAL;
 758         }
 759 }
 760
 761 /**
 762  * amdgpu_device_wb_free - Free a wb entry
 763  *
 764  * @adev: amdgpu_device pointer
 765  * @wb: wb index
 766  *
 767  * Free a wb slot allocated for use by the driver (all asics)
 768  */
 769 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 770 {
 771         wb >>= 3;
 772         if (wb < adev->wb.num_wb)
 773                 __clear_bit(wb, adev->wb.used);
 774 }
 775
 776 /**
 777  * amdgpu_device_resize_fb_bar - try to resize FB BAR
 778  *
 779  * @adev: amdgpu_device pointer
 780  *
 781  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
 782  * to fail, but if any of the BARs is not accessible after the size we abort
 783  * driver loading by returning -ENODEV.
 784  */
 785 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 786 {
 787         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
 788         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
 789         struct pci_bus *root;
 790         struct resource *res;
 791         unsigned i;
 792         u16 cmd;
 793         int r;
 794
 795         /* Bypass for VF */
 796         if (amdgpu_sriov_vf(adev))
 797                 return 0;
 798
 799         /* Check if the root BUS has 64bit memory resources */
 800         root = adev->pdev->bus;
 801         while (root->parent)
 802                 root = root->parent;
 803
 804         pci_bus_for_each_resource(root, res, i) {
 805                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
 806                     res->start > 0x100000000ull)
 807                         break;
 808         }
 809
 810         /* Trying to resize is pointless without a root hub window above 4GB */
 811         if (!res)
 812                 return 0;
 813
 814         /* Disable memory decoding while we change the BAR addresses and size */
 815         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
 816         pci_write_config_word(adev->pdev, PCI_COMMAND,
 817                               cmd & ~PCI_COMMAND_MEMORY);
 818
 819         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
 820         amdgpu_device_doorbell_fini(adev);
 821         if (adev->asic_type >= CHIP_BONAIRE)
 822                 pci_release_resource(adev->pdev, 2);
 823
 824         pci_release_resource(adev->pdev, 0);
 825
 826         r = pci_resize_resource(adev->pdev, 0, rbar_size);
 827         if (r == -ENOSPC)
 828                 DRM_INFO("Not enough PCI address space for a large BAR.");
 829         else if (r && r != -ENOTSUPP)
 830                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
 831
 832         pci_assign_unassigned_bus_resources(adev->pdev->bus);
 833
 834         /* When the doorbell or fb BAR isn't available we have no chance of
 835          * using the device.
 836          */
 837         r = amdgpu_device_doorbell_init(adev);
 838         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
 839                 return -ENODEV;
 840
 841         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
 842
 843         return 0;
 844 }
 845
 846 /*
 847  * GPU helpers function.
 848  */
 849 /**
 850  * amdgpu_device_need_post - check if the hw need post or not
 851  *
 852  * @adev: amdgpu_device pointer
 853  *
 854  * Check if the asic has been initialized (all asics) at driver startup
 855  * or post is needed if  hw reset is performed.
 856  * Returns true if need or false if not.
 857  */
 858 bool amdgpu_device_need_post(struct amdgpu_device *adev)
 859 {
 860         uint32_t reg;
 861
 862         if (amdgpu_sriov_vf(adev))
 863                 return false;
 864
 865         if (amdgpu_passthrough(adev)) {
 866                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
 867                  * some old smc fw still need driver do vPost otherwise gpu hang, while
 868                  * those smc fw version above 22.15 doesn't have this flaw, so we force
 869                  * vpost executed for smc version below 22.15
 870                  */
 871                 if (adev->asic_type == CHIP_FIJI) {
 872                         int err;
 873                         uint32_t fw_ver;
 874                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
 875                         /* force vPost if error occured */
 876                         if (err)
 877                                 return true;
 878
 879                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
 880                         if (fw_ver < 0x00160e00)
 881                                 return true;
 882                 }
 883         }
 884
 885         if (adev->has_hw_reset) {
 886                 adev->has_hw_reset = false;
 887                 return true;
 888         }
 889
 890         /* bios scratch used on CIK+ */
 891         if (adev->asic_type >= CHIP_BONAIRE)
 892                 return amdgpu_atombios_scratch_need_asic_init(adev);
 893
 894         /* check MEM_SIZE for older asics */
 895         reg = amdgpu_asic_get_config_memsize(adev);
 896
 897         if ((reg != 0) && (reg != 0xffffffff))
 898                 return false;
 899
 900         return true;
 901 }
 902
 903 /* if we get transitioned to only one device, take VGA back */
 904 /**
 905  * amdgpu_device_vga_set_decode - enable/disable vga decode
 906  *
 907  * @cookie: amdgpu_device pointer
 908  * @state: enable/disable vga decode
 909  *
 910  * Enable/disable vga decode (all asics).
 911  * Returns VGA resource flags.
 912  */
 913 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
 914 {
 915         struct amdgpu_device *adev = cookie;
 916         amdgpu_asic_set_vga_state(adev, state);
 917         if (state)
 918                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
 919                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
 920         else
 921                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
 922 }
 923
 924 /**
 925  * amdgpu_device_check_block_size - validate the vm block size
 926  *
 927  * @adev: amdgpu_device pointer
 928  *
 929  * Validates the vm block size specified via module parameter.
 930  * The vm block size defines number of bits in page table versus page directory,
 931  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 932  * page table and the remaining bits are in the page directory.
 933  */
 934 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
 935 {
 936         /* defines number of bits in page table versus page directory,
 937          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
 938          * page table and the remaining bits are in the page directory */
 939         if (amdgpu_vm_block_size == -1)
 940                 return;
 941
 942         if (amdgpu_vm_block_size < 9) {
 943                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
 944                          amdgpu_vm_block_size);
 945                 amdgpu_vm_block_size = -1;
 946         }
 947 }
 948
 949 /**
 950  * amdgpu_device_check_vm_size - validate the vm size
 951  *
 952  * @adev: amdgpu_device pointer
 953  *
 954  * Validates the vm size in GB specified via module parameter.
 955  * The VM size is the size of the GPU virtual memory space in GB.
 956  */
 957 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
 958 {
 959         /* no need to check the default value */
 960         if (amdgpu_vm_size == -1)
 961                 return;
 962
 963         if (amdgpu_vm_size < 1) {
 964                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
 965                          amdgpu_vm_size);
 966                 amdgpu_vm_size = -1;
 967         }
 968 }
 969
 970 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
 971 {
 972         struct sysinfo si;
 973         bool is_os_64 = (sizeof(void *) == 8) ? true : false;
 974         uint64_t total_memory;
 975         uint64_t dram_size_seven_GB = 0x1B8000000;
 976         uint64_t dram_size_three_GB = 0xB8000000;
 977
 978         if (amdgpu_smu_memory_pool_size == 0)
 979                 return;
 980
 981         if (!is_os_64) {
 982                 DRM_WARN("Not 64-bit OS, feature not supported\n");
 983                 goto def_value;
 984         }
 985         si_meminfo(&si);
 986         total_memory = (uint64_t)si.totalram * si.mem_unit;
 987
 988         if ((amdgpu_smu_memory_pool_size == 1) ||
 989                 (amdgpu_smu_memory_pool_size == 2)) {
 990                 if (total_memory < dram_size_three_GB)
 991                         goto def_value1;
 992         } else if ((amdgpu_smu_memory_pool_size == 4) ||
 993                 (amdgpu_smu_memory_pool_size == 8)) {
 994                 if (total_memory < dram_size_seven_GB)
 995                         goto def_value1;
 996         } else {
 997                 DRM_WARN("Smu memory pool size not supported\n");
 998                 goto def_value;
 999         }
1000         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1001
1002         return;
1003
1004 def_value1:
1005         DRM_WARN("No enough system memory\n");
1006 def_value:
1007         adev->pm.smu_prv_buffer_size = 0;
1008 }
1009
1010 /**
1011  * amdgpu_device_check_arguments - validate module params
1012  *
1013  * @adev: amdgpu_device pointer
1014  *
1015  * Validates certain module parameters and updates
1016  * the associated values used by the driver (all asics).
1017  */
1018 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1019 {
1020         int ret = 0;
1021
1022         if (amdgpu_sched_jobs < 4) {
1023                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1024                          amdgpu_sched_jobs);
1025                 amdgpu_sched_jobs = 4;
1026         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1027                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1028                          amdgpu_sched_jobs);
1029                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1030         }
1031
1032         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1033                 /* gart size must be greater or equal to 32M */
1034                 dev_warn(adev->dev, "gart size (%d) too small\n",
1035                          amdgpu_gart_size);
1036                 amdgpu_gart_size = -1;
1037         }
1038
1039         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1040                 /* gtt size must be greater or equal to 32M */
1041                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1042                                  amdgpu_gtt_size);
1043                 amdgpu_gtt_size = -1;
1044         }
1045
1046         /* valid range is between 4 and 9 inclusive */
1047         if (amdgpu_vm_fragment_size != -1 &&
1048             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1049                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1050                 amdgpu_vm_fragment_size = -1;
1051         }
1052
1053         amdgpu_device_check_smu_prv_buffer_size(adev);
1054
1055         amdgpu_device_check_vm_size(adev);
1056
1057         amdgpu_device_check_block_size(adev);
1058
1059         ret = amdgpu_device_get_job_timeout_settings(adev);
1060         if (ret) {
1061                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
1062                 return ret;
1063         }
1064
1065         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1066
1067         return ret;
1068 }
1069
1070 /**
1071  * amdgpu_switcheroo_set_state - set switcheroo state
1072  *
1073  * @pdev: pci dev pointer
1074  * @state: vga_switcheroo state
1075  *
1076  * Callback for the switcheroo driver.  Suspends or resumes the
1077  * the asics before or after it is powered up using ACPI methods.
1078  */
1079 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1080 {
1081         struct drm_device *dev = pci_get_drvdata(pdev);
1082
1083         if (amdgpu_device_is_px(dev) && state == VGA_SWITCHEROO_OFF)
1084                 return;
1085
1086         if (state == VGA_SWITCHEROO_ON) {
1087                 pr_info("amdgpu: switched on\n");
1088                 /* don't suspend or resume card normally */
1089                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1090
1091                 amdgpu_device_resume(dev, true, true);
1092
1093                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1094                 drm_kms_helper_poll_enable(dev);
1095         } else {
1096                 pr_info("amdgpu: switched off\n");
1097                 drm_kms_helper_poll_disable(dev);
1098                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1099                 amdgpu_device_suspend(dev, true, true);
1100                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1101         }
1102 }
1103
1104 /**
1105  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1106  *
1107  * @pdev: pci dev pointer
1108  *
1109  * Callback for the switcheroo driver.  Check of the switcheroo
1110  * state can be changed.
1111  * Returns true if the state can be changed, false if not.
1112  */
1113 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1114 {
1115         struct drm_device *dev = pci_get_drvdata(pdev);
1116
1117         /*
1118         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1119         * locking inversion with the driver load path. And the access here is
1120         * completely racy anyway. So don't bother with locking for now.
1121         */
1122         return dev->open_count == 0;
1123 }
1124
1125 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1126         .set_gpu_state = amdgpu_switcheroo_set_state,
1127         .reprobe = NULL,
1128         .can_switch = amdgpu_switcheroo_can_switch,
1129 };
1130
1131 /**
1132  * amdgpu_device_ip_set_clockgating_state - set the CG state
1133  *
1134  * @dev: amdgpu_device pointer
1135  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1136  * @state: clockgating state (gate or ungate)
1137  *
1138  * Sets the requested clockgating state for all instances of
1139  * the hardware IP specified.
1140  * Returns the error code from the last instance.
1141  */
1142 int amdgpu_device_ip_set_clockgating_state(void *dev,
1143                                            enum amd_ip_block_type block_type,
1144                                            enum amd_clockgating_state state)
1145 {
1146         struct amdgpu_device *adev = dev;
1147         int i, r = 0;
1148
1149         for (i = 0; i < adev->num_ip_blocks; i++) {
1150                 if (!adev->ip_blocks[i].status.valid)
1151                         continue;
1152                 if (adev->ip_blocks[i].version->type != block_type)
1153                         continue;
1154                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1155                         continue;
1156                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1157                         (void *)adev, state);
1158                 if (r)
1159                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1160                                   adev->ip_blocks[i].version->funcs->name, r);
1161         }
1162         return r;
1163 }
1164
1165 /**
1166  * amdgpu_device_ip_set_powergating_state - set the PG state
1167  *
1168  * @dev: amdgpu_device pointer
1169  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1170  * @state: powergating state (gate or ungate)
1171  *
1172  * Sets the requested powergating state for all instances of
1173  * the hardware IP specified.
1174  * Returns the error code from the last instance.
1175  */
1176 int amdgpu_device_ip_set_powergating_state(void *dev,
1177                                            enum amd_ip_block_type block_type,
1178                                            enum amd_powergating_state state)
1179 {
1180         struct amdgpu_device *adev = dev;
1181         int i, r = 0;
1182
1183         for (i = 0; i < adev->num_ip_blocks; i++) {
1184                 if (!adev->ip_blocks[i].status.valid)
1185                         continue;
1186                 if (adev->ip_blocks[i].version->type != block_type)
1187                         continue;
1188                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1189                         continue;
1190                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1191                         (void *)adev, state);
1192                 if (r)
1193                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1194                                   adev->ip_blocks[i].version->funcs->name, r);
1195         }
1196         return r;
1197 }
1198
1199 /**
1200  * amdgpu_device_ip_get_clockgating_state - get the CG state
1201  *
1202  * @adev: amdgpu_device pointer
1203  * @flags: clockgating feature flags
1204  *
1205  * Walks the list of IPs on the device and updates the clockgating
1206  * flags for each IP.
1207  * Updates @flags with the feature flags for each hardware IP where
1208  * clockgating is enabled.
1209  */
1210 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1211                                             u32 *flags)
1212 {
1213         int i;
1214
1215         for (i = 0; i < adev->num_ip_blocks; i++) {
1216                 if (!adev->ip_blocks[i].status.valid)
1217                         continue;
1218                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1219                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1220         }
1221 }
1222
1223 /**
1224  * amdgpu_device_ip_wait_for_idle - wait for idle
1225  *
1226  * @adev: amdgpu_device pointer
1227  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1228  *
1229  * Waits for the request hardware IP to be idle.
1230  * Returns 0 for success or a negative error code on failure.
1231  */
1232 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1233                                    enum amd_ip_block_type block_type)
1234 {
1235         int i, r;
1236
1237         for (i = 0; i < adev->num_ip_blocks; i++) {
1238                 if (!adev->ip_blocks[i].status.valid)
1239                         continue;
1240                 if (adev->ip_blocks[i].version->type == block_type) {
1241                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1242                         if (r)
1243                                 return r;
1244                         break;
1245                 }
1246         }
1247         return 0;
1248
1249 }
1250
1251 /**
1252  * amdgpu_device_ip_is_idle - is the hardware IP idle
1253  *
1254  * @adev: amdgpu_device pointer
1255  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1256  *
1257  * Check if the hardware IP is idle or not.
1258  * Returns true if it the IP is idle, false if not.
1259  */
1260 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1261                               enum amd_ip_block_type block_type)
1262 {
1263         int i;
1264
1265         for (i = 0; i < adev->num_ip_blocks; i++) {
1266                 if (!adev->ip_blocks[i].status.valid)
1267                         continue;
1268                 if (adev->ip_blocks[i].version->type == block_type)
1269                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1270         }
1271         return true;
1272
1273 }
1274
1275 /**
1276  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1277  *
1278  * @adev: amdgpu_device pointer
1279  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1280  *
1281  * Returns a pointer to the hardware IP block structure
1282  * if it exists for the asic, otherwise NULL.
1283  */
1284 struct amdgpu_ip_block *
1285 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1286                               enum amd_ip_block_type type)
1287 {
1288         int i;
1289
1290         for (i = 0; i < adev->num_ip_blocks; i++)
1291                 if (adev->ip_blocks[i].version->type == type)
1292                         return &adev->ip_blocks[i];
1293
1294         return NULL;
1295 }
1296
1297 /**
1298  * amdgpu_device_ip_block_version_cmp
1299  *
1300  * @adev: amdgpu_device pointer
1301  * @type: enum amd_ip_block_type
1302  * @major: major version
1303  * @minor: minor version
1304  *
1305  * return 0 if equal or greater
1306  * return 1 if smaller or the ip_block doesn't exist
1307  */
1308 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1309                                        enum amd_ip_block_type type,
1310                                        u32 major, u32 minor)
1311 {
1312         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1313
1314         if (ip_block && ((ip_block->version->major > major) ||
1315                         ((ip_block->version->major == major) &&
1316                         (ip_block->version->minor >= minor))))
1317                 return 0;
1318
1319         return 1;
1320 }
1321
1322 /**
1323  * amdgpu_device_ip_block_add
1324  *
1325  * @adev: amdgpu_device pointer
1326  * @ip_block_version: pointer to the IP to add
1327  *
1328  * Adds the IP block driver information to the collection of IPs
1329  * on the asic.
1330  */
1331 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1332                                const struct amdgpu_ip_block_version *ip_block_version)
1333 {
1334         if (!ip_block_version)
1335                 return -EINVAL;
1336
1337         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1338                   ip_block_version->funcs->name);
1339
1340         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1341
1342         return 0;
1343 }
1344
1345 /**
1346  * amdgpu_device_enable_virtual_display - enable virtual display feature
1347  *
1348  * @adev: amdgpu_device pointer
1349  *
1350  * Enabled the virtual display feature if the user has enabled it via
1351  * the module parameter virtual_display.  This feature provides a virtual
1352  * display hardware on headless boards or in virtualized environments.
1353  * This function parses and validates the configuration string specified by
1354  * the user and configues the virtual display configuration (number of
1355  * virtual connectors, crtcs, etc.) specified.
1356  */
1357 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1358 {
1359         adev->enable_virtual_display = false;
1360
1361         if (amdgpu_virtual_display) {
1362                 struct drm_device *ddev = adev->ddev;
1363                 const char *pci_address_name = pci_name(ddev->pdev);
1364                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1365
1366                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1367                 pciaddstr_tmp = pciaddstr;
1368                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1369                         pciaddname = strsep(&pciaddname_tmp, ",");
1370                         if (!strcmp("all", pciaddname)
1371                             || !strcmp(pci_address_name, pciaddname)) {
1372                                 long num_crtc;
1373                                 int res = -1;
1374
1375                                 adev->enable_virtual_display = true;
1376
1377                                 if (pciaddname_tmp)
1378                                         res = kstrtol(pciaddname_tmp, 10,
1379                                                       &num_crtc);
1380
1381                                 if (!res) {
1382                                         if (num_crtc < 1)
1383                                                 num_crtc = 1;
1384                                         if (num_crtc > 6)
1385                                                 num_crtc = 6;
1386                                         adev->mode_info.num_crtc = num_crtc;
1387                                 } else {
1388                                         adev->mode_info.num_crtc = 1;
1389                                 }
1390                                 break;
1391                         }
1392                 }
1393
1394                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1395                          amdgpu_virtual_display, pci_address_name,
1396                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1397
1398                 kfree(pciaddstr);
1399         }
1400 }
1401
1402 /**
1403  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1404  *
1405  * @adev: amdgpu_device pointer
1406  *
1407  * Parses the asic configuration parameters specified in the gpu info
1408  * firmware and makes them availale to the driver for use in configuring
1409  * the asic.
1410  * Returns 0 on success, -EINVAL on failure.
1411  */
1412 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1413 {
1414         const char *chip_name;
1415         char fw_name[30];
1416         int err;
1417         const struct gpu_info_firmware_header_v1_0 *hdr;
1418
1419         adev->firmware.gpu_info_fw = NULL;
1420
1421         switch (adev->asic_type) {
1422         case CHIP_TOPAZ:
1423         case CHIP_TONGA:
1424         case CHIP_FIJI:
1425         case CHIP_POLARIS10:
1426         case CHIP_POLARIS11:
1427         case CHIP_POLARIS12:
1428         case CHIP_VEGAM:
1429         case CHIP_CARRIZO:
1430         case CHIP_STONEY:
1431 #ifdef CONFIG_DRM_AMDGPU_SI
1432         case CHIP_VERDE:
1433         case CHIP_TAHITI:
1434         case CHIP_PITCAIRN:
1435         case CHIP_OLAND:
1436         case CHIP_HAINAN:
1437 #endif
1438 #ifdef CONFIG_DRM_AMDGPU_CIK
1439         case CHIP_BONAIRE:
1440         case CHIP_HAWAII:
1441         case CHIP_KAVERI:
1442         case CHIP_KABINI:
1443         case CHIP_MULLINS:
1444 #endif
1445         case CHIP_VEGA20:
1446         default:
1447                 return 0;
1448         case CHIP_VEGA10:
1449                 chip_name = "vega10";
1450                 break;
1451         case CHIP_VEGA12:
1452                 chip_name = "vega12";
1453                 break;
1454         case CHIP_RAVEN:
1455                 if (adev->rev_id >= 8)
1456                         chip_name = "raven2";
1457                 else if (adev->pdev->device == 0x15d8)
1458                         chip_name = "picasso";
1459                 else
1460                         chip_name = "raven";
1461                 break;
1462         case CHIP_ARCTURUS:
1463                 chip_name = "arcturus";
1464                 break;
1465         case CHIP_NAVI10:
1466                 chip_name = "navi10";
1467                 break;
1468         case CHIP_NAVI14:
1469                 chip_name = "navi14";
1470                 break;
1471         }
1472
1473         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1474         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1475         if (err) {
1476                 dev_err(adev->dev,
1477                         "Failed to load gpu_info firmware \"%s\"\n",
1478                         fw_name);
1479                 goto out;
1480         }
1481         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1482         if (err) {
1483                 dev_err(adev->dev,
1484                         "Failed to validate gpu_info firmware \"%s\"\n",
1485                         fw_name);
1486                 goto out;
1487         }
1488
1489         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1490         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1491
1492         switch (hdr->version_major) {
1493         case 1:
1494         {
1495                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1496                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1497                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1498
1499                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1500                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1501                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1502                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1503                 adev->gfx.config.max_texture_channel_caches =
1504                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1505                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1506                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1507                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1508                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1509                 adev->gfx.config.double_offchip_lds_buf =
1510                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1511                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1512                 adev->gfx.cu_info.max_waves_per_simd =
1513                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1514                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1515                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1516                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1517                 if (hdr->version_minor >= 1) {
1518                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1519                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1520                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1521                         adev->gfx.config.num_sc_per_sh =
1522                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1523                         adev->gfx.config.num_packer_per_sc =
1524                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1525                 }
1526 #ifdef CONFIG_DRM_AMD_DC_DCN2_0
1527                 if (hdr->version_minor == 2) {
1528                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1529                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1530                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1531                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1532                 }
1533 #endif
1534                 break;
1535         }
1536         default:
1537                 dev_err(adev->dev,
1538                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1539                 err = -EINVAL;
1540                 goto out;
1541         }
1542 out:
1543         return err;
1544 }
1545
1546 /**
1547  * amdgpu_device_ip_early_init - run early init for hardware IPs
1548  *
1549  * @adev: amdgpu_device pointer
1550  *
1551  * Early initialization pass for hardware IPs.  The hardware IPs that make
1552  * up each asic are discovered each IP's early_init callback is run.  This
1553  * is the first stage in initializing the asic.
1554  * Returns 0 on success, negative error code on failure.
1555  */
1556 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1557 {
1558         int i, r;
1559
1560         amdgpu_device_enable_virtual_display(adev);
1561
1562         switch (adev->asic_type) {
1563         case CHIP_TOPAZ:
1564         case CHIP_TONGA:
1565         case CHIP_FIJI:
1566         case CHIP_POLARIS10:
1567         case CHIP_POLARIS11:
1568         case CHIP_POLARIS12:
1569         case CHIP_VEGAM:
1570         case CHIP_CARRIZO:
1571         case CHIP_STONEY:
1572                 if (adev->asic_type == CHIP_CARRIZO || adev->asic_type == CHIP_STONEY)
1573                         adev->family = AMDGPU_FAMILY_CZ;
1574                 else
1575                         adev->family = AMDGPU_FAMILY_VI;
1576
1577                 r = vi_set_ip_blocks(adev);
1578                 if (r)
1579                         return r;
1580                 break;
1581 #ifdef CONFIG_DRM_AMDGPU_SI
1582         case CHIP_VERDE:
1583         case CHIP_TAHITI:
1584         case CHIP_PITCAIRN:
1585         case CHIP_OLAND:
1586         case CHIP_HAINAN:
1587                 adev->family = AMDGPU_FAMILY_SI;
1588                 r = si_set_ip_blocks(adev);
1589                 if (r)
1590                         return r;
1591                 break;
1592 #endif
1593 #ifdef CONFIG_DRM_AMDGPU_CIK
1594         case CHIP_BONAIRE:
1595         case CHIP_HAWAII:
1596         case CHIP_KAVERI:
1597         case CHIP_KABINI:
1598         case CHIP_MULLINS:
1599                 if ((adev->asic_type == CHIP_BONAIRE) || (adev->asic_type == CHIP_HAWAII))
1600                         adev->family = AMDGPU_FAMILY_CI;
1601                 else
1602                         adev->family = AMDGPU_FAMILY_KV;
1603
1604                 r = cik_set_ip_blocks(adev);
1605                 if (r)
1606                         return r;
1607                 break;
1608 #endif
1609         case CHIP_VEGA10:
1610         case CHIP_VEGA12:
1611         case CHIP_VEGA20:
1612         case CHIP_RAVEN:
1613         case CHIP_ARCTURUS:
1614                 if (adev->asic_type == CHIP_RAVEN)
1615                         adev->family = AMDGPU_FAMILY_RV;
1616                 else
1617                         adev->family = AMDGPU_FAMILY_AI;
1618
1619                 r = soc15_set_ip_blocks(adev);
1620                 if (r)
1621                         return r;
1622                 break;
1623         case  CHIP_NAVI10:
1624         case  CHIP_NAVI14:
1625                 adev->family = AMDGPU_FAMILY_NV;
1626
1627                 r = nv_set_ip_blocks(adev);
1628                 if (r)
1629                         return r;
1630                 break;
1631         default:
1632                 /* FIXME: not supported yet */
1633                 return -EINVAL;
1634         }
1635
1636         r = amdgpu_device_parse_gpu_info_fw(adev);
1637         if (r)
1638                 return r;
1639
1640         amdgpu_amdkfd_device_probe(adev);
1641
1642         if (amdgpu_sriov_vf(adev)) {
1643                 r = amdgpu_virt_request_full_gpu(adev, true);
1644                 if (r)
1645                         return -EAGAIN;
1646
1647                 /* query the reg access mode at the very beginning */
1648                 amdgpu_virt_init_reg_access_mode(adev);
1649         }
1650
1651         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1652         if (amdgpu_sriov_vf(adev))
1653                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1654
1655         for (i = 0; i < adev->num_ip_blocks; i++) {
1656                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1657                         DRM_ERROR("disabled ip block: %d <%s>\n",
1658                                   i, adev->ip_blocks[i].version->funcs->name);
1659                         adev->ip_blocks[i].status.valid = false;
1660                 } else {
1661                         if (adev->ip_blocks[i].version->funcs->early_init) {
1662                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1663                                 if (r == -ENOENT) {
1664                                         adev->ip_blocks[i].status.valid = false;
1665                                 } else if (r) {
1666                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1667                                                   adev->ip_blocks[i].version->funcs->name, r);
1668                                         return r;
1669                                 } else {
1670                                         adev->ip_blocks[i].status.valid = true;
1671                                 }
1672                         } else {
1673                                 adev->ip_blocks[i].status.valid = true;
1674                         }
1675                 }
1676                 /* get the vbios after the asic_funcs are set up */
1677                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1678                         /* Read BIOS */
1679                         if (!amdgpu_get_bios(adev))
1680                                 return -EINVAL;
1681
1682                         r = amdgpu_atombios_init(adev);
1683                         if (r) {
1684                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1685                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1686                                 return r;
1687                         }
1688                 }
1689         }
1690
1691         adev->cg_flags &= amdgpu_cg_mask;
1692         adev->pg_flags &= amdgpu_pg_mask;
1693
1694         return 0;
1695 }
1696
1697 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1698 {
1699         int i, r;
1700
1701         for (i = 0; i < adev->num_ip_blocks; i++) {
1702                 if (!adev->ip_blocks[i].status.sw)
1703                         continue;
1704                 if (adev->ip_blocks[i].status.hw)
1705                         continue;
1706                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1707                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1708                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1709                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1710                         if (r) {
1711                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1712                                           adev->ip_blocks[i].version->funcs->name, r);
1713                                 return r;
1714                         }
1715                         adev->ip_blocks[i].status.hw = true;
1716                 }
1717         }
1718
1719         return 0;
1720 }
1721
1722 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1723 {
1724         int i, r;
1725
1726         for (i = 0; i < adev->num_ip_blocks; i++) {
1727                 if (!adev->ip_blocks[i].status.sw)
1728                         continue;
1729                 if (adev->ip_blocks[i].status.hw)
1730                         continue;
1731                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1732                 if (r) {
1733                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1734                                   adev->ip_blocks[i].version->funcs->name, r);
1735                         return r;
1736                 }
1737                 adev->ip_blocks[i].status.hw = true;
1738         }
1739
1740         return 0;
1741 }
1742
1743 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1744 {
1745         int r = 0;
1746         int i;
1747         uint32_t smu_version;
1748
1749         if (adev->asic_type >= CHIP_VEGA10) {
1750                 for (i = 0; i < adev->num_ip_blocks; i++) {
1751                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
1752                                 if (adev->in_gpu_reset || adev->in_suspend) {
1753                                         if (amdgpu_sriov_vf(adev) && adev->in_gpu_reset)
1754                                                 break; /* sriov gpu reset, psp need to do hw_init before IH because of hw limit */
1755                                         r = adev->ip_blocks[i].version->funcs->resume(adev);
1756                                         if (r) {
1757                                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
1758                                                           adev->ip_blocks[i].version->funcs->name, r);
1759                                                 return r;
1760                                         }
1761                                 } else {
1762                                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1763                                         if (r) {
1764                                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1765                                                   adev->ip_blocks[i].version->funcs->name, r);
1766                                                 return r;
1767                                         }
1768                                 }
1769                                 adev->ip_blocks[i].status.hw = true;
1770                         }
1771                 }
1772         }
1773         r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1774
1775         return r;
1776 }
1777
1778 /**
1779  * amdgpu_device_ip_init - run init for hardware IPs
1780  *
1781  * @adev: amdgpu_device pointer
1782  *
1783  * Main initialization pass for hardware IPs.  The list of all the hardware
1784  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1785  * are run.  sw_init initializes the software state associated with each IP
1786  * and hw_init initializes the hardware associated with each IP.
1787  * Returns 0 on success, negative error code on failure.
1788  */
1789 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1790 {
1791         int i, r;
1792
1793         r = amdgpu_ras_init(adev);
1794         if (r)
1795                 return r;
1796
1797         for (i = 0; i < adev->num_ip_blocks; i++) {
1798                 if (!adev->ip_blocks[i].status.valid)
1799                         continue;
1800                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1801                 if (r) {
1802                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1803                                   adev->ip_blocks[i].version->funcs->name, r);
1804                         goto init_failed;
1805                 }
1806                 adev->ip_blocks[i].status.sw = true;
1807
1808                 /* need to do gmc hw init early so we can allocate gpu mem */
1809                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1810                         r = amdgpu_device_vram_scratch_init(adev);
1811                         if (r) {
1812                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
1813                                 goto init_failed;
1814                         }
1815                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
1816                         if (r) {
1817                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
1818                                 goto init_failed;
1819                         }
1820                         r = amdgpu_device_wb_init(adev);
1821                         if (r) {
1822                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
1823                                 goto init_failed;
1824                         }
1825                         adev->ip_blocks[i].status.hw = true;
1826
1827                         /* right after GMC hw init, we create CSA */
1828                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
1829                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
1830                                                                 AMDGPU_GEM_DOMAIN_VRAM,
1831                                                                 AMDGPU_CSA_SIZE);
1832                                 if (r) {
1833                                         DRM_ERROR("allocate CSA failed %d\n", r);
1834                                         goto init_failed;
1835                                 }
1836                         }
1837                 }
1838         }
1839
1840         r = amdgpu_ib_pool_init(adev);
1841         if (r) {
1842                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
1843                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
1844                 goto init_failed;
1845         }
1846
1847         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
1848         if (r)
1849                 goto init_failed;
1850
1851         r = amdgpu_device_ip_hw_init_phase1(adev);
1852         if (r)
1853                 goto init_failed;
1854
1855         r = amdgpu_device_fw_loading(adev);
1856         if (r)
1857                 goto init_failed;
1858
1859         r = amdgpu_device_ip_hw_init_phase2(adev);
1860         if (r)
1861                 goto init_failed;
1862
1863         if (adev->gmc.xgmi.num_physical_nodes > 1)
1864                 amdgpu_xgmi_add_device(adev);
1865         amdgpu_amdkfd_device_init(adev);
1866
1867 init_failed:
1868         if (amdgpu_sriov_vf(adev)) {
1869                 if (!r)
1870                         amdgpu_virt_init_data_exchange(adev);
1871                 amdgpu_virt_release_full_gpu(adev, true);
1872         }
1873
1874         return r;
1875 }
1876
1877 /**
1878  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
1879  *
1880  * @adev: amdgpu_device pointer
1881  *
1882  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
1883  * this function before a GPU reset.  If the value is retained after a
1884  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
1885  */
1886 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
1887 {
1888         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
1889 }
1890
1891 /**
1892  * amdgpu_device_check_vram_lost - check if vram is valid
1893  *
1894  * @adev: amdgpu_device pointer
1895  *
1896  * Checks the reset magic value written to the gart pointer in VRAM.
1897  * The driver calls this after a GPU reset to see if the contents of
1898  * VRAM is lost or now.
1899  * returns true if vram is lost, false if not.
1900  */
1901 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
1902 {
1903         return !!memcmp(adev->gart.ptr, adev->reset_magic,
1904                         AMDGPU_RESET_MAGIC_NUM);
1905 }
1906
1907 /**
1908  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
1909  *
1910  * @adev: amdgpu_device pointer
1911  *
1912  * The list of all the hardware IPs that make up the asic is walked and the
1913  * set_clockgating_state callbacks are run.
1914  * Late initialization pass enabling clockgating for hardware IPs.
1915  * Fini or suspend, pass disabling clockgating for hardware IPs.
1916  * Returns 0 on success, negative error code on failure.
1917  */
1918
1919 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
1920                                                 enum amd_clockgating_state state)
1921 {
1922         int i, j, r;
1923
1924         if (amdgpu_emu_mode == 1)
1925                 return 0;
1926
1927         for (j = 0; j < adev->num_ip_blocks; j++) {
1928                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
1929                 if (!adev->ip_blocks[i].status.late_initialized)
1930                         continue;
1931                 /* skip CG for VCE/UVD, it's handled specially */
1932                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
1933                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
1934                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
1935                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
1936                         /* enable clockgating to save power */
1937                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
1938                                                                                      state);
1939                         if (r) {
1940                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
1941                                           adev->ip_blocks[i].version->funcs->name, r);
1942                                 return r;
1943                         }
1944                 }
1945         }
1946
1947         return 0;
1948 }
1949
1950 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
1951 {
1952         int i, j, r;
1953
1954         if (amdgpu_emu_mode == 1)
1955                 return 0;
1956
1957         for (j = 0; j < adev->num_ip_blocks; j++) {
1958                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
1959                 if (!adev->ip_blocks[i].status.late_initialized)
1960                         continue;
1961                 /* skip CG for VCE/UVD, it's handled specially */
1962                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
1963                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
1964                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
1965                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
1966                         /* enable powergating to save power */
1967                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
1968                                                                                         state);
1969                         if (r) {
1970                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
1971                                           adev->ip_blocks[i].version->funcs->name, r);
1972                                 return r;
1973                         }
1974                 }
1975         }
1976         return 0;
1977 }
1978
1979 static int amdgpu_device_enable_mgpu_fan_boost(void)
1980 {
1981         struct amdgpu_gpu_instance *gpu_ins;
1982         struct amdgpu_device *adev;
1983         int i, ret = 0;
1984
1985         mutex_lock(&mgpu_info.mutex);
1986
1987         /*
1988          * MGPU fan boost feature should be enabled
1989          * only when there are two or more dGPUs in
1990          * the system
1991          */
1992         if (mgpu_info.num_dgpu < 2)
1993                 goto out;
1994
1995         for (i = 0; i < mgpu_info.num_dgpu; i++) {
1996                 gpu_ins = &(mgpu_info.gpu_ins[i]);
1997                 adev = gpu_ins->adev;
1998                 if (!(adev->flags & AMD_IS_APU) &&
1999                     !gpu_ins->mgpu_fan_enabled &&
2000                     adev->powerplay.pp_funcs &&
2001                     adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2002                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2003                         if (ret)
2004                                 break;
2005
2006                         gpu_ins->mgpu_fan_enabled = 1;
2007                 }
2008         }
2009
2010 out:
2011         mutex_unlock(&mgpu_info.mutex);
2012
2013         return ret;
2014 }
2015
2016 /**
2017  * amdgpu_device_ip_late_init - run late init for hardware IPs
2018  *
2019  * @adev: amdgpu_device pointer
2020  *
2021  * Late initialization pass for hardware IPs.  The list of all the hardware
2022  * IPs that make up the asic is walked and the late_init callbacks are run.
2023  * late_init covers any special initialization that an IP requires
2024  * after all of the have been initialized or something that needs to happen
2025  * late in the init process.
2026  * Returns 0 on success, negative error code on failure.
2027  */
2028 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2029 {
2030         int i = 0, r;
2031
2032         for (i = 0; i < adev->num_ip_blocks; i++) {
2033                 if (!adev->ip_blocks[i].status.hw)
2034                         continue;
2035                 if (adev->ip_blocks[i].version->funcs->late_init) {
2036                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2037                         if (r) {
2038                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2039                                           adev->ip_blocks[i].version->funcs->name, r);
2040                                 return r;
2041                         }
2042                 }
2043                 adev->ip_blocks[i].status.late_initialized = true;
2044         }
2045
2046         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2047         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2048
2049         amdgpu_device_fill_reset_magic(adev);
2050
2051         r = amdgpu_device_enable_mgpu_fan_boost();
2052         if (r)
2053                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2054
2055         /* set to low pstate by default */
2056         amdgpu_xgmi_set_pstate(adev, 0);
2057
2058         return 0;
2059 }
2060
2061 /**
2062  * amdgpu_device_ip_fini - run fini for hardware IPs
2063  *
2064  * @adev: amdgpu_device pointer
2065  *
2066  * Main teardown pass for hardware IPs.  The list of all the hardware
2067  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2068  * are run.  hw_fini tears down the hardware associated with each IP
2069  * and sw_fini tears down any software state associated with each IP.
2070  * Returns 0 on success, negative error code on failure.
2071  */
2072 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2073 {
2074         int i, r;
2075
2076         amdgpu_ras_pre_fini(adev);
2077
2078         if (adev->gmc.xgmi.num_physical_nodes > 1)
2079                 amdgpu_xgmi_remove_device(adev);
2080
2081         amdgpu_amdkfd_device_fini(adev);
2082
2083         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2084         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2085
2086         /* need to disable SMC first */
2087         for (i = 0; i < adev->num_ip_blocks; i++) {
2088                 if (!adev->ip_blocks[i].status.hw)
2089                         continue;
2090                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2091                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2092                         /* XXX handle errors */
2093                         if (r) {
2094                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2095                                           adev->ip_blocks[i].version->funcs->name, r);
2096                         }
2097                         adev->ip_blocks[i].status.hw = false;
2098                         break;
2099                 }
2100         }
2101
2102         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2103                 if (!adev->ip_blocks[i].status.hw)
2104                         continue;
2105
2106                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2107                 /* XXX handle errors */
2108                 if (r) {
2109                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2110                                   adev->ip_blocks[i].version->funcs->name, r);
2111                 }
2112
2113                 adev->ip_blocks[i].status.hw = false;
2114         }
2115
2116
2117         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2118                 if (!adev->ip_blocks[i].status.sw)
2119                         continue;
2120
2121                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2122                         amdgpu_ucode_free_bo(adev);
2123                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2124                         amdgpu_device_wb_fini(adev);
2125                         amdgpu_device_vram_scratch_fini(adev);
2126                         amdgpu_ib_pool_fini(adev);
2127                 }
2128
2129                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2130                 /* XXX handle errors */
2131                 if (r) {
2132                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2133                                   adev->ip_blocks[i].version->funcs->name, r);
2134                 }
2135                 adev->ip_blocks[i].status.sw = false;
2136                 adev->ip_blocks[i].status.valid = false;
2137         }
2138
2139         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2140                 if (!adev->ip_blocks[i].status.late_initialized)
2141                         continue;
2142                 if (adev->ip_blocks[i].version->funcs->late_fini)
2143                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2144                 adev->ip_blocks[i].status.late_initialized = false;
2145         }
2146
2147         amdgpu_ras_fini(adev);
2148
2149         if (amdgpu_sriov_vf(adev))
2150                 if (amdgpu_virt_release_full_gpu(adev, false))
2151                         DRM_ERROR("failed to release exclusive mode on fini\n");
2152
2153         return 0;
2154 }
2155
2156 /**
2157  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2158  *
2159  * @work: work_struct.
2160  */
2161 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2162 {
2163         struct amdgpu_device *adev =
2164                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2165         int r;
2166
2167         r = amdgpu_ib_ring_tests(adev);
2168         if (r)
2169                 DRM_ERROR("ib ring test failed (%d).\n", r);
2170 }
2171
2172 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2173 {
2174         struct amdgpu_device *adev =
2175                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2176
2177         mutex_lock(&adev->gfx.gfx_off_mutex);
2178         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2179                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2180                         adev->gfx.gfx_off_state = true;
2181         }
2182         mutex_unlock(&adev->gfx.gfx_off_mutex);
2183 }
2184
2185 /**
2186  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2187  *
2188  * @adev: amdgpu_device pointer
2189  *
2190  * Main suspend function for hardware IPs.  The list of all the hardware
2191  * IPs that make up the asic is walked, clockgating is disabled and the
2192  * suspend callbacks are run.  suspend puts the hardware and software state
2193  * in each IP into a state suitable for suspend.
2194  * Returns 0 on success, negative error code on failure.
2195  */
2196 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2197 {
2198         int i, r;
2199
2200         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2201         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2202
2203         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2204                 if (!adev->ip_blocks[i].status.valid)
2205                         continue;
2206                 /* displays are handled separately */
2207                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
2208                         /* XXX handle errors */
2209                         r = adev->ip_blocks[i].version->funcs->suspend(adev);
2210                         /* XXX handle errors */
2211                         if (r) {
2212                                 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2213                                           adev->ip_blocks[i].version->funcs->name, r);
2214                         }
2215                 }
2216         }
2217
2218         return 0;
2219 }
2220
2221 /**
2222  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2223  *
2224  * @adev: amdgpu_device pointer
2225  *
2226  * Main suspend function for hardware IPs.  The list of all the hardware
2227  * IPs that make up the asic is walked, clockgating is disabled and the
2228  * suspend callbacks are run.  suspend puts the hardware and software state
2229  * in each IP into a state suitable for suspend.
2230  * Returns 0 on success, negative error code on failure.
2231  */
2232 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2233 {
2234         int i, r;
2235
2236         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2237                 if (!adev->ip_blocks[i].status.valid)
2238                         continue;
2239                 /* displays are handled in phase1 */
2240                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2241                         continue;
2242                 /* XXX handle errors */
2243                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2244                 /* XXX handle errors */
2245                 if (r) {
2246                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2247                                   adev->ip_blocks[i].version->funcs->name, r);
2248                 }
2249                 /* handle putting the SMC in the appropriate state */
2250                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2251                         if (is_support_sw_smu(adev)) {
2252                                 /* todo */
2253                         } else if (adev->powerplay.pp_funcs &&
2254                                    adev->powerplay.pp_funcs->set_mp1_state) {
2255                                 r = adev->powerplay.pp_funcs->set_mp1_state(
2256                                         adev->powerplay.pp_handle,
2257                                         adev->mp1_state);
2258                                 if (r) {
2259                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2260                                                   adev->mp1_state, r);
2261                                 }
2262                         }
2263                 }
2264         }
2265
2266         return 0;
2267 }
2268
/**
 * amdgpu_device_ip_suspend - run suspend for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main suspend function for hardware IPs.  Runs suspend phase 1 (gating off,
 * displays) followed by suspend phase 2 (everything else); phase 2 is
 * skipped when phase 1 reports an error.
 * On an SR-IOV VF, full GPU access is requested before the first phase and
 * is released again on every exit path, including a phase 1 failure.
 * Returns 0 on success, negative error code on failure.
 */
int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
{
	int r;

	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_request_full_gpu(adev, false);

	r = amdgpu_device_ip_suspend_phase1(adev);
	if (!r)
		r = amdgpu_device_ip_suspend_phase2(adev);

	/* balance the request above even if a phase failed */
	if (amdgpu_sriov_vf(adev))
		amdgpu_virt_release_full_gpu(adev, false);

	return r;
}
2297
2298 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2299 {
2300         int i, r;
2301
2302         static enum amd_ip_block_type ip_order[] = {
2303                 AMD_IP_BLOCK_TYPE_GMC,
2304                 AMD_IP_BLOCK_TYPE_COMMON,
2305                 AMD_IP_BLOCK_TYPE_PSP,
2306                 AMD_IP_BLOCK_TYPE_IH,
2307         };
2308
2309         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2310                 int j;
2311                 struct amdgpu_ip_block *block;
2312
2313                 for (j = 0; j < adev->num_ip_blocks; j++) {
2314                         block = &adev->ip_blocks[j];
2315
2316                         if (block->version->type != ip_order[i] ||
2317                                 !block->status.valid)
2318                                 continue;
2319
2320                         r = block->version->funcs->hw_init(adev);
2321                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2322                         if (r)
2323                                 return r;
2324                 }
2325         }
2326
2327         return 0;
2328 }
2329
2330 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2331 {
2332         int i, r;
2333
2334         static enum amd_ip_block_type ip_order[] = {
2335                 AMD_IP_BLOCK_TYPE_SMC,
2336                 AMD_IP_BLOCK_TYPE_DCE,
2337                 AMD_IP_BLOCK_TYPE_GFX,
2338                 AMD_IP_BLOCK_TYPE_SDMA,
2339                 AMD_IP_BLOCK_TYPE_UVD,
2340                 AMD_IP_BLOCK_TYPE_VCE
2341         };
2342
2343         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2344                 int j;
2345                 struct amdgpu_ip_block *block;
2346
2347                 for (j = 0; j < adev->num_ip_blocks; j++) {
2348                         block = &adev->ip_blocks[j];
2349
2350                         if (block->version->type != ip_order[i] ||
2351                                 !block->status.valid)
2352                                 continue;
2353
2354                         r = block->version->funcs->hw_init(adev);
2355                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2356                         if (r)
2357                                 return r;
2358                 }
2359         }
2360
2361         return 0;
2362 }
2363
2364 /**
2365  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2366  *
2367  * @adev: amdgpu_device pointer
2368  *
2369  * First resume function for hardware IPs.  The list of all the hardware
2370  * IPs that make up the asic is walked and the resume callbacks are run for
2371  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2372  * after a suspend and updates the software state as necessary.  This
2373  * function is also used for restoring the GPU after a GPU reset.
2374  * Returns 0 on success, negative error code on failure.
2375  */
2376 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2377 {
2378         int i, r;
2379
2380         for (i = 0; i < adev->num_ip_blocks; i++) {
2381                 if (!adev->ip_blocks[i].status.valid)
2382                         continue;
2383                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2384                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2385                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2386                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2387                         if (r) {
2388                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2389                                           adev->ip_blocks[i].version->funcs->name, r);
2390                                 return r;
2391                         }
2392                 }
2393         }
2394
2395         return 0;
2396 }
2397
2398 /**
2399  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2400  *
2401  * @adev: amdgpu_device pointer
2402  *
2403  * Second resume function for hardware IPs.  The list of all the hardware
2404  * IPs that make up the asic is walked and the resume callbacks are run for
2405  * all valid blocks except COMMON, GMC, IH, and PSP.  resume puts the hardware
2406  * into a functional state after a suspend and updates the software state as
2407  * necessary.  This function is also used for restoring the GPU after a GPU
2408  * reset.  The walk stops at the first block whose resume callback fails.
2409  * Returns 0 on success, or the non-zero code of the first failing callback.
2410  */
2411 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2412 {
2413         int i, r;
2414
2415         for (i = 0; i < adev->num_ip_blocks; i++) {
2416                 if (!adev->ip_blocks[i].status.valid)
2417                         continue;
2418                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2419                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2420                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2421                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2422                         continue; /* not resumed here; presumably covered by phase1 / fw loading - TODO confirm */
2423                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2424                 if (r) {
2425                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2426                                   adev->ip_blocks[i].version->funcs->name, r);
2427                         return r;
2428                 }
2429         }
2430
2431         return 0;
2432 }
2433
2434 /**
2435  * amdgpu_device_ip_resume - run resume for hardware IPs
2436  *
2437  * @adev: amdgpu_device pointer
2438  *
2439  * Main resume function for hardware IPs.  The hardware IPs
2440  * are split into two resume functions because they are
2441  * also used in recovering from a GPU reset and some additional
2442  * steps need to be taken between them.  In this case (S3/S4) they are
2443  * run sequentially, with amdgpu_device_fw_loading() called in between.
2444  * Returns 0 on success, or the first non-zero code returned by one of the three steps.
2445  */
2446 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2447 {
2448         int r;
2449
2450         r = amdgpu_device_ip_resume_phase1(adev);
2451         if (r)
2452                 return r;
2453
2454         r = amdgpu_device_fw_loading(adev);
2455         if (r)
2456                 return r;
2457
2458         r = amdgpu_device_ip_resume_phase2(adev);
2459
2460         return r;
2461 }
2462
2463 /**
2464  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2465  *
2466  * @adev: amdgpu_device pointer
2467  *
2468  * Query the VBIOS data tables to determine if the board supports SR-IOV.  Does nothing unless amdgpu_sriov_vf(adev) is true.
2469  */
2470 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2471 {
2472         if (amdgpu_sriov_vf(adev)) {
2473                 if (adev->is_atom_fw) { /* pick the query matching the VBIOS flavour */
2474                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2475                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2476                 } else {
2477                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2478                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2479                 }
2480                 /* capability bit still clear: report AMDGIM_ERROR_VF_NO_VBIOS via amdgpu_vf_error_put() */
2481                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2482                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2483         }
2484 }
2485
2486 /**
2487  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2488  *
2489  * @asic_type: AMD asic type
2490  *
2491  * Check if there is DC (new modesetting infrastructure) support for an asic.
2492  * Returns true if DC has support, false if not.  The answer also depends on amdgpu_dc and on the CONFIG_DRM_AMD_DC* options.
2493  */
2494 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2495 {
2496         switch (asic_type) {
2497 #if defined(CONFIG_DRM_AMD_DC)
2498         case CHIP_BONAIRE:
2499         case CHIP_KAVERI:
2500         case CHIP_KABINI:
2501         case CHIP_MULLINS:
2502                 /*
2503                  * We have systems in the wild with these ASICs that require
2504                  * LVDS and VGA support which is not supported with DC.
2505                  *
2506                  * Fallback to the non-DC driver here by default so as not to
2507                  * cause regressions.
2508                  */
2509                 return amdgpu_dc > 0; /* DC only when amdgpu_dc is positive */
2510         case CHIP_HAWAII:
2511         case CHIP_CARRIZO:
2512         case CHIP_STONEY:
2513         case CHIP_POLARIS10:
2514         case CHIP_POLARIS11:
2515         case CHIP_POLARIS12:
2516         case CHIP_VEGAM:
2517         case CHIP_TONGA:
2518         case CHIP_FIJI:
2519         case CHIP_VEGA10:
2520         case CHIP_VEGA12:
2521         case CHIP_VEGA20:
2522 #if defined(CONFIG_DRM_AMD_DC_DCN1_0)
2523         case CHIP_RAVEN:
2524 #endif
2525 #if defined(CONFIG_DRM_AMD_DC_DCN2_0)
2526         case CHIP_NAVI10:
2527         case CHIP_NAVI14:
2528 #endif
2529                 return amdgpu_dc != 0; /* DC unless amdgpu_dc is exactly 0 */
2530 #endif
2531         default:
2532                 return false; /* also reached for every asic when CONFIG_DRM_AMD_DC is not defined */
2533         }
2534 }
2535
2536 /**
2537  * amdgpu_device_has_dc_support - check if dc is supported
2538  *
2539  * @adev: amdgpu_device pointer
2540  *
2541  * Returns false for SR-IOV VFs, otherwise the result of amdgpu_device_asic_has_dc_support() for adev->asic_type.
2542  */
2543 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2544 {
2545         if (amdgpu_sriov_vf(adev))
2546                 return false;
2547
2548         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2549 }
2550
2551 /* Work handler for adev->xgmi_reset_work: runs amdgpu_asic_reset() and stores the result in adev->asic_reset_res. */
2552 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2553 {
2554         struct amdgpu_device *adev =
2555                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2556
2557         adev->asic_reset_res = amdgpu_asic_reset(adev);
2558         if (adev->asic_reset_res)
2559                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s\n",
2560                          adev->asic_reset_res, adev->ddev->unique);
2561 }
2562
2563
2564 /**
2565  * amdgpu_device_init - initialize the driver
2566  *
2567  * @adev: amdgpu_device pointer
2568  * @ddev: drm dev pointer
2569  * @pdev: pci dev pointer
2570  * @flags: driver flags; the AMD_ASIC_MASK bits select adev->asic_type
2571  *
2572  * Initializes the driver info and hw (all asics): software state and locks,
2573  * register mappings, IP early/main/late init, vBIOS post, fence driver, sysfs
2574  * and debugfs files.  Called at driver startup.  Returns 0 for success or an error on failure.
2575  */
2576 int amdgpu_device_init(struct amdgpu_device *adev,
2577                        struct drm_device *ddev,
2578                        struct pci_dev *pdev,
2579                        uint32_t flags)
2580 {
2581         int r, i;
2582         bool runtime = false;
2583         u32 max_MBps;
2584
2585         adev->shutdown = false;
2586         adev->dev = &pdev->dev;
2587         adev->ddev = ddev;
2588         adev->pdev = pdev;
2589         adev->flags = flags;
2590         adev->asic_type = flags & AMD_ASIC_MASK;
2591         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2592         if (amdgpu_emu_mode == 1)
2593                 adev->usec_timeout *= 2;
2594         adev->gmc.gart_size = 512 * 1024 * 1024;
2595         adev->accel_working = false;
2596         adev->num_rings = 0;
2597         adev->mman.buffer_funcs = NULL;
2598         adev->mman.buffer_funcs_ring = NULL;
2599         adev->vm_manager.vm_pte_funcs = NULL;
2600         adev->vm_manager.vm_pte_num_rqs = 0;
2601         adev->gmc.gmc_funcs = NULL;
2602         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
2603         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
2604         /* point every indirect register accessor at the amdgpu_*invalid_* handlers */
2605         adev->smc_rreg = &amdgpu_invalid_rreg;
2606         adev->smc_wreg = &amdgpu_invalid_wreg;
2607         adev->pcie_rreg = &amdgpu_invalid_rreg;
2608         adev->pcie_wreg = &amdgpu_invalid_wreg;
2609         adev->pciep_rreg = &amdgpu_invalid_rreg;
2610         adev->pciep_wreg = &amdgpu_invalid_wreg;
2611         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
2612         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
2613         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
2614         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
2615         adev->didt_rreg = &amdgpu_invalid_rreg;
2616         adev->didt_wreg = &amdgpu_invalid_wreg;
2617         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
2618         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
2619         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
2620         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
2621
2622         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
2623                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
2624                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
2625
2626         /* mutex initialization are all done here so we
2627          * can recall function without having locking issues */
2628         atomic_set(&adev->irq.ih.lock, 0);
2629         mutex_init(&adev->firmware.mutex);
2630         mutex_init(&adev->pm.mutex);
2631         mutex_init(&adev->gfx.gpu_clock_mutex);
2632         mutex_init(&adev->srbm_mutex);
2633         mutex_init(&adev->gfx.pipe_reserve_mutex);
2634         mutex_init(&adev->gfx.gfx_off_mutex);
2635         mutex_init(&adev->grbm_idx_mutex);
2636         mutex_init(&adev->mn_lock);
2637         mutex_init(&adev->virt.vf_errors.lock);
2638         hash_init(adev->mn_hash);
2639         mutex_init(&adev->lock_reset);
2640         mutex_init(&adev->virt.dpm_mutex);
2641         mutex_init(&adev->psp.mutex);
2642
2643         r = amdgpu_device_check_arguments(adev);
2644         if (r)
2645                 return r;
2646
2647         spin_lock_init(&adev->mmio_idx_lock);
2648         spin_lock_init(&adev->smc_idx_lock);
2649         spin_lock_init(&adev->pcie_idx_lock);
2650         spin_lock_init(&adev->uvd_ctx_idx_lock);
2651         spin_lock_init(&adev->didt_idx_lock);
2652         spin_lock_init(&adev->gc_cac_idx_lock);
2653         spin_lock_init(&adev->se_cac_idx_lock);
2654         spin_lock_init(&adev->audio_endpt_idx_lock);
2655         spin_lock_init(&adev->mm_stats.lock);
2656
2657         INIT_LIST_HEAD(&adev->shadow_list);
2658         mutex_init(&adev->shadow_list_lock);
2659
2660         INIT_LIST_HEAD(&adev->ring_lru_list);
2661         spin_lock_init(&adev->ring_lru_list_lock);
2662
2663         INIT_DELAYED_WORK(&adev->delayed_init_work,
2664                           amdgpu_device_delayed_init_work_handler);
2665         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
2666                           amdgpu_device_delay_enable_gfx_off);
2667
2668         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
2669
2670         adev->gfx.gfx_off_req_count = 1;
2671         adev->pm.ac_power = power_supply_is_system_supplied() > 0 ? true : false;
2672
2673         /* Registers mapping */
2674         /* TODO: block userspace mapping of io register */
2675         if (adev->asic_type >= CHIP_BONAIRE) { /* register BAR is 5 from BONAIRE on, 2 before */
2676                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
2677                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
2678         } else {
2679                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
2680                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
2681         }
2682
2683         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
2684         if (adev->rmmio == NULL) {
2685                 return -ENOMEM;
2686         }
2687         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
2688         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
2689
2690         /* io port mapping */
2691         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) { /* map the first I/O-port BAR, if any */
2692                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
2693                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
2694                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
2695                         break;
2696                 }
2697         }
2698         if (adev->rio_mem == NULL)
2699                 DRM_INFO("PCI I/O BAR is not found.\n");
2700
2701         /* enable PCIE atomic ops */
2702         r = pci_enable_atomic_ops_to_root(adev->pdev,
2703                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
2704                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
2705         if (r) { /* not fatal: only recorded in have_atomics_support */
2706                 adev->have_atomics_support = false;
2707                 DRM_INFO("PCIE atomic ops is not supported\n");
2708         } else {
2709                 adev->have_atomics_support = true;
2710         }
2711
2712         amdgpu_device_get_pcie_info(adev);
2713
2714         if (amdgpu_mcbp)
2715                 DRM_INFO("MCBP is enabled\n");
2716
2717         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
2718                 adev->enable_mes = true;
2719
2720         if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10) {
2721                 r = amdgpu_discovery_init(adev);
2722                 if (r) {
2723                         dev_err(adev->dev, "amdgpu_discovery_init failed\n");
2724                         return r;
2725                 }
2726         }
2727
2728         /* early init functions */
2729         r = amdgpu_device_ip_early_init(adev);
2730         if (r)
2731                 return r;
2732
2733         /* doorbell bar mapping and doorbell index init*/
2734         amdgpu_device_doorbell_init(adev);
2735
2736         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
2737         /* this will fail for cards that aren't VGA class devices, just
2738          * ignore it */
2739         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
2740
2741         if (amdgpu_device_is_px(ddev))
2742                 runtime = true;
2743         if (!pci_is_thunderbolt_attached(adev->pdev))
2744                 vga_switcheroo_register_client(adev->pdev,
2745                                                &amdgpu_switcheroo_ops, runtime);
2746         if (runtime)
2747                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
2748
2749         if (amdgpu_emu_mode == 1) {
2750                 /* post the asic on emulation mode */
2751                 emu_soc_asic_init(adev);
2752                 goto fence_driver_init; /* skips SR-IOV detection, reset, vBIOS post and clock info */
2753         }
2754
2755         /* detect if we are with an SRIOV vbios */
2756         amdgpu_device_detect_sriov_bios(adev);
2757
2758         /* check if we need to reset the asic
2759          *  E.g., driver was not cleanly unloaded previously, etc.
2760          */
2761         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
2762                 r = amdgpu_asic_reset(adev);
2763                 if (r) {
2764                         dev_err(adev->dev, "asic reset on init failed\n");
2765                         goto failed;
2766                 }
2767         }
2768
2769         /* Post card if necessary */
2770         if (amdgpu_device_need_post(adev)) {
2771                 if (!adev->bios) {
2772                         dev_err(adev->dev, "no vBIOS found\n");
2773                         r = -EINVAL;
2774                         goto failed;
2775                 }
2776                 DRM_INFO("GPU posting now...\n");
2777                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
2778                 if (r) {
2779                         dev_err(adev->dev, "gpu post error!\n");
2780                         goto failed;
2781                 }
2782         }
2783
2784         if (adev->is_atom_fw) {
2785                 /* Initialize clocks */
2786                 r = amdgpu_atomfirmware_get_clock_info(adev);
2787                 if (r) {
2788                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
2789                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
2790                         goto failed;
2791                 }
2792         } else {
2793                 /* Initialize clocks */
2794                 r = amdgpu_atombios_get_clock_info(adev);
2795                 if (r) {
2796                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
2797                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
2798                         goto failed;
2799                 }
2800                 /* init i2c buses */
2801                 if (!amdgpu_device_has_dc_support(adev))
2802                         amdgpu_atombios_i2c_init(adev);
2803         }
2804
2805 fence_driver_init:
2806         /* Fence driver */
2807         r = amdgpu_fence_driver_init(adev);
2808         if (r) {
2809                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
2810                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
2811                 goto failed;
2812         }
2813
2814         /* init the mode config */
2815         drm_mode_config_init(adev->ddev);
2816
2817         r = amdgpu_device_ip_init(adev);
2818         if (r) {
2819                 /* failed in exclusive mode due to timeout */
2820                 if (amdgpu_sriov_vf(adev) &&
2821                     !amdgpu_sriov_runtime(adev) &&
2822                     amdgpu_virt_mmio_blocked(adev) &&
2823                     !amdgpu_virt_wait_reset(adev)) {
2824                         dev_err(adev->dev, "VF exclusive mode timeout\n");
2825                         /* Don't send request since VF is inactive. */
2826                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
2827                         adev->virt.ops = NULL;
2828                         r = -EAGAIN;
2829                         goto failed;
2830                 }
2831                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
2832                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
2833                 if (amdgpu_virt_request_full_gpu(adev, false))
2834                         amdgpu_virt_release_full_gpu(adev, false);
2835                 goto failed;
2836         }
2837
2838         adev->accel_working = true;
2839
2840         amdgpu_vm_check_compute_bug(adev);
2841
2842         /* Initialize the buffer migration limit. */
2843         if (amdgpu_moverate >= 0)
2844                 max_MBps = amdgpu_moverate;
2845         else
2846                 max_MBps = 8; /* Allow 8 MB/s. */
2847         /* Get a log2 for easy divisions. */
2848         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
2849
2850         amdgpu_fbdev_init(adev);
2851
2852         if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev))
2853                 amdgpu_pm_virt_sysfs_init(adev);
2854         /* the sysfs/debugfs registrations below are best effort: failures are logged only */
2855         r = amdgpu_pm_sysfs_init(adev);
2856         if (r)
2857                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
2858
2859         r = amdgpu_ucode_sysfs_init(adev);
2860         if (r)
2861                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
2862
2863         r = amdgpu_debugfs_gem_init(adev);
2864         if (r)
2865                 DRM_ERROR("registering gem debugfs failed (%d).\n", r);
2866
2867         r = amdgpu_debugfs_regs_init(adev);
2868         if (r)
2869                 DRM_ERROR("registering register debugfs failed (%d).\n", r);
2870
2871         r = amdgpu_debugfs_firmware_init(adev);
2872         if (r)
2873                 DRM_ERROR("registering firmware debugfs failed (%d).\n", r);
2874
2875         r = amdgpu_debugfs_init(adev);
2876         if (r)
2877                 DRM_ERROR("Creating debugfs files failed (%d).\n", r);
2878
2879         if ((amdgpu_testing & 1)) {
2880                 if (adev->accel_working)
2881                         amdgpu_test_moves(adev);
2882                 else
2883                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
2884         }
2885         if (amdgpu_benchmarking) {
2886                 if (adev->accel_working)
2887                         amdgpu_benchmark(adev, amdgpu_benchmarking);
2888                 else
2889                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
2890         }
2891
2892         /* enable clockgating, etc. after ib tests, etc. since some blocks require
2893          * explicit gating rather than handling it automatically.
2894          */
2895         r = amdgpu_device_ip_late_init(adev);
2896         if (r) {
2897                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
2898                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
2899                 goto failed;
2900         }
2901
2902         /* must succeed. */
2903         amdgpu_ras_resume(adev);
2904
2905         queue_delayed_work(system_wq, &adev->delayed_init_work,
2906                            msecs_to_jiffies(AMDGPU_RESUME_MS));
2907
2908         r = device_create_file(adev->dev, &dev_attr_pcie_replay_count);
2909         if (r) {
2910                 dev_err(adev->dev, "Could not create pcie_replay_count\n");
2911                 goto failed; /* take the same unwind path as every other late failure */
2912         }
2913
2914         if (IS_ENABLED(CONFIG_PERF_EVENTS))
2915                 r = amdgpu_pmu_init(adev);
2916         if (r)
2917                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
2918
2919         return 0; /* a PMU init failure is logged above but does not fail init */
2920
2921 failed:
2922         amdgpu_vf_error_trans_all(adev);
2923         if (runtime)
2924                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
2925
2926         return r;
2927 }
2928
2929 /**
2930  * amdgpu_device_fini - tear down the driver
2931  *
2932  * @adev: amdgpu_device pointer
2933  *
2934  * Tear down the driver info (all asics): disables interrupts and displays, finalizes the IP blocks, releases firmware and vBIOS data, and unmaps the register BARs.
2935  * Called at driver shutdown.
2936  */
2937 void amdgpu_device_fini(struct amdgpu_device *adev)
2938 {
2939         int r;
2940
2941         DRM_INFO("amdgpu: finishing device.\n");
2942         adev->shutdown = true;
2943         /* disable all interrupts */
2944         amdgpu_irq_disable_all(adev);
2945         if (adev->mode_info.mode_config_initialized){
2946                 if (!amdgpu_device_has_dc_support(adev))
2947                         drm_helper_force_disable_all(adev->ddev);
2948                 else
2949                         drm_atomic_helper_shutdown(adev->ddev);
2950         }
2951         amdgpu_fence_driver_fini(adev);
2952         amdgpu_pm_sysfs_fini(adev);
2953         amdgpu_fbdev_fini(adev);
2954         r = amdgpu_device_ip_fini(adev); /* NOTE(review): r is stored but never read afterwards */
2955         if (adev->firmware.gpu_info_fw) {
2956                 release_firmware(adev->firmware.gpu_info_fw);
2957                 adev->firmware.gpu_info_fw = NULL;
2958         }
2959         adev->accel_working = false;
2960         cancel_delayed_work_sync(&adev->delayed_init_work);
2961         /* free i2c buses */
2962         if (!amdgpu_device_has_dc_support(adev))
2963                 amdgpu_i2c_fini(adev);
2964
2965         if (amdgpu_emu_mode != 1)
2966                 amdgpu_atombios_fini(adev);
2967
2968         kfree(adev->bios);
2969         adev->bios = NULL;
2970         if (!pci_is_thunderbolt_attached(adev->pdev)) /* same condition as the registration in amdgpu_device_init() */
2971                 vga_switcheroo_unregister_client(adev->pdev);
2972         if (adev->flags & AMD_IS_PX)
2973                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
2974         vga_client_register(adev->pdev, NULL, NULL, NULL);
2975         if (adev->rio_mem)
2976                 pci_iounmap(adev->pdev, adev->rio_mem);
2977         adev->rio_mem = NULL;
2978         iounmap(adev->rmmio);
2979         adev->rmmio = NULL;
2980         amdgpu_device_doorbell_fini(adev);
2981         if (amdgpu_sriov_vf(adev) && amdgim_is_hwperf(adev))
2982                 amdgpu_pm_virt_sysfs_fini(adev);
2983
2984         amdgpu_debugfs_regs_cleanup(adev);
2985         device_remove_file(adev->dev, &dev_attr_pcie_replay_count);
2986         amdgpu_ucode_sysfs_fini(adev);
2987         if (IS_ENABLED(CONFIG_PERF_EVENTS))
2988                 amdgpu_pmu_fini(adev);
2989         amdgpu_debugfs_preempt_cleanup(adev);
2990         if (amdgpu_discovery && adev->asic_type >= CHIP_NAVI10)
2991                 amdgpu_discovery_fini(adev);
2992 }
2993
2994
2995 /*
2996  * Suspend & resume.
2997  */
2998 /**
2999  * amdgpu_device_suspend - initiate device suspend
3000  *
3001  * @dev: drm dev pointer
3002  * @suspend: if true, disable the PCI device and enter D3hot; if false, reset the asic instead
3003  * @fbcon : notify the fbdev of suspend
3004  *
3005  * Puts the hw in the suspend state (all asics).
3006  * Returns -ENODEV if @dev or its dev_private is NULL, otherwise 0; errors from the IP suspend phases and the asic reset are not propagated.
3007  * Called at driver suspend.
3008  */
3009 int amdgpu_device_suspend(struct drm_device *dev, bool suspend, bool fbcon)
3010 {
3011         struct amdgpu_device *adev;
3012         struct drm_crtc *crtc;
3013         struct drm_connector *connector;
3014         int r;
3015
3016         if (dev == NULL || dev->dev_private == NULL) {
3017                 return -ENODEV;
3018         }
3019
3020         adev = dev->dev_private;
3021
3022         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3023                 return 0;
3024
3025         adev->in_suspend = true;
3026         drm_kms_helper_poll_disable(dev);
3027
3028         if (fbcon)
3029                 amdgpu_fbdev_set_suspend(adev, 1);
3030
3031         cancel_delayed_work_sync(&adev->delayed_init_work);
3032
3033         if (!amdgpu_device_has_dc_support(adev)) {
3034                 /* turn off display hw */
3035                 drm_modeset_lock_all(dev);
3036                 list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
3037                         drm_helper_connector_dpms(connector, DRM_MODE_DPMS_OFF);
3038                 }
3039                 drm_modeset_unlock_all(dev);
3040                         /* unpin the front buffers and cursors */
3041                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3042                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3043                         struct drm_framebuffer *fb = crtc->primary->fb;
3044                         struct amdgpu_bo *robj;
3045
3046                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3047                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3048                                 r = amdgpu_bo_reserve(aobj, true);
3049                                 if (r == 0) { /* a failed reserve silently leaves the cursor BO pinned */
3050                                         amdgpu_bo_unpin(aobj);
3051                                         amdgpu_bo_unreserve(aobj);
3052                                 }
3053                         }
3054
3055                         if (fb == NULL || fb->obj[0] == NULL) {
3056                                 continue;
3057                         }
3058                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3059                         /* don't unpin kernel fb objects */
3060                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3061                                 r = amdgpu_bo_reserve(robj, true);
3062                                 if (r == 0) {
3063                                         amdgpu_bo_unpin(robj);
3064                                         amdgpu_bo_unreserve(robj);
3065                                 }
3066                         }
3067                 }
3068         }
3069
3070         amdgpu_amdkfd_suspend(adev);
3071
3072         amdgpu_ras_suspend(adev);
3073
3074         r = amdgpu_device_ip_suspend_phase1(adev); /* NOTE(review): result is overwritten below without being checked */
3075
3076         /* evict vram memory */
3077         amdgpu_bo_evict_vram(adev);
3078
3079         amdgpu_fence_driver_suspend(adev);
3080
3081         r = amdgpu_device_ip_suspend_phase2(adev); /* NOTE(review): result is never checked either */
3082
3083         /* evict remaining vram memory
3084          * This second call to evict vram is to evict the gart page table
3085          * using the CPU.
3086          */
3087         amdgpu_bo_evict_vram(adev);
3088
3089         pci_save_state(dev->pdev);
3090         if (suspend) {
3091                 /* Shut down the device */
3092                 pci_disable_device(dev->pdev);
3093                 pci_set_power_state(dev->pdev, PCI_D3hot);
3094         } else {
3095                 r = amdgpu_asic_reset(adev);
3096                 if (r)
3097                         DRM_ERROR("amdgpu asic reset failed\n");
3098         }
3099
3100         return 0;
3101 }
3102
3103 /**
3104  * amdgpu_device_resume - initiate device resume
3105  *
3106  * @dev: drm dev pointer
3107  * @resume: if true, first set the PCI device to D0, restore its state and re-enable it
3108  * @fbcon : notify the fbdev of resume
3109  *
3110  * Bring the hw back to operating state (all asics).
3111  * Returns 0 for success (or when the device is switched off), or the error from PCI enable, IP resume, IP late init or amdkfd resume.
3112  * Called at driver resume.
3113  */
3114 int amdgpu_device_resume(struct drm_device *dev, bool resume, bool fbcon)
3115 {
3116         struct drm_connector *connector;
3117         struct amdgpu_device *adev = dev->dev_private;
3118         struct drm_crtc *crtc;
3119         int r = 0;
3120
3121         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3122                 return 0;
3123
3124         if (resume) {
3125                 pci_set_power_state(dev->pdev, PCI_D0);
3126                 pci_restore_state(dev->pdev);
3127                 r = pci_enable_device(dev->pdev);
3128                 if (r)
3129                         return r;
3130         }
3131
3132         /* post card */
3133         if (amdgpu_device_need_post(adev)) {
3134                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3135                 if (r)
3136                         DRM_ERROR("amdgpu asic init failed\n"); /* logged only; resume carries on */
3137         }
3138
3139         r = amdgpu_device_ip_resume(adev);
3140         if (r) {
3141                 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3142                 return r;
3143         }
3144         amdgpu_fence_driver_resume(adev);
3145
3146
3147         r = amdgpu_device_ip_late_init(adev);
3148         if (r)
3149                 return r;
3150
3151         queue_delayed_work(system_wq, &adev->delayed_init_work,
3152                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3153
3154         if (!amdgpu_device_has_dc_support(adev)) {
3155                 /* pin cursors */
3156                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3157                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3158
3159                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3160                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3161                                 r = amdgpu_bo_reserve(aobj, true);
3162                                 if (r == 0) {
3163                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3164                                         if (r != 0)
3165                                                 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3166                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj); /* NOTE(review): also executed when the pin above failed */
3167                                         amdgpu_bo_unreserve(aobj);
3168                                 }
3169                         }
3170                 }
3171         }
3172         r = amdgpu_amdkfd_resume(adev);
3173         if (r)
3174                 return r;
3175
3176         /* Make sure IB tests flushed */
3177         flush_delayed_work(&adev->delayed_init_work);
3178
3179         /* blat the mode back in */
3180         if (fbcon) {
3181                 if (!amdgpu_device_has_dc_support(adev)) {
3182                         /* pre DCE11 */
3183                         drm_helper_resume_force_mode(dev);
3184
3185                         /* turn on display hw */
3186                         drm_modeset_lock_all(dev);
3187                         list_for_each_entry(connector, &dev->mode_config.connector_list, head) {
3188                                 drm_helper_connector_dpms(connector, DRM_MODE_DPMS_ON);
3189                         }
3190                         drm_modeset_unlock_all(dev);
3191                 }
3192                 amdgpu_fbdev_set_suspend(adev, 0);
3193         }
3194
3195         drm_kms_helper_poll_enable(dev);
3196
3197         amdgpu_ras_resume(adev);
3198
3199         /*
3200          * Most of the connector probing functions try to acquire runtime pm
3201          * refs to ensure that the GPU is powered on when connector polling is
3202          * performed. Since we're calling this from a runtime PM callback,
3203          * trying to acquire rpm refs will cause us to deadlock.
3204          *
3205          * Since we're guaranteed to be holding the rpm lock, it's safe to
3206          * temporarily disable the rpm helpers so this doesn't deadlock us.
3207          */
3208 #ifdef CONFIG_PM
3209         dev->dev->power.disable_depth++;
3210 #endif
3211         if (!amdgpu_device_has_dc_support(adev))
3212                 drm_helper_hpd_irq_event(dev);
3213         else
3214                 drm_kms_helper_hotplug_event(dev);
3215 #ifdef CONFIG_PM
3216         dev->dev->power.disable_depth--;
3217 #endif
3218         adev->in_suspend = false; /* only reached on success; the error returns above leave it set */
3219
3220         return 0;
3221 }
3222
3223 /**
3224  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3225  *
3226  * @adev: amdgpu_device pointer
3227  *
3228  * The list of all the hardware IPs that make up the asic is walked and
3229  * the check_soft_reset callbacks are run.  check_soft_reset determines
3230  * if the asic is still hung or not.
3231  * Returns true if any of the IPs are still in a hung state, false if not.
3232  */
3233 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3234 {
3235         int i;
3236         bool asic_hang = false;
3237
3238         if (amdgpu_sriov_vf(adev))
3239                 return true;
3240
3241         if (amdgpu_asic_need_full_reset(adev))
3242                 return true;
3243
3244         for (i = 0; i < adev->num_ip_blocks; i++) {
3245                 if (!adev->ip_blocks[i].status.valid)
3246                         continue;
3247                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3248                         adev->ip_blocks[i].status.hang =
3249                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3250                 if (adev->ip_blocks[i].status.hang) {
3251                         DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3252                         asic_hang = true;
3253                 }
3254         }
3255         return asic_hang;
3256 }
3257
3258 /**
3259  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3260  *
3261  * @adev: amdgpu_device pointer
3262  *
3263  * The list of all the hardware IPs that make up the asic is walked and the
3264  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3265  * handles any IP specific hardware or software state changes that are
3266  * necessary for a soft reset to succeed.
3267  * Returns 0 on success, negative error code on failure.
3268  */
3269 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3270 {
3271         int i, r = 0;
3272
3273         for (i = 0; i < adev->num_ip_blocks; i++) {
3274                 if (!adev->ip_blocks[i].status.valid)
3275                         continue;
3276                 if (adev->ip_blocks[i].status.hang &&
3277                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3278                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3279                         if (r)
3280                                 return r;
3281                 }
3282         }
3283
3284         return 0;
3285 }
3286
3287 /**
3288  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3289  *
3290  * @adev: amdgpu_device pointer
3291  *
3292  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3293  * reset is necessary to recover.
3294  * Returns true if a full asic reset is required, false if not.
3295  */
3296 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3297 {
3298         int i;
3299
3300         if (amdgpu_asic_need_full_reset(adev))
3301                 return true;
3302
3303         for (i = 0; i < adev->num_ip_blocks; i++) {
3304                 if (!adev->ip_blocks[i].status.valid)
3305                         continue;
3306                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3307                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3308                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3309                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3310                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3311                         if (adev->ip_blocks[i].status.hang) {
3312                                 DRM_INFO("Some block need full reset!\n");
3313                                 return true;
3314                         }
3315                 }
3316         }
3317         return false;
3318 }
3319
3320 /**
3321  * amdgpu_device_ip_soft_reset - do a soft reset
3322  *
3323  * @adev: amdgpu_device pointer
3324  *
3325  * The list of all the hardware IPs that make up the asic is walked and the
3326  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3327  * IP specific hardware or software state changes that are necessary to soft
3328  * reset the IP.
3329  * Returns 0 on success, negative error code on failure.
3330  */
3331 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3332 {
3333         int i, r = 0;
3334
3335         for (i = 0; i < adev->num_ip_blocks; i++) {
3336                 if (!adev->ip_blocks[i].status.valid)
3337                         continue;
3338                 if (adev->ip_blocks[i].status.hang &&
3339                     adev->ip_blocks[i].version->funcs->soft_reset) {
3340                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3341                         if (r)
3342                                 return r;
3343                 }
3344         }
3345
3346         return 0;
3347 }
3348
3349 /**
3350  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3351  *
3352  * @adev: amdgpu_device pointer
3353  *
3354  * The list of all the hardware IPs that make up the asic is walked and the
3355  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3356  * handles any IP specific hardware or software state changes that are
3357  * necessary after the IP has been soft reset.
3358  * Returns 0 on success, negative error code on failure.
3359  */
3360 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3361 {
3362         int i, r = 0;
3363
3364         for (i = 0; i < adev->num_ip_blocks; i++) {
3365                 if (!adev->ip_blocks[i].status.valid)
3366                         continue;
3367                 if (adev->ip_blocks[i].status.hang &&
3368                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3369                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3370                 if (r)
3371                         return r;
3372         }
3373
3374         return 0;
3375 }
3376
3377 /**
3378  * amdgpu_device_recover_vram - Recover some VRAM contents
3379  *
3380  * @adev: amdgpu_device pointer
3381  *
3382  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3383  * restore things like GPUVM page tables after a GPU reset where
3384  * the contents of VRAM might be lost.
3385  *
3386  * Returns:
3387  * 0 on success, negative error code on failure.
3388  */
3389 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3390 {
3391         struct dma_fence *fence = NULL, *next = NULL;
3392         struct amdgpu_bo *shadow;
3393         long r = 1, tmo;
3394
3395         if (amdgpu_sriov_runtime(adev))
3396                 tmo = msecs_to_jiffies(8000);
3397         else
3398                 tmo = msecs_to_jiffies(100);
3399
3400         DRM_INFO("recover vram bo from shadow start\n");
3401         mutex_lock(&adev->shadow_list_lock);
3402         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3403
3404                 /* No need to recover an evicted BO */
3405                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3406                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3407                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3408                         continue;
3409
3410                 r = amdgpu_bo_restore_shadow(shadow, &next);
3411                 if (r)
3412                         break;
3413
3414                 if (fence) {
3415                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3416                         dma_fence_put(fence);
3417                         fence = next;
3418                         if (tmo == 0) {
3419                                 r = -ETIMEDOUT;
3420                                 break;
3421                         } else if (tmo < 0) {
3422                                 r = tmo;
3423                                 break;
3424                         }
3425                 } else {
3426                         fence = next;
3427                 }
3428         }
3429         mutex_unlock(&adev->shadow_list_lock);
3430
3431         if (fence)
3432                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3433         dma_fence_put(fence);
3434
3435         if (r < 0 || tmo <= 0) {
3436                 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3437                 return -EIO;
3438         }
3439
3440         DRM_INFO("recover vram bo from shadow done\n");
3441         return 0;
3442 }
3443
3444
3445 /**
3446  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3447  *
3448  * @adev: amdgpu device pointer
3449  * @from_hypervisor: request from hypervisor
3450  *
3451  * do VF FLR and reinitialize Asic
3452  * return 0 means succeeded otherwise failed
3453  */
3454 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3455                                      bool from_hypervisor)
3456 {
3457         int r;
3458
3459         if (from_hypervisor)
3460                 r = amdgpu_virt_request_full_gpu(adev, true);
3461         else
3462                 r = amdgpu_virt_reset_gpu(adev);
3463         if (r)
3464                 return r;
3465
3466         amdgpu_amdkfd_pre_reset(adev);
3467
3468         /* Resume IP prior to SMC */
3469         r = amdgpu_device_ip_reinit_early_sriov(adev);
3470         if (r)
3471                 goto error;
3472
3473         /* we need recover gart prior to run SMC/CP/SDMA resume */
3474         amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3475
3476         r = amdgpu_device_fw_loading(adev);
3477         if (r)
3478                 return r;
3479
3480         /* now we are okay to resume SMC/CP/SDMA */
3481         r = amdgpu_device_ip_reinit_late_sriov(adev);
3482         if (r)
3483                 goto error;
3484
3485         amdgpu_irq_gpu_reset_resume_helper(adev);
3486         r = amdgpu_ib_ring_tests(adev);
3487         amdgpu_amdkfd_post_reset(adev);
3488
3489 error:
3490         amdgpu_virt_init_data_exchange(adev);
3491         amdgpu_virt_release_full_gpu(adev, true);
3492         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3493                 atomic_inc(&adev->vram_lost_counter);
3494                 r = amdgpu_device_recover_vram(adev);
3495         }
3496
3497         return r;
3498 }
3499
3500 /**
3501  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3502  *
3503  * @adev: amdgpu device pointer
3504  *
3505  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3506  * a hung GPU.
3507  */
3508 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3509 {
3510         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3511                 DRM_INFO("Timeout, but no hardware hang detected.\n");
3512                 return false;
3513         }
3514
3515         if (amdgpu_gpu_recovery == 0)
3516                 goto disabled;
3517
3518         if (amdgpu_sriov_vf(adev))
3519                 return true;
3520
3521         if (amdgpu_gpu_recovery == -1) {
3522                 switch (adev->asic_type) {
3523                 case CHIP_BONAIRE:
3524                 case CHIP_HAWAII:
3525                 case CHIP_TOPAZ:
3526                 case CHIP_TONGA:
3527                 case CHIP_FIJI:
3528                 case CHIP_POLARIS10:
3529                 case CHIP_POLARIS11:
3530                 case CHIP_POLARIS12:
3531                 case CHIP_VEGAM:
3532                 case CHIP_VEGA20:
3533                 case CHIP_VEGA10:
3534                 case CHIP_VEGA12:
3535                         break;
3536                 default:
3537                         goto disabled;
3538                 }
3539         }
3540
3541         return true;
3542
3543 disabled:
3544                 DRM_INFO("GPU recovery disabled.\n");
3545                 return false;
3546 }
3547
3548
3549 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3550                                         struct amdgpu_job *job,
3551                                         bool *need_full_reset_arg)
3552 {
3553         int i, r = 0;
3554         bool need_full_reset  = *need_full_reset_arg;
3555
3556         /* block all schedulers and reset given job's ring */
3557         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3558                 struct amdgpu_ring *ring = adev->rings[i];
3559
3560                 if (!ring || !ring->sched.thread)
3561                         continue;
3562
3563                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3564                 amdgpu_fence_driver_force_completion(ring);
3565         }
3566
3567         if(job)
3568                 drm_sched_increase_karma(&job->base);
3569
3570         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3571         if (!amdgpu_sriov_vf(adev)) {
3572
3573                 if (!need_full_reset)
3574                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3575
3576                 if (!need_full_reset) {
3577                         amdgpu_device_ip_pre_soft_reset(adev);
3578                         r = amdgpu_device_ip_soft_reset(adev);
3579                         amdgpu_device_ip_post_soft_reset(adev);
3580                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
3581                                 DRM_INFO("soft reset failed, will fallback to full reset!\n");
3582                                 need_full_reset = true;
3583                         }
3584                 }
3585
3586                 if (need_full_reset)
3587                         r = amdgpu_device_ip_suspend(adev);
3588
3589                 *need_full_reset_arg = need_full_reset;
3590         }
3591
3592         return r;
3593 }
3594
3595 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
3596                                struct list_head *device_list_handle,
3597                                bool *need_full_reset_arg)
3598 {
3599         struct amdgpu_device *tmp_adev = NULL;
3600         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
3601         int r = 0;
3602
3603         /*
3604          * ASIC reset has to be done on all HGMI hive nodes ASAP
3605          * to allow proper links negotiation in FW (within 1 sec)
3606          */
3607         if (need_full_reset) {
3608                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3609                         /* For XGMI run all resets in parallel to speed up the process */
3610                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3611                                 if (!queue_work(system_highpri_wq, &tmp_adev->xgmi_reset_work))
3612                                         r = -EALREADY;
3613                         } else
3614                                 r = amdgpu_asic_reset(tmp_adev);
3615
3616                         if (r) {
3617                                 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
3618                                          r, tmp_adev->ddev->unique);
3619                                 break;
3620                         }
3621                 }
3622
3623                 /* For XGMI wait for all PSP resets to complete before proceed */
3624                 if (!r) {
3625                         list_for_each_entry(tmp_adev, device_list_handle,
3626                                             gmc.xgmi.head) {
3627                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
3628                                         flush_work(&tmp_adev->xgmi_reset_work);
3629                                         r = tmp_adev->asic_reset_res;
3630                                         if (r)
3631                                                 break;
3632                                 }
3633                         }
3634
3635                         list_for_each_entry(tmp_adev, device_list_handle,
3636                                         gmc.xgmi.head) {
3637                                 amdgpu_ras_reserve_bad_pages(tmp_adev);
3638                         }
3639                 }
3640         }
3641
3642
3643         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3644                 if (need_full_reset) {
3645                         /* post card */
3646                         if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
3647                                 DRM_WARN("asic atom init failed!");
3648
3649                         if (!r) {
3650                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
3651                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
3652                                 if (r)
3653                                         goto out;
3654
3655                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
3656                                 if (vram_lost) {
3657                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
3658                                         atomic_inc(&tmp_adev->vram_lost_counter);
3659                                 }
3660
3661                                 r = amdgpu_gtt_mgr_recover(
3662                                         &tmp_adev->mman.bdev.man[TTM_PL_TT]);
3663                                 if (r)
3664                                         goto out;
3665
3666                                 r = amdgpu_device_fw_loading(tmp_adev);
3667                                 if (r)
3668                                         return r;
3669
3670                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
3671                                 if (r)
3672                                         goto out;
3673
3674                                 if (vram_lost)
3675                                         amdgpu_device_fill_reset_magic(tmp_adev);
3676
3677                                 /*
3678                                  * Add this ASIC as tracked as reset was already
3679                                  * complete successfully.
3680                                  */
3681                                 amdgpu_register_gpu_instance(tmp_adev);
3682
3683                                 r = amdgpu_device_ip_late_init(tmp_adev);
3684                                 if (r)
3685                                         goto out;
3686
3687                                 /* must succeed. */
3688                                 amdgpu_ras_resume(tmp_adev);
3689
3690                                 /* Update PSP FW topology after reset */
3691                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
3692                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
3693                         }
3694                 }
3695
3696
3697 out:
3698                 if (!r) {
3699                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
3700                         r = amdgpu_ib_ring_tests(tmp_adev);
3701                         if (r) {
3702                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
3703                                 r = amdgpu_device_ip_suspend(tmp_adev);
3704                                 need_full_reset = true;
3705                                 r = -EAGAIN;
3706                                 goto end;
3707                         }
3708                 }
3709
3710                 if (!r)
3711                         r = amdgpu_device_recover_vram(tmp_adev);
3712                 else
3713                         tmp_adev->asic_reset_res = r;
3714         }
3715
3716 end:
3717         *need_full_reset_arg = need_full_reset;
3718         return r;
3719 }
3720
3721 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
3722 {
3723         if (trylock) {
3724                 if (!mutex_trylock(&adev->lock_reset))
3725                         return false;
3726         } else
3727                 mutex_lock(&adev->lock_reset);
3728
3729         atomic_inc(&adev->gpu_reset_counter);
3730         adev->in_gpu_reset = 1;
3731         switch (amdgpu_asic_reset_method(adev)) {
3732         case AMD_RESET_METHOD_MODE1:
3733                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
3734                 break;
3735         case AMD_RESET_METHOD_MODE2:
3736                 adev->mp1_state = PP_MP1_STATE_RESET;
3737                 break;
3738         default:
3739                 adev->mp1_state = PP_MP1_STATE_NONE;
3740                 break;
3741         }
3742         /* Block kfd: SRIOV would do it separately */
3743         if (!amdgpu_sriov_vf(adev))
3744                 amdgpu_amdkfd_pre_reset(adev);
3745
3746         return true;
3747 }
3748
3749 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
3750 {
3751         /*unlock kfd: SRIOV would do it separately */
3752         if (!amdgpu_sriov_vf(adev))
3753                 amdgpu_amdkfd_post_reset(adev);
3754         amdgpu_vf_error_trans_all(adev);
3755         adev->mp1_state = PP_MP1_STATE_NONE;
3756         adev->in_gpu_reset = 0;
3757         mutex_unlock(&adev->lock_reset);
3758 }
3759
3760
3761 /**
3762  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
3763  *
3764  * @adev: amdgpu device pointer
3765  * @job: which job trigger hang
3766  *
3767  * Attempt to reset the GPU if it has hung (all asics).
3768  * Attempt to do soft-reset or full-reset and reinitialize Asic
3769  * Returns 0 for success or an error on failure.
3770  */
3771
3772 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
3773                               struct amdgpu_job *job)
3774 {
3775         struct list_head device_list, *device_list_handle =  NULL;
3776         bool need_full_reset, job_signaled;
3777         struct amdgpu_hive_info *hive = NULL;
3778         struct amdgpu_device *tmp_adev = NULL;
3779         int i, r = 0;
3780
3781         need_full_reset = job_signaled = false;
3782         INIT_LIST_HEAD(&device_list);
3783
3784         dev_info(adev->dev, "GPU reset begin!\n");
3785
3786         cancel_delayed_work_sync(&adev->delayed_init_work);
3787
3788         hive = amdgpu_get_xgmi_hive(adev, false);
3789
3790         /*
3791          * Here we trylock to avoid chain of resets executing from
3792          * either trigger by jobs on different adevs in XGMI hive or jobs on
3793          * different schedulers for same device while this TO handler is running.
3794          * We always reset all schedulers for device and all devices for XGMI
3795          * hive so that should take care of them too.
3796          */
3797
3798         if (hive && !mutex_trylock(&hive->reset_lock)) {
3799                 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
3800                          job->base.id, hive->hive_id);
3801                 return 0;
3802         }
3803
3804         /* Start with adev pre asic reset first for soft reset check.*/
3805         if (!amdgpu_device_lock_adev(adev, !hive)) {
3806                 DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
3807                                          job->base.id);
3808                 return 0;
3809         }
3810
3811         /* Build list of devices to reset */
3812         if  (adev->gmc.xgmi.num_physical_nodes > 1) {
3813                 if (!hive) {
3814                         amdgpu_device_unlock_adev(adev);
3815                         return -ENODEV;
3816                 }
3817
3818                 /*
3819                  * In case we are in XGMI hive mode device reset is done for all the
3820                  * nodes in the hive to retrain all XGMI links and hence the reset
3821                  * sequence is executed in loop on all nodes.
3822                  */
3823                 device_list_handle = &hive->device_list;
3824         } else {
3825                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
3826                 device_list_handle = &device_list;
3827         }
3828
3829         /*
3830          * Mark these ASICs to be reseted as untracked first
3831          * And add them back after reset completed
3832          */
3833         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head)
3834                 amdgpu_unregister_gpu_instance(tmp_adev);
3835
3836         /* block all schedulers and reset given job's ring */
3837         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3838                 /* disable ras on ALL IPs */
3839                 if (amdgpu_device_ip_need_full_reset(tmp_adev))
3840                         amdgpu_ras_suspend(tmp_adev);
3841
3842                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3843                         struct amdgpu_ring *ring = tmp_adev->rings[i];
3844
3845                         if (!ring || !ring->sched.thread)
3846                                 continue;
3847
3848                         drm_sched_stop(&ring->sched, &job->base);
3849                 }
3850         }
3851
3852
3853         /*
3854          * Must check guilty signal here since after this point all old
3855          * HW fences are force signaled.
3856          *
3857          * job->base holds a reference to parent fence
3858          */
3859         if (job && job->base.s_fence->parent &&
3860             dma_fence_is_signaled(job->base.s_fence->parent))
3861                 job_signaled = true;
3862
3863         if (!amdgpu_device_ip_need_full_reset(adev))
3864                 device_list_handle = &device_list;
3865
3866         if (job_signaled) {
3867                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
3868                 goto skip_hw_reset;
3869         }
3870
3871
3872         /* Guilty job will be freed after this*/
3873         r = amdgpu_device_pre_asic_reset(adev,
3874                                          job,
3875                                          &need_full_reset);
3876         if (r) {
3877                 /*TODO Should we stop ?*/
3878                 DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
3879                           r, adev->ddev->unique);
3880                 adev->asic_reset_res = r;
3881         }
3882
3883 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
3884         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3885
3886                 if (tmp_adev == adev)
3887                         continue;
3888
3889                 amdgpu_device_lock_adev(tmp_adev, false);
3890                 r = amdgpu_device_pre_asic_reset(tmp_adev,
3891                                                  NULL,
3892                                                  &need_full_reset);
3893                 /*TODO Should we stop ?*/
3894                 if (r) {
3895                         DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
3896                                   r, tmp_adev->ddev->unique);
3897                         tmp_adev->asic_reset_res = r;
3898                 }
3899         }
3900
3901         /* Actual ASIC resets if needed.*/
3902         /* TODO Implement XGMI hive reset logic for SRIOV */
3903         if (amdgpu_sriov_vf(adev)) {
3904                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
3905                 if (r)
3906                         adev->asic_reset_res = r;
3907         } else {
3908                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
3909                 if (r && r == -EAGAIN)
3910                         goto retry;
3911         }
3912
3913 skip_hw_reset:
3914
3915         /* Post ASIC reset for all devs .*/
3916         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
3917                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3918                         struct amdgpu_ring *ring = tmp_adev->rings[i];
3919
3920                         if (!ring || !ring->sched.thread)
3921                                 continue;
3922
3923                         /* No point to resubmit jobs if we didn't HW reset*/
3924                         if (!tmp_adev->asic_reset_res && !job_signaled)
3925                                 drm_sched_resubmit_jobs(&ring->sched);
3926
3927                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
3928                 }
3929
3930                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
3931                         drm_helper_resume_force_mode(tmp_adev->ddev);
3932                 }
3933
3934                 tmp_adev->asic_reset_res = 0;
3935
3936                 if (r) {
3937                         /* bad news, how to tell it to userspace ? */
3938                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&adev->gpu_reset_counter));
3939                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
3940                 } else {
3941                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&adev->gpu_reset_counter));
3942                 }
3943
3944                 amdgpu_device_unlock_adev(tmp_adev);
3945         }
3946
3947         if (hive)
3948                 mutex_unlock(&hive->reset_lock);
3949
3950         if (r)
3951                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
3952         return r;
3953 }
3954
3955 /**
3956  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
3957  *
3958  * @adev: amdgpu_device pointer
3959  *
3960  * Fetchs and stores in the driver the PCIE capabilities (gen speed
3961  * and lanes) of the slot the device is in. Handles APUs and
3962  * virtualized environments where PCIE config space may not be available.
3963  */
3964 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
3965 {
3966         struct pci_dev *pdev;
3967         enum pci_bus_speed speed_cap, platform_speed_cap;
3968         enum pcie_link_width platform_link_width;
3969
3970         if (amdgpu_pcie_gen_cap)
3971                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
3972
3973         if (amdgpu_pcie_lane_cap)
3974                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
3975
3976         /* covers APUs as well */
3977         if (pci_is_root_bus(adev->pdev->bus)) {
3978                 if (adev->pm.pcie_gen_mask == 0)
3979                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
3980                 if (adev->pm.pcie_mlw_mask == 0)
3981                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
3982                 return;
3983         }
3984
3985         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
3986                 return;
3987
3988         pcie_bandwidth_available(adev->pdev, NULL,
3989                                  &platform_speed_cap, &platform_link_width);
3990
3991         if (adev->pm.pcie_gen_mask == 0) {
3992                 /* asic caps */
3993                 pdev = adev->pdev;
3994                 speed_cap = pcie_get_speed_cap(pdev);
3995                 if (speed_cap == PCI_SPEED_UNKNOWN) {
3996                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
3997                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
3998                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
3999                 } else {
4000                         if (speed_cap == PCIE_SPEED_16_0GT)
4001                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4002                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4003                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4004                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4005                         else if (speed_cap == PCIE_SPEED_8_0GT)
4006                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4007                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4008                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4009                         else if (speed_cap == PCIE_SPEED_5_0GT)
4010                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4011                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4012                         else
4013                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4014                 }
4015                 /* platform caps */
4016                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4017                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4018                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4019                 } else {
4020                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4021                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4022                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4023                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4024                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4025                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4026                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4027                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4028                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4029                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4030                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4031                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4032                         else
4033                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4034
4035                 }
4036         }
4037         if (adev->pm.pcie_mlw_mask == 0) {
4038                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4039                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4040                 } else {
4041                         switch (platform_link_width) {
4042                         case PCIE_LNK_X32:
4043                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4044                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4045                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4046                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4047                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4048                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4049                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4050                                 break;
4051                         case PCIE_LNK_X16:
4052                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4053                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4054                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4055                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4056                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4057                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4058                                 break;
4059                         case PCIE_LNK_X12:
4060                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4061                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4062                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4063                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4064                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4065                                 break;
4066                         case PCIE_LNK_X8:
4067                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4068                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4069                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4070                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4071                                 break;
4072                         case PCIE_LNK_X4:
4073                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4074                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4075                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4076                                 break;
4077                         case PCIE_LNK_X2:
4078                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4079                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4080                                 break;
4081                         case PCIE_LNK_X1:
4082                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4083                                 break;
4084                         default:
4085                                 break;
4086                         }
4087                 }
4088         }
4089 }
4090