drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33 #include <linux/iommu.h>
  34 #include <linux/pci.h>
  35 #include <linux/devcoredump.h>
  36 #include <generated/utsrelease.h>
  37 #include <linux/pci-p2pdma.h>
  38 #include <linux/apple-gmux.h>
  39
  40 #include <drm/drm_aperture.h>
  41 #include <drm/drm_atomic_helper.h>
  42 #include <drm/drm_crtc_helper.h>
  43 #include <drm/drm_fb_helper.h>
  44 #include <drm/drm_probe_helper.h>
  45 #include <drm/amdgpu_drm.h>
  46 #include <linux/vgaarb.h>
  47 #include <linux/vga_switcheroo.h>
  48 #include <linux/efi.h>
  49 #include "amdgpu.h"
  50 #include "amdgpu_trace.h"
  51 #include "amdgpu_i2c.h"
  52 #include "atom.h"
  53 #include "amdgpu_atombios.h"
  54 #include "amdgpu_atomfirmware.h"
  55 #include "amd_pcie.h"
  56 #ifdef CONFIG_DRM_AMDGPU_SI
  57 #include "si.h"
  58 #endif
  59 #ifdef CONFIG_DRM_AMDGPU_CIK
  60 #include "cik.h"
  61 #endif
  62 #include "vi.h"
  63 #include "soc15.h"
  64 #include "nv.h"
  65 #include "bif/bif_4_1_d.h"
  66 #include <linux/firmware.h>
  67 #include "amdgpu_vf_error.h"
  68
  69 #include "amdgpu_amdkfd.h"
  70 #include "amdgpu_pm.h"
  71
  72 #include "amdgpu_xgmi.h"
  73 #include "amdgpu_ras.h"
  74 #include "amdgpu_pmu.h"
  75 #include "amdgpu_fru_eeprom.h"
  76 #include "amdgpu_reset.h"
  77
  78 #include <linux/suspend.h>
  79 #include <drm/task_barrier.h>
  80 #include <linux/pm_runtime.h>
  81
  82 #include <drm/drm_drv.h>
  83
  84 #if IS_ENABLED(CONFIG_X86)
  85 #include <asm/intel-family.h>
  86 #endif
  87
  88 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  89 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  90 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  91 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  92 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  93 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  94 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  95
  96 #define AMDGPU_RESUME_MS                2000
  97 #define AMDGPU_MAX_RETRY_LIMIT          2
  98 #define AMDGPU_RETRY_SRIOV_RESET(r) ((r) == -EBUSY || (r) == -ETIMEDOUT || (r) == -EINVAL)
  99
 100 static const struct drm_driver amdgpu_kms_driver;
 101
 102 const char *amdgpu_asic_name[] = {
 103         "TAHITI",
 104         "PITCAIRN",
 105         "VERDE",
 106         "OLAND",
 107         "HAINAN",
 108         "BONAIRE",
 109         "KAVERI",
 110         "KABINI",
 111         "HAWAII",
 112         "MULLINS",
 113         "TOPAZ",
 114         "TONGA",
 115         "FIJI",
 116         "CARRIZO",
 117         "STONEY",
 118         "POLARIS10",
 119         "POLARIS11",
 120         "POLARIS12",
 121         "VEGAM",
 122         "VEGA10",
 123         "VEGA12",
 124         "VEGA20",
 125         "RAVEN",
 126         "ARCTURUS",
 127         "RENOIR",
 128         "ALDEBARAN",
 129         "NAVI10",
 130         "CYAN_SKILLFISH",
 131         "NAVI14",
 132         "NAVI12",
 133         "SIENNA_CICHLID",
 134         "NAVY_FLOUNDER",
 135         "VANGOGH",
 136         "DIMGREY_CAVEFISH",
 137         "BEIGE_GOBY",
 138         "YELLOW_CARP",
 139         "IP DISCOVERY",
 140         "LAST",
 141 };
 142
 143 /**
 144  * DOC: pcie_replay_count
 145  *
 146  * The amdgpu driver provides a sysfs API for reporting the total number
 147  * of PCIe replays (NAKs)
 148  * The file pcie_replay_count is used for this and returns the total
 149  * number of replays as a sum of the NAKs generated and NAKs received
 150  */
 151
 152 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 153                 struct device_attribute *attr, char *buf)
 154 {
 155         struct drm_device *ddev = dev_get_drvdata(dev);
 156         struct amdgpu_device *adev = drm_to_adev(ddev);
 157         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 158
 159         return sysfs_emit(buf, "%llu\n", cnt);
 160 }
 161
 162 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 163                 amdgpu_device_get_pcie_replay_count, NULL);
 164
 165 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 166
 167 /**
 168  * DOC: product_name
 169  *
 170  * The amdgpu driver provides a sysfs API for reporting the product name
 171  * for the device
 172  * The file product_name is used for this and returns the product name
 173  * as returned from the FRU.
 174  * NOTE: This is only available for certain server cards
 175  */
 176
 177 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 178                 struct device_attribute *attr, char *buf)
 179 {
 180         struct drm_device *ddev = dev_get_drvdata(dev);
 181         struct amdgpu_device *adev = drm_to_adev(ddev);
 182
 183         return sysfs_emit(buf, "%s\n", adev->product_name);
 184 }
 185
 186 static DEVICE_ATTR(product_name, S_IRUGO,
 187                 amdgpu_device_get_product_name, NULL);
 188
 189 /**
 190  * DOC: product_number
 191  *
 192  * The amdgpu driver provides a sysfs API for reporting the part number
 193  * for the device
 194  * The file product_number is used for this and returns the part number
 195  * as returned from the FRU.
 196  * NOTE: This is only available for certain server cards
 197  */
 198
 199 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 200                 struct device_attribute *attr, char *buf)
 201 {
 202         struct drm_device *ddev = dev_get_drvdata(dev);
 203         struct amdgpu_device *adev = drm_to_adev(ddev);
 204
 205         return sysfs_emit(buf, "%s\n", adev->product_number);
 206 }
 207
 208 static DEVICE_ATTR(product_number, S_IRUGO,
 209                 amdgpu_device_get_product_number, NULL);
 210
 211 /**
 212  * DOC: serial_number
 213  *
 214  * The amdgpu driver provides a sysfs API for reporting the serial number
 215  * for the device
 216  * The file serial_number is used for this and returns the serial number
 217  * as returned from the FRU.
 218  * NOTE: This is only available for certain server cards
 219  */
 220
 221 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 222                 struct device_attribute *attr, char *buf)
 223 {
 224         struct drm_device *ddev = dev_get_drvdata(dev);
 225         struct amdgpu_device *adev = drm_to_adev(ddev);
 226
 227         return sysfs_emit(buf, "%s\n", adev->serial);
 228 }
 229
 230 static DEVICE_ATTR(serial_number, S_IRUGO,
 231                 amdgpu_device_get_serial_number, NULL);
 232
 233 /**
 234  * amdgpu_device_supports_px - Is the device a dGPU with ATPX power control
 235  *
 236  * @dev: drm_device pointer
 237  *
 238  * Returns true if the device is a dGPU with ATPX power control,
 239  * otherwise return false.
 240  */
 241 bool amdgpu_device_supports_px(struct drm_device *dev)
 242 {
 243         struct amdgpu_device *adev = drm_to_adev(dev);
 244
 245         if ((adev->flags & AMD_IS_PX) && !amdgpu_is_atpx_hybrid())
 246                 return true;
 247         return false;
 248 }
 249
 250 /**
 251  * amdgpu_device_supports_boco - Is the device a dGPU with ACPI power resources
 252  *
 253  * @dev: drm_device pointer
 254  *
 255  * Returns true if the device is a dGPU with ACPI power control,
 256  * otherwise return false.
 257  */
 258 bool amdgpu_device_supports_boco(struct drm_device *dev)
 259 {
 260         struct amdgpu_device *adev = drm_to_adev(dev);
 261
 262         if (adev->has_pr3 ||
 263             ((adev->flags & AMD_IS_PX) && amdgpu_is_atpx_hybrid()))
 264                 return true;
 265         return false;
 266 }
 267
 268 /**
 269  * amdgpu_device_supports_baco - Does the device support BACO
 270  *
 271  * @dev: drm_device pointer
 272  *
 273  * Returns true if the device supporte BACO,
 274  * otherwise return false.
 275  */
 276 bool amdgpu_device_supports_baco(struct drm_device *dev)
 277 {
 278         struct amdgpu_device *adev = drm_to_adev(dev);
 279
 280         return amdgpu_asic_supports_baco(adev);
 281 }
 282
 283 /**
 284  * amdgpu_device_supports_smart_shift - Is the device dGPU with
 285  * smart shift support
 286  *
 287  * @dev: drm_device pointer
 288  *
 289  * Returns true if the device is a dGPU with Smart Shift support,
 290  * otherwise returns false.
 291  */
 292 bool amdgpu_device_supports_smart_shift(struct drm_device *dev)
 293 {
 294         return (amdgpu_device_supports_boco(dev) &&
 295                 amdgpu_acpi_is_power_shift_control_supported());
 296 }
 297
 298 /*
 299  * VRAM access helper functions
 300  */
 301
 302 /**
 303  * amdgpu_device_mm_access - access vram by MM_INDEX/MM_DATA
 304  *
 305  * @adev: amdgpu_device pointer
 306  * @pos: offset of the buffer in vram
 307  * @buf: virtual address of the buffer in system memory
 308  * @size: read/write size, sizeof(@buf) must > @size
 309  * @write: true - write to vram, otherwise - read from vram
 310  */
 311 void amdgpu_device_mm_access(struct amdgpu_device *adev, loff_t pos,
 312                              void *buf, size_t size, bool write)
 313 {
 314         unsigned long flags;
 315         uint32_t hi = ~0, tmp = 0;
 316         uint32_t *data = buf;
 317         uint64_t last;
 318         int idx;
 319
 320         if (!drm_dev_enter(adev_to_drm(adev), &idx))
 321                 return;
 322
 323         BUG_ON(!IS_ALIGNED(pos, 4) || !IS_ALIGNED(size, 4));
 324
 325         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 326         for (last = pos + size; pos < last; pos += 4) {
 327                 tmp = pos >> 31;
 328
 329                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 330                 if (tmp != hi) {
 331                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 332                         hi = tmp;
 333                 }
 334                 if (write)
 335                         WREG32_NO_KIQ(mmMM_DATA, *data++);
 336                 else
 337                         *data++ = RREG32_NO_KIQ(mmMM_DATA);
 338         }
 339
 340         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 341         drm_dev_exit(idx);
 342 }
 343
 344 /**
 345  * amdgpu_device_aper_access - access vram by vram aperature
 346  *
 347  * @adev: amdgpu_device pointer
 348  * @pos: offset of the buffer in vram
 349  * @buf: virtual address of the buffer in system memory
 350  * @size: read/write size, sizeof(@buf) must > @size
 351  * @write: true - write to vram, otherwise - read from vram
 352  *
 353  * The return value means how many bytes have been transferred.
 354  */
 355 size_t amdgpu_device_aper_access(struct amdgpu_device *adev, loff_t pos,
 356                                  void *buf, size_t size, bool write)
 357 {
 358 #ifdef CONFIG_64BIT
 359         void __iomem *addr;
 360         size_t count = 0;
 361         uint64_t last;
 362
 363         if (!adev->mman.aper_base_kaddr)
 364                 return 0;
 365
 366         last = min(pos + size, adev->gmc.visible_vram_size);
 367         if (last > pos) {
 368                 addr = adev->mman.aper_base_kaddr + pos;
 369                 count = last - pos;
 370
 371                 if (write) {
 372                         memcpy_toio(addr, buf, count);
 373                         mb();
 374                         amdgpu_device_flush_hdp(adev, NULL);
 375                 } else {
 376                         amdgpu_device_invalidate_hdp(adev, NULL);
 377                         mb();
 378                         memcpy_fromio(buf, addr, count);
 379                 }
 380
 381         }
 382
 383         return count;
 384 #else
 385         return 0;
 386 #endif
 387 }
 388
 389 /**
 390  * amdgpu_device_vram_access - read/write a buffer in vram
 391  *
 392  * @adev: amdgpu_device pointer
 393  * @pos: offset of the buffer in vram
 394  * @buf: virtual address of the buffer in system memory
 395  * @size: read/write size, sizeof(@buf) must > @size
 396  * @write: true - write to vram, otherwise - read from vram
 397  */
 398 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 399                                void *buf, size_t size, bool write)
 400 {
 401         size_t count;
 402
 403         /* try to using vram apreature to access vram first */
 404         count = amdgpu_device_aper_access(adev, pos, buf, size, write);
 405         size -= count;
 406         if (size) {
 407                 /* using MM to access rest vram */
 408                 pos += count;
 409                 buf += count;
 410                 amdgpu_device_mm_access(adev, pos, buf, size, write);
 411         }
 412 }
 413
 414 /*
 415  * register access helper functions.
 416  */
 417
 418 /* Check if hw access should be skipped because of hotplug or device error */
 419 bool amdgpu_device_skip_hw_access(struct amdgpu_device *adev)
 420 {
 421         if (adev->no_hw_access)
 422                 return true;
 423
 424 #ifdef CONFIG_LOCKDEP
 425         /*
 426          * This is a bit complicated to understand, so worth a comment. What we assert
 427          * here is that the GPU reset is not running on another thread in parallel.
 428          *
 429          * For this we trylock the read side of the reset semaphore, if that succeeds
 430          * we know that the reset is not running in paralell.
 431          *
 432          * If the trylock fails we assert that we are either already holding the read
 433          * side of the lock or are the reset thread itself and hold the write side of
 434          * the lock.
 435          */
 436         if (in_task()) {
 437                 if (down_read_trylock(&adev->reset_domain->sem))
 438                         up_read(&adev->reset_domain->sem);
 439                 else
 440                         lockdep_assert_held(&adev->reset_domain->sem);
 441         }
 442 #endif
 443         return false;
 444 }
 445
 446 /**
 447  * amdgpu_device_rreg - read a memory mapped IO or indirect register
 448  *
 449  * @adev: amdgpu_device pointer
 450  * @reg: dword aligned register offset
 451  * @acc_flags: access flags which require special behavior
 452  *
 453  * Returns the 32 bit value from the offset specified.
 454  */
 455 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev,
 456                             uint32_t reg, uint32_t acc_flags)
 457 {
 458         uint32_t ret;
 459
 460         if (amdgpu_device_skip_hw_access(adev))
 461                 return 0;
 462
 463         if ((reg * 4) < adev->rmmio_size) {
 464                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 465                     amdgpu_sriov_runtime(adev) &&
 466                     down_read_trylock(&adev->reset_domain->sem)) {
 467                         ret = amdgpu_kiq_rreg(adev, reg);
 468                         up_read(&adev->reset_domain->sem);
 469                 } else {
 470                         ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 471                 }
 472         } else {
 473                 ret = adev->pcie_rreg(adev, reg * 4);
 474         }
 475
 476         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
 477
 478         return ret;
 479 }
 480
/*
 * Byte-wide MMIO register read helper.
 * @offset: byte offset from the start of the MMIO region
 */
 486
 487 /**
 488  * amdgpu_mm_rreg8 - read a memory mapped IO register
 489  *
 490  * @adev: amdgpu_device pointer
 491  * @offset: byte aligned register offset
 492  *
 493  * Returns the 8 bit value from the offset specified.
 494  */
 495 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset)
 496 {
 497         if (amdgpu_device_skip_hw_access(adev))
 498                 return 0;
 499
 500         if (offset < adev->rmmio_size)
 501                 return (readb(adev->rmmio + offset));
 502         BUG();
 503 }
 504
/*
 * Byte-wide MMIO register write helper.
 * @offset: byte offset from the start of the MMIO region
 * @value: the value to be written to the register
 */
 511 /**
 512  * amdgpu_mm_wreg8 - read a memory mapped IO register
 513  *
 514  * @adev: amdgpu_device pointer
 515  * @offset: byte aligned register offset
 516  * @value: 8 bit value to write
 517  *
 518  * Writes the value specified to the offset specified.
 519  */
 520 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value)
 521 {
 522         if (amdgpu_device_skip_hw_access(adev))
 523                 return;
 524
 525         if (offset < adev->rmmio_size)
 526                 writeb(value, adev->rmmio + offset);
 527         else
 528                 BUG();
 529 }
 530
 531 /**
 532  * amdgpu_device_wreg - write to a memory mapped IO or indirect register
 533  *
 534  * @adev: amdgpu_device pointer
 535  * @reg: dword aligned register offset
 536  * @v: 32 bit value to write to the register
 537  * @acc_flags: access flags which require special behavior
 538  *
 539  * Writes the value specified to the offset specified.
 540  */
 541 void amdgpu_device_wreg(struct amdgpu_device *adev,
 542                         uint32_t reg, uint32_t v,
 543                         uint32_t acc_flags)
 544 {
 545         if (amdgpu_device_skip_hw_access(adev))
 546                 return;
 547
 548         if ((reg * 4) < adev->rmmio_size) {
 549                 if (!(acc_flags & AMDGPU_REGS_NO_KIQ) &&
 550                     amdgpu_sriov_runtime(adev) &&
 551                     down_read_trylock(&adev->reset_domain->sem)) {
 552                         amdgpu_kiq_wreg(adev, reg, v);
 553                         up_read(&adev->reset_domain->sem);
 554                 } else {
 555                         writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 556                 }
 557         } else {
 558                 adev->pcie_wreg(adev, reg * 4, v);
 559         }
 560
 561         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
 562 }
 563
 564 /**
 565  * amdgpu_mm_wreg_mmio_rlc -  write register either with direct/indirect mmio or with RLC path if in range
 566  *
 567  * @adev: amdgpu_device pointer
 568  * @reg: mmio/rlc register
 569  * @v: value to write
 570  *
 571  * this function is invoked only for the debugfs register access
 572  */
 573 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev,
 574                              uint32_t reg, uint32_t v,
 575                              uint32_t xcc_id)
 576 {
 577         if (amdgpu_device_skip_hw_access(adev))
 578                 return;
 579
 580         if (amdgpu_sriov_fullaccess(adev) &&
 581             adev->gfx.rlc.funcs &&
 582             adev->gfx.rlc.funcs->is_rlcg_access_range) {
 583                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 584                         return amdgpu_sriov_wreg(adev, reg, v, 0, 0, xcc_id);
 585         } else if ((reg * 4) >= adev->rmmio_size) {
 586                 adev->pcie_wreg(adev, reg * 4, v);
 587         } else {
 588                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 589         }
 590 }
 591
 592 /**
 593  * amdgpu_device_indirect_rreg - read an indirect register
 594  *
 595  * @adev: amdgpu_device pointer
 596  * @reg_addr: indirect register address to read from
 597  *
 598  * Returns the value of indirect register @reg_addr
 599  */
 600 u32 amdgpu_device_indirect_rreg(struct amdgpu_device *adev,
 601                                 u32 reg_addr)
 602 {
 603         unsigned long flags, pcie_index, pcie_data;
 604         void __iomem *pcie_index_offset;
 605         void __iomem *pcie_data_offset;
 606         u32 r;
 607
 608         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 609         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 610
 611         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 612         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 613         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 614
 615         writel(reg_addr, pcie_index_offset);
 616         readl(pcie_index_offset);
 617         r = readl(pcie_data_offset);
 618         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 619
 620         return r;
 621 }
 622
 623 u32 amdgpu_device_indirect_rreg_ext(struct amdgpu_device *adev,
 624                                     u64 reg_addr)
 625 {
 626         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 627         u32 r;
 628         void __iomem *pcie_index_offset;
 629         void __iomem *pcie_index_hi_offset;
 630         void __iomem *pcie_data_offset;
 631
 632         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 633         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 634         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 635                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 636         else
 637                 pcie_index_hi = 0;
 638
 639         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 640         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 641         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 642         if (pcie_index_hi != 0)
 643                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 644                                 pcie_index_hi * 4;
 645
 646         writel(reg_addr, pcie_index_offset);
 647         readl(pcie_index_offset);
 648         if (pcie_index_hi != 0) {
 649                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 650                 readl(pcie_index_hi_offset);
 651         }
 652         r = readl(pcie_data_offset);
 653
 654         /* clear the high bits */
 655         if (pcie_index_hi != 0) {
 656                 writel(0, pcie_index_hi_offset);
 657                 readl(pcie_index_hi_offset);
 658         }
 659
 660         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 661
 662         return r;
 663 }
 664
 665 /**
 666  * amdgpu_device_indirect_rreg64 - read a 64bits indirect register
 667  *
 668  * @adev: amdgpu_device pointer
 669  * @reg_addr: indirect register address to read from
 670  *
 671  * Returns the value of indirect register @reg_addr
 672  */
 673 u64 amdgpu_device_indirect_rreg64(struct amdgpu_device *adev,
 674                                   u32 reg_addr)
 675 {
 676         unsigned long flags, pcie_index, pcie_data;
 677         void __iomem *pcie_index_offset;
 678         void __iomem *pcie_data_offset;
 679         u64 r;
 680
 681         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 682         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 683
 684         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 685         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 686         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 687
 688         /* read low 32 bits */
 689         writel(reg_addr, pcie_index_offset);
 690         readl(pcie_index_offset);
 691         r = readl(pcie_data_offset);
 692         /* read high 32 bits */
 693         writel(reg_addr + 4, pcie_index_offset);
 694         readl(pcie_index_offset);
 695         r |= ((u64)readl(pcie_data_offset) << 32);
 696         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 697
 698         return r;
 699 }
 700
 701 /**
 702  * amdgpu_device_indirect_wreg - write an indirect register address
 703  *
 704  * @adev: amdgpu_device pointer
 705  * @reg_addr: indirect register offset
 706  * @reg_data: indirect register data
 707  *
 708  */
 709 void amdgpu_device_indirect_wreg(struct amdgpu_device *adev,
 710                                  u32 reg_addr, u32 reg_data)
 711 {
 712         unsigned long flags, pcie_index, pcie_data;
 713         void __iomem *pcie_index_offset;
 714         void __iomem *pcie_data_offset;
 715
 716         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 717         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 718
 719         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 720         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 721         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 722
 723         writel(reg_addr, pcie_index_offset);
 724         readl(pcie_index_offset);
 725         writel(reg_data, pcie_data_offset);
 726         readl(pcie_data_offset);
 727         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 728 }
 729
 730 void amdgpu_device_indirect_wreg_ext(struct amdgpu_device *adev,
 731                                      u64 reg_addr, u32 reg_data)
 732 {
 733         unsigned long flags, pcie_index, pcie_index_hi, pcie_data;
 734         void __iomem *pcie_index_offset;
 735         void __iomem *pcie_index_hi_offset;
 736         void __iomem *pcie_data_offset;
 737
 738         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 739         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 740         if (adev->nbio.funcs->get_pcie_index_hi_offset)
 741                 pcie_index_hi = adev->nbio.funcs->get_pcie_index_hi_offset(adev);
 742         else
 743                 pcie_index_hi = 0;
 744
 745         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 746         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 747         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 748         if (pcie_index_hi != 0)
 749                 pcie_index_hi_offset = (void __iomem *)adev->rmmio +
 750                                 pcie_index_hi * 4;
 751
 752         writel(reg_addr, pcie_index_offset);
 753         readl(pcie_index_offset);
 754         if (pcie_index_hi != 0) {
 755                 writel((reg_addr >> 32) & 0xff, pcie_index_hi_offset);
 756                 readl(pcie_index_hi_offset);
 757         }
 758         writel(reg_data, pcie_data_offset);
 759         readl(pcie_data_offset);
 760
 761         /* clear the high bits */
 762         if (pcie_index_hi != 0) {
 763                 writel(0, pcie_index_hi_offset);
 764                 readl(pcie_index_hi_offset);
 765         }
 766
 767         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 768 }
 769
 770 /**
 771  * amdgpu_device_indirect_wreg64 - write a 64bits indirect register address
 772  *
 773  * @adev: amdgpu_device pointer
 774  * @reg_addr: indirect register offset
 775  * @reg_data: indirect register data
 776  *
 777  */
 778 void amdgpu_device_indirect_wreg64(struct amdgpu_device *adev,
 779                                    u32 reg_addr, u64 reg_data)
 780 {
 781         unsigned long flags, pcie_index, pcie_data;
 782         void __iomem *pcie_index_offset;
 783         void __iomem *pcie_data_offset;
 784
 785         pcie_index = adev->nbio.funcs->get_pcie_index_offset(adev);
 786         pcie_data = adev->nbio.funcs->get_pcie_data_offset(adev);
 787
 788         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
 789         pcie_index_offset = (void __iomem *)adev->rmmio + pcie_index * 4;
 790         pcie_data_offset = (void __iomem *)adev->rmmio + pcie_data * 4;
 791
 792         /* write low 32 bits */
 793         writel(reg_addr, pcie_index_offset);
 794         readl(pcie_index_offset);
 795         writel((u32)(reg_data & 0xffffffffULL), pcie_data_offset);
 796         readl(pcie_data_offset);
 797         /* write high 32 bits */
 798         writel(reg_addr + 4, pcie_index_offset);
 799         readl(pcie_index_offset);
 800         writel((u32)(reg_data >> 32), pcie_data_offset);
 801         readl(pcie_data_offset);
 802         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
 803 }
 804
 805 /**
 806  * amdgpu_device_get_rev_id - query device rev_id
 807  *
 808  * @adev: amdgpu_device pointer
 809  *
 810  * Return device rev_id
 811  */
 812 u32 amdgpu_device_get_rev_id(struct amdgpu_device *adev)
 813 {
 814         return adev->nbio.funcs->get_rev_id(adev);
 815 }
 816
 817 /**
 818  * amdgpu_invalid_rreg - dummy reg read function
 819  *
 820  * @adev: amdgpu_device pointer
 821  * @reg: offset of register
 822  *
 823  * Dummy register read function.  Used for register blocks
 824  * that certain asics don't have (all asics).
 825  * Returns the value in the register.
 826  */
 827 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 828 {
 829         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 830         BUG();
 831         return 0;
 832 }
 833
 834 static uint32_t amdgpu_invalid_rreg_ext(struct amdgpu_device *adev, uint64_t reg)
 835 {
 836         DRM_ERROR("Invalid callback to read register 0x%llX\n", reg);
 837         BUG();
 838         return 0;
 839 }
 840
 841 /**
 842  * amdgpu_invalid_wreg - dummy reg write function
 843  *
 844  * @adev: amdgpu_device pointer
 845  * @reg: offset of register
 846  * @v: value to write to the register
 847  *
 848  * Dummy register read function.  Used for register blocks
 849  * that certain asics don't have (all asics).
 850  */
 851 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 852 {
 853         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 854                   reg, v);
 855         BUG();
 856 }
 857
 858 static void amdgpu_invalid_wreg_ext(struct amdgpu_device *adev, uint64_t reg, uint32_t v)
 859 {
 860         DRM_ERROR("Invalid callback to write register 0x%llX with 0x%08X\n",
 861                   reg, v);
 862         BUG();
 863 }
 864
 865 /**
 866  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 867  *
 868  * @adev: amdgpu_device pointer
 869  * @reg: offset of register
 870  *
 871  * Dummy register read function.  Used for register blocks
 872  * that certain asics don't have (all asics).
 873  * Returns the value in the register.
 874  */
 875 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 876 {
 877         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 878         BUG();
 879         return 0;
 880 }
 881
 882 /**
 883  * amdgpu_invalid_wreg64 - dummy reg write function
 884  *
 885  * @adev: amdgpu_device pointer
 886  * @reg: offset of register
 887  * @v: value to write to the register
 888  *
 889  * Dummy register read function.  Used for register blocks
 890  * that certain asics don't have (all asics).
 891  */
 892 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 893 {
 894         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 895                   reg, v);
 896         BUG();
 897 }
 898
 899 /**
 900  * amdgpu_block_invalid_rreg - dummy reg read function
 901  *
 902  * @adev: amdgpu_device pointer
 903  * @block: offset of instance
 904  * @reg: offset of register
 905  *
 906  * Dummy register read function.  Used for register blocks
 907  * that certain asics don't have (all asics).
 908  * Returns the value in the register.
 909  */
 910 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 911                                           uint32_t block, uint32_t reg)
 912 {
 913         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 914                   reg, block);
 915         BUG();
 916         return 0;
 917 }
 918
 919 /**
 920  * amdgpu_block_invalid_wreg - dummy reg write function
 921  *
 922  * @adev: amdgpu_device pointer
 923  * @block: offset of instance
 924  * @reg: offset of register
 925  * @v: value to write to the register
 926  *
 927  * Dummy register read function.  Used for register blocks
 928  * that certain asics don't have (all asics).
 929  */
 930 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 931                                       uint32_t block,
 932                                       uint32_t reg, uint32_t v)
 933 {
 934         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 935                   reg, block, v);
 936         BUG();
 937 }
 938
 939 /**
 940  * amdgpu_device_asic_init - Wrapper for atom asic_init
 941  *
 942  * @adev: amdgpu_device pointer
 943  *
 944  * Does any asic specific work and then calls atom asic init.
 945  */
 946 static int amdgpu_device_asic_init(struct amdgpu_device *adev)
 947 {
 948         amdgpu_asic_pre_asic_init(adev);
 949
 950         if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 3) ||
 951             adev->ip_versions[GC_HWIP][0] >= IP_VERSION(11, 0, 0))
 952                 return amdgpu_atomfirmware_asic_init(adev, true);
 953         else
 954                 return amdgpu_atom_asic_init(adev->mode_info.atom_context);
 955 }
 956
 957 /**
 958  * amdgpu_device_mem_scratch_init - allocate the VRAM scratch page
 959  *
 960  * @adev: amdgpu_device pointer
 961  *
 962  * Allocates a scratch page of VRAM for use by various things in the
 963  * driver.
 964  */
 965 static int amdgpu_device_mem_scratch_init(struct amdgpu_device *adev)
 966 {
 967         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE, PAGE_SIZE,
 968                                        AMDGPU_GEM_DOMAIN_VRAM |
 969                                        AMDGPU_GEM_DOMAIN_GTT,
 970                                        &adev->mem_scratch.robj,
 971                                        &adev->mem_scratch.gpu_addr,
 972                                        (void **)&adev->mem_scratch.ptr);
 973 }
 974
 975 /**
 976  * amdgpu_device_mem_scratch_fini - Free the VRAM scratch page
 977  *
 978  * @adev: amdgpu_device pointer
 979  *
 980  * Frees the VRAM scratch page.
 981  */
 982 static void amdgpu_device_mem_scratch_fini(struct amdgpu_device *adev)
 983 {
 984         amdgpu_bo_free_kernel(&adev->mem_scratch.robj, NULL, NULL);
 985 }
 986
 987 /**
 988  * amdgpu_device_program_register_sequence - program an array of registers.
 989  *
 990  * @adev: amdgpu_device pointer
 991  * @registers: pointer to the register array
 992  * @array_size: size of the register array
 993  *
 994  * Programs an array or registers with and and or masks.
 995  * This is a helper for setting golden registers.
 996  */
 997 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 998                                              const u32 *registers,
 999                                              const u32 array_size)
1000 {
1001         u32 tmp, reg, and_mask, or_mask;
1002         int i;
1003
1004         if (array_size % 3)
1005                 return;
1006
1007         for (i = 0; i < array_size; i += 3) {
1008                 reg = registers[i + 0];
1009                 and_mask = registers[i + 1];
1010                 or_mask = registers[i + 2];
1011
1012                 if (and_mask == 0xffffffff) {
1013                         tmp = or_mask;
1014                 } else {
1015                         tmp = RREG32(reg);
1016                         tmp &= ~and_mask;
1017                         if (adev->family >= AMDGPU_FAMILY_AI)
1018                                 tmp |= (or_mask & and_mask);
1019                         else
1020                                 tmp |= or_mask;
1021                 }
1022                 WREG32(reg, tmp);
1023         }
1024 }
1025
1026 /**
1027  * amdgpu_device_pci_config_reset - reset the GPU
1028  *
1029  * @adev: amdgpu_device pointer
1030  *
1031  * Resets the GPU using the pci config reset sequence.
1032  * Only applicable to asics prior to vega10.
1033  */
1034 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
1035 {
1036         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
1037 }
1038
1039 /**
1040  * amdgpu_device_pci_reset - reset the GPU using generic PCI means
1041  *
1042  * @adev: amdgpu_device pointer
1043  *
1044  * Resets the GPU using generic pci reset interfaces (FLR, SBR, etc.).
1045  */
1046 int amdgpu_device_pci_reset(struct amdgpu_device *adev)
1047 {
1048         return pci_reset_function(adev->pdev);
1049 }
1050
/*
 * amdgpu_device_wb_*()
 * Writeback is the method by which the GPU updates special pages in memory
 * with the status of certain GPU events (fences, ring pointers, etc.).
 */
1056
1057 /**
1058  * amdgpu_device_wb_fini - Disable Writeback and free memory
1059  *
1060  * @adev: amdgpu_device pointer
1061  *
1062  * Disables Writeback and frees the Writeback memory (all asics).
1063  * Used at driver shutdown.
1064  */
1065 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
1066 {
1067         if (adev->wb.wb_obj) {
1068                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
1069                                       &adev->wb.gpu_addr,
1070                                       (void **)&adev->wb.wb);
1071                 adev->wb.wb_obj = NULL;
1072         }
1073 }
1074
1075 /**
1076  * amdgpu_device_wb_init - Init Writeback driver info and allocate memory
1077  *
1078  * @adev: amdgpu_device pointer
1079  *
1080  * Initializes writeback and allocates writeback memory (all asics).
1081  * Used at driver startup.
1082  * Returns 0 on success or an -error on failure.
1083  */
1084 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
1085 {
1086         int r;
1087
1088         if (adev->wb.wb_obj == NULL) {
1089                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
1090                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
1091                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
1092                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
1093                                             (void **)&adev->wb.wb);
1094                 if (r) {
1095                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
1096                         return r;
1097                 }
1098
1099                 adev->wb.num_wb = AMDGPU_MAX_WB;
1100                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
1101
1102                 /* clear wb memory */
1103                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
1104         }
1105
1106         return 0;
1107 }
1108
1109 /**
1110  * amdgpu_device_wb_get - Allocate a wb entry
1111  *
1112  * @adev: amdgpu_device pointer
1113  * @wb: wb index
1114  *
1115  * Allocate a wb slot for use by the driver (all asics).
1116  * Returns 0 on success or -EINVAL on failure.
1117  */
1118 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
1119 {
1120         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
1121
1122         if (offset < adev->wb.num_wb) {
1123                 __set_bit(offset, adev->wb.used);
1124                 *wb = offset << 3; /* convert to dw offset */
1125                 return 0;
1126         } else {
1127                 return -EINVAL;
1128         }
1129 }
1130
1131 /**
1132  * amdgpu_device_wb_free - Free a wb entry
1133  *
1134  * @adev: amdgpu_device pointer
1135  * @wb: wb index
1136  *
1137  * Free a wb slot allocated for use by the driver (all asics)
1138  */
1139 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
1140 {
1141         wb >>= 3;
1142         if (wb < adev->wb.num_wb)
1143                 __clear_bit(wb, adev->wb.used);
1144 }
1145
1146 /**
1147  * amdgpu_device_resize_fb_bar - try to resize FB BAR
1148  *
1149  * @adev: amdgpu_device pointer
1150  *
1151  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
1152  * to fail, but if any of the BARs is not accessible after the size we abort
1153  * driver loading by returning -ENODEV.
1154  */
1155 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
1156 {
1157         int rbar_size = pci_rebar_bytes_to_size(adev->gmc.real_vram_size);
1158         struct pci_bus *root;
1159         struct resource *res;
1160         unsigned i;
1161         u16 cmd;
1162         int r;
1163
1164         if (!IS_ENABLED(CONFIG_PHYS_ADDR_T_64BIT))
1165                 return 0;
1166
1167         /* Bypass for VF */
1168         if (amdgpu_sriov_vf(adev))
1169                 return 0;
1170
1171         /* skip if the bios has already enabled large BAR */
1172         if (adev->gmc.real_vram_size &&
1173             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
1174                 return 0;
1175
1176         /* Check if the root BUS has 64bit memory resources */
1177         root = adev->pdev->bus;
1178         while (root->parent)
1179                 root = root->parent;
1180
1181         pci_bus_for_each_resource(root, res, i) {
1182                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
1183                     res->start > 0x100000000ull)
1184                         break;
1185         }
1186
1187         /* Trying to resize is pointless without a root hub window above 4GB */
1188         if (!res)
1189                 return 0;
1190
1191         /* Limit the BAR size to what is available */
1192         rbar_size = min(fls(pci_rebar_get_possible_sizes(adev->pdev, 0)) - 1,
1193                         rbar_size);
1194
1195         /* Disable memory decoding while we change the BAR addresses and size */
1196         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
1197         pci_write_config_word(adev->pdev, PCI_COMMAND,
1198                               cmd & ~PCI_COMMAND_MEMORY);
1199
1200         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
1201         amdgpu_doorbell_fini(adev);
1202         if (adev->asic_type >= CHIP_BONAIRE)
1203                 pci_release_resource(adev->pdev, 2);
1204
1205         pci_release_resource(adev->pdev, 0);
1206
1207         r = pci_resize_resource(adev->pdev, 0, rbar_size);
1208         if (r == -ENOSPC)
1209                 DRM_INFO("Not enough PCI address space for a large BAR.");
1210         else if (r && r != -ENOTSUPP)
1211                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
1212
1213         pci_assign_unassigned_bus_resources(adev->pdev->bus);
1214
1215         /* When the doorbell or fb BAR isn't available we have no chance of
1216          * using the device.
1217          */
1218         r = amdgpu_doorbell_init(adev);
1219         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
1220                 return -ENODEV;
1221
1222         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
1223
1224         return 0;
1225 }
1226
1227 static bool amdgpu_device_read_bios(struct amdgpu_device *adev)
1228 {
1229         if (hweight32(adev->aid_mask) && (adev->flags & AMD_IS_APU)) {
1230                 return false;
1231         }
1232
1233         return true;
1234 }
1235
/*
 * GPU helper functions.
 */
1239 /**
1240  * amdgpu_device_need_post - check if the hw need post or not
1241  *
1242  * @adev: amdgpu_device pointer
1243  *
1244  * Check if the asic has been initialized (all asics) at driver startup
1245  * or post is needed if  hw reset is performed.
1246  * Returns true if need or false if not.
1247  */
1248 bool amdgpu_device_need_post(struct amdgpu_device *adev)
1249 {
1250         uint32_t reg;
1251
1252         if (amdgpu_sriov_vf(adev))
1253                 return false;
1254
1255         if (!amdgpu_device_read_bios(adev))
1256                 return false;
1257
1258         if (amdgpu_passthrough(adev)) {
1259                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
1260                  * some old smc fw still need driver do vPost otherwise gpu hang, while
1261                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1262                  * vpost executed for smc version below 22.15
1263                  */
1264                 if (adev->asic_type == CHIP_FIJI) {
1265                         int err;
1266                         uint32_t fw_ver;
1267                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1268                         /* force vPost if error occured */
1269                         if (err)
1270                                 return true;
1271
1272                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1273                         if (fw_ver < 0x00160e00)
1274                                 return true;
1275                 }
1276         }
1277
1278         /* Don't post if we need to reset whole hive on init */
1279         if (adev->gmc.xgmi.pending_reset)
1280                 return false;
1281
1282         if (adev->has_hw_reset) {
1283                 adev->has_hw_reset = false;
1284                 return true;
1285         }
1286
1287         /* bios scratch used on CIK+ */
1288         if (adev->asic_type >= CHIP_BONAIRE)
1289                 return amdgpu_atombios_scratch_need_asic_init(adev);
1290
1291         /* check MEM_SIZE for older asics */
1292         reg = amdgpu_asic_get_config_memsize(adev);
1293
1294         if ((reg != 0) && (reg != 0xffffffff))
1295                 return false;
1296
1297         return true;
1298 }
1299
1300 /*
1301  * Intel hosts such as Raptor Lake and Sapphire Rapids don't support dynamic
1302  * speed switching. Until we have confirmation from Intel that a specific host
1303  * supports it, it's safer that we keep it disabled for all.
1304  *
1305  * https://edc.intel.com/content/www/us/en/design/products/platforms/details/raptor-lake-s/13th-generation-core-processors-datasheet-volume-1-of-2/005/pci-express-support/
1306  * https://gitlab.freedesktop.org/drm/amd/-/issues/2663
1307  */
1308 bool amdgpu_device_pcie_dynamic_switching_supported(void)
1309 {
1310 #if IS_ENABLED(CONFIG_X86)
1311         struct cpuinfo_x86 *c = &cpu_data(0);
1312
1313         if (c->x86_vendor == X86_VENDOR_INTEL)
1314                 return false;
1315 #endif
1316         return true;
1317 }
1318
1319 /**
1320  * amdgpu_device_should_use_aspm - check if the device should program ASPM
1321  *
1322  * @adev: amdgpu_device pointer
1323  *
1324  * Confirm whether the module parameter and pcie bridge agree that ASPM should
1325  * be set for this device.
1326  *
1327  * Returns true if it should be used or false if not.
1328  */
1329 bool amdgpu_device_should_use_aspm(struct amdgpu_device *adev)
1330 {
1331         switch (amdgpu_aspm) {
1332         case -1:
1333                 break;
1334         case 0:
1335                 return false;
1336         case 1:
1337                 return true;
1338         default:
1339                 return false;
1340         }
1341         return pcie_aspm_enabled(adev->pdev);
1342 }
1343
1344 bool amdgpu_device_aspm_support_quirk(void)
1345 {
1346 #if IS_ENABLED(CONFIG_X86)
1347         struct cpuinfo_x86 *c = &cpu_data(0);
1348
1349         return !(c->x86 == 6 && c->x86_model == INTEL_FAM6_ALDERLAKE);
1350 #else
1351         return true;
1352 #endif
1353 }
1354
1355 /* if we get transitioned to only one device, take VGA back */
1356 /**
1357  * amdgpu_device_vga_set_decode - enable/disable vga decode
1358  *
1359  * @pdev: PCI device pointer
1360  * @state: enable/disable vga decode
1361  *
1362  * Enable/disable vga decode (all asics).
1363  * Returns VGA resource flags.
1364  */
1365 static unsigned int amdgpu_device_vga_set_decode(struct pci_dev *pdev,
1366                 bool state)
1367 {
1368         struct amdgpu_device *adev = drm_to_adev(pci_get_drvdata(pdev));
1369         amdgpu_asic_set_vga_state(adev, state);
1370         if (state)
1371                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1372                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1373         else
1374                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1375 }
1376
1377 /**
1378  * amdgpu_device_check_block_size - validate the vm block size
1379  *
1380  * @adev: amdgpu_device pointer
1381  *
1382  * Validates the vm block size specified via module parameter.
1383  * The vm block size defines number of bits in page table versus page directory,
1384  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1385  * page table and the remaining bits are in the page directory.
1386  */
1387 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1388 {
1389         /* defines number of bits in page table versus page directory,
1390          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1391          * page table and the remaining bits are in the page directory */
1392         if (amdgpu_vm_block_size == -1)
1393                 return;
1394
1395         if (amdgpu_vm_block_size < 9) {
1396                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1397                          amdgpu_vm_block_size);
1398                 amdgpu_vm_block_size = -1;
1399         }
1400 }
1401
1402 /**
1403  * amdgpu_device_check_vm_size - validate the vm size
1404  *
1405  * @adev: amdgpu_device pointer
1406  *
1407  * Validates the vm size in GB specified via module parameter.
1408  * The VM size is the size of the GPU virtual memory space in GB.
1409  */
1410 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1411 {
1412         /* no need to check the default value */
1413         if (amdgpu_vm_size == -1)
1414                 return;
1415
1416         if (amdgpu_vm_size < 1) {
1417                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1418                          amdgpu_vm_size);
1419                 amdgpu_vm_size = -1;
1420         }
1421 }
1422
1423 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1424 {
1425         struct sysinfo si;
1426         bool is_os_64 = (sizeof(void *) == 8);
1427         uint64_t total_memory;
1428         uint64_t dram_size_seven_GB = 0x1B8000000;
1429         uint64_t dram_size_three_GB = 0xB8000000;
1430
1431         if (amdgpu_smu_memory_pool_size == 0)
1432                 return;
1433
1434         if (!is_os_64) {
1435                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1436                 goto def_value;
1437         }
1438         si_meminfo(&si);
1439         total_memory = (uint64_t)si.totalram * si.mem_unit;
1440
1441         if ((amdgpu_smu_memory_pool_size == 1) ||
1442                 (amdgpu_smu_memory_pool_size == 2)) {
1443                 if (total_memory < dram_size_three_GB)
1444                         goto def_value1;
1445         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1446                 (amdgpu_smu_memory_pool_size == 8)) {
1447                 if (total_memory < dram_size_seven_GB)
1448                         goto def_value1;
1449         } else {
1450                 DRM_WARN("Smu memory pool size not supported\n");
1451                 goto def_value;
1452         }
1453         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1454
1455         return;
1456
1457 def_value1:
1458         DRM_WARN("No enough system memory\n");
1459 def_value:
1460         adev->pm.smu_prv_buffer_size = 0;
1461 }
1462
1463 static int amdgpu_device_init_apu_flags(struct amdgpu_device *adev)
1464 {
1465         if (!(adev->flags & AMD_IS_APU) ||
1466             adev->asic_type < CHIP_RAVEN)
1467                 return 0;
1468
1469         switch (adev->asic_type) {
1470         case CHIP_RAVEN:
1471                 if (adev->pdev->device == 0x15dd)
1472                         adev->apu_flags |= AMD_APU_IS_RAVEN;
1473                 if (adev->pdev->device == 0x15d8)
1474                         adev->apu_flags |= AMD_APU_IS_PICASSO;
1475                 break;
1476         case CHIP_RENOIR:
1477                 if ((adev->pdev->device == 0x1636) ||
1478                     (adev->pdev->device == 0x164c))
1479                         adev->apu_flags |= AMD_APU_IS_RENOIR;
1480                 else
1481                         adev->apu_flags |= AMD_APU_IS_GREEN_SARDINE;
1482                 break;
1483         case CHIP_VANGOGH:
1484                 adev->apu_flags |= AMD_APU_IS_VANGOGH;
1485                 break;
1486         case CHIP_YELLOW_CARP:
1487                 break;
1488         case CHIP_CYAN_SKILLFISH:
1489                 if ((adev->pdev->device == 0x13FE) ||
1490                     (adev->pdev->device == 0x143F))
1491                         adev->apu_flags |= AMD_APU_IS_CYAN_SKILLFISH2;
1492                 break;
1493         default:
1494                 break;
1495         }
1496
1497         return 0;
1498 }
1499
1500 /**
1501  * amdgpu_device_check_arguments - validate module params
1502  *
1503  * @adev: amdgpu_device pointer
1504  *
1505  * Validates certain module parameters and updates
1506  * the associated values used by the driver (all asics).
1507  */
1508 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1509 {
1510         if (amdgpu_sched_jobs < 4) {
1511                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1512                          amdgpu_sched_jobs);
1513                 amdgpu_sched_jobs = 4;
1514         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1515                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1516                          amdgpu_sched_jobs);
1517                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1518         }
1519
1520         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1521                 /* gart size must be greater or equal to 32M */
1522                 dev_warn(adev->dev, "gart size (%d) too small\n",
1523                          amdgpu_gart_size);
1524                 amdgpu_gart_size = -1;
1525         }
1526
1527         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1528                 /* gtt size must be greater or equal to 32M */
1529                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1530                                  amdgpu_gtt_size);
1531                 amdgpu_gtt_size = -1;
1532         }
1533
1534         /* valid range is between 4 and 9 inclusive */
1535         if (amdgpu_vm_fragment_size != -1 &&
1536             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1537                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1538                 amdgpu_vm_fragment_size = -1;
1539         }
1540
1541         if (amdgpu_sched_hw_submission < 2) {
1542                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1543                          amdgpu_sched_hw_submission);
1544                 amdgpu_sched_hw_submission = 2;
1545         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1546                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1547                          amdgpu_sched_hw_submission);
1548                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1549         }
1550
1551         if (amdgpu_reset_method < -1 || amdgpu_reset_method > 4) {
1552                 dev_warn(adev->dev, "invalid option for reset method, reverting to default\n");
1553                 amdgpu_reset_method = -1;
1554         }
1555
1556         amdgpu_device_check_smu_prv_buffer_size(adev);
1557
1558         amdgpu_device_check_vm_size(adev);
1559
1560         amdgpu_device_check_block_size(adev);
1561
1562         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1563
1564         return 0;
1565 }
1566
1567 /**
1568  * amdgpu_switcheroo_set_state - set switcheroo state
1569  *
1570  * @pdev: pci dev pointer
1571  * @state: vga_switcheroo state
1572  *
1573  * Callback for the switcheroo driver.  Suspends or resumes
1574  * the asics before or after it is powered up using ACPI methods.
1575  */
1576 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev,
1577                                         enum vga_switcheroo_state state)
1578 {
1579         struct drm_device *dev = pci_get_drvdata(pdev);
1580         int r;
1581
1582         if (amdgpu_device_supports_px(dev) && state == VGA_SWITCHEROO_OFF)
1583                 return;
1584
1585         if (state == VGA_SWITCHEROO_ON) {
1586                 pr_info("switched on\n");
1587                 /* don't suspend or resume card normally */
1588                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1589
1590                 pci_set_power_state(pdev, PCI_D0);
1591                 amdgpu_device_load_pci_state(pdev);
1592                 r = pci_enable_device(pdev);
1593                 if (r)
1594                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1595                 amdgpu_device_resume(dev, true);
1596
1597                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1598         } else {
1599                 pr_info("switched off\n");
1600                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1601                 amdgpu_device_suspend(dev, true);
1602                 amdgpu_device_cache_pci_state(pdev);
1603                 /* Shut down the device */
1604                 pci_disable_device(pdev);
1605                 pci_set_power_state(pdev, PCI_D3cold);
1606                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1607         }
1608 }
1609
1610 /**
1611  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1612  *
1613  * @pdev: pci dev pointer
1614  *
1615  * Callback for the switcheroo driver.  Check of the switcheroo
1616  * state can be changed.
1617  * Returns true if the state can be changed, false if not.
1618  */
1619 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1620 {
1621         struct drm_device *dev = pci_get_drvdata(pdev);
1622
1623         /*
1624         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1625         * locking inversion with the driver load path. And the access here is
1626         * completely racy anyway. So don't bother with locking for now.
1627         */
1628         return atomic_read(&dev->open_count) == 0;
1629 }
1630
1631 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1632         .set_gpu_state = amdgpu_switcheroo_set_state,
1633         .reprobe = NULL,
1634         .can_switch = amdgpu_switcheroo_can_switch,
1635 };
1636
1637 /**
1638  * amdgpu_device_ip_set_clockgating_state - set the CG state
1639  *
1640  * @dev: amdgpu_device pointer
1641  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1642  * @state: clockgating state (gate or ungate)
1643  *
1644  * Sets the requested clockgating state for all instances of
1645  * the hardware IP specified.
1646  * Returns the error code from the last instance.
1647  */
1648 int amdgpu_device_ip_set_clockgating_state(void *dev,
1649                                            enum amd_ip_block_type block_type,
1650                                            enum amd_clockgating_state state)
1651 {
1652         struct amdgpu_device *adev = dev;
1653         int i, r = 0;
1654
1655         for (i = 0; i < adev->num_ip_blocks; i++) {
1656                 if (!adev->ip_blocks[i].status.valid)
1657                         continue;
1658                 if (adev->ip_blocks[i].version->type != block_type)
1659                         continue;
1660                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1661                         continue;
1662                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1663                         (void *)adev, state);
1664                 if (r)
1665                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1666                                   adev->ip_blocks[i].version->funcs->name, r);
1667         }
1668         return r;
1669 }
1670
1671 /**
1672  * amdgpu_device_ip_set_powergating_state - set the PG state
1673  *
1674  * @dev: amdgpu_device pointer
1675  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1676  * @state: powergating state (gate or ungate)
1677  *
1678  * Sets the requested powergating state for all instances of
1679  * the hardware IP specified.
1680  * Returns the error code from the last instance.
1681  */
1682 int amdgpu_device_ip_set_powergating_state(void *dev,
1683                                            enum amd_ip_block_type block_type,
1684                                            enum amd_powergating_state state)
1685 {
1686         struct amdgpu_device *adev = dev;
1687         int i, r = 0;
1688
1689         for (i = 0; i < adev->num_ip_blocks; i++) {
1690                 if (!adev->ip_blocks[i].status.valid)
1691                         continue;
1692                 if (adev->ip_blocks[i].version->type != block_type)
1693                         continue;
1694                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1695                         continue;
1696                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1697                         (void *)adev, state);
1698                 if (r)
1699                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1700                                   adev->ip_blocks[i].version->funcs->name, r);
1701         }
1702         return r;
1703 }
1704
1705 /**
1706  * amdgpu_device_ip_get_clockgating_state - get the CG state
1707  *
1708  * @adev: amdgpu_device pointer
1709  * @flags: clockgating feature flags
1710  *
1711  * Walks the list of IPs on the device and updates the clockgating
1712  * flags for each IP.
1713  * Updates @flags with the feature flags for each hardware IP where
1714  * clockgating is enabled.
1715  */
1716 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1717                                             u64 *flags)
1718 {
1719         int i;
1720
1721         for (i = 0; i < adev->num_ip_blocks; i++) {
1722                 if (!adev->ip_blocks[i].status.valid)
1723                         continue;
1724                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1725                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1726         }
1727 }
1728
1729 /**
1730  * amdgpu_device_ip_wait_for_idle - wait for idle
1731  *
1732  * @adev: amdgpu_device pointer
1733  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1734  *
1735  * Waits for the request hardware IP to be idle.
1736  * Returns 0 for success or a negative error code on failure.
1737  */
1738 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1739                                    enum amd_ip_block_type block_type)
1740 {
1741         int i, r;
1742
1743         for (i = 0; i < adev->num_ip_blocks; i++) {
1744                 if (!adev->ip_blocks[i].status.valid)
1745                         continue;
1746                 if (adev->ip_blocks[i].version->type == block_type) {
1747                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1748                         if (r)
1749                                 return r;
1750                         break;
1751                 }
1752         }
1753         return 0;
1754
1755 }
1756
1757 /**
1758  * amdgpu_device_ip_is_idle - is the hardware IP idle
1759  *
1760  * @adev: amdgpu_device pointer
1761  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1762  *
1763  * Check if the hardware IP is idle or not.
1764  * Returns true if it the IP is idle, false if not.
1765  */
1766 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1767                               enum amd_ip_block_type block_type)
1768 {
1769         int i;
1770
1771         for (i = 0; i < adev->num_ip_blocks; i++) {
1772                 if (!adev->ip_blocks[i].status.valid)
1773                         continue;
1774                 if (adev->ip_blocks[i].version->type == block_type)
1775                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1776         }
1777         return true;
1778
1779 }
1780
1781 /**
1782  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1783  *
1784  * @adev: amdgpu_device pointer
1785  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1786  *
1787  * Returns a pointer to the hardware IP block structure
1788  * if it exists for the asic, otherwise NULL.
1789  */
1790 struct amdgpu_ip_block *
1791 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1792                               enum amd_ip_block_type type)
1793 {
1794         int i;
1795
1796         for (i = 0; i < adev->num_ip_blocks; i++)
1797                 if (adev->ip_blocks[i].version->type == type)
1798                         return &adev->ip_blocks[i];
1799
1800         return NULL;
1801 }
1802
1803 /**
1804  * amdgpu_device_ip_block_version_cmp
1805  *
1806  * @adev: amdgpu_device pointer
1807  * @type: enum amd_ip_block_type
1808  * @major: major version
1809  * @minor: minor version
1810  *
1811  * return 0 if equal or greater
1812  * return 1 if smaller or the ip_block doesn't exist
1813  */
1814 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1815                                        enum amd_ip_block_type type,
1816                                        u32 major, u32 minor)
1817 {
1818         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1819
1820         if (ip_block && ((ip_block->version->major > major) ||
1821                         ((ip_block->version->major == major) &&
1822                         (ip_block->version->minor >= minor))))
1823                 return 0;
1824
1825         return 1;
1826 }
1827
1828 /**
1829  * amdgpu_device_ip_block_add
1830  *
1831  * @adev: amdgpu_device pointer
1832  * @ip_block_version: pointer to the IP to add
1833  *
1834  * Adds the IP block driver information to the collection of IPs
1835  * on the asic.
1836  */
1837 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1838                                const struct amdgpu_ip_block_version *ip_block_version)
1839 {
1840         if (!ip_block_version)
1841                 return -EINVAL;
1842
1843         switch (ip_block_version->type) {
1844         case AMD_IP_BLOCK_TYPE_VCN:
1845                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_VCN_MASK)
1846                         return 0;
1847                 break;
1848         case AMD_IP_BLOCK_TYPE_JPEG:
1849                 if (adev->harvest_ip_mask & AMD_HARVEST_IP_JPEG_MASK)
1850                         return 0;
1851                 break;
1852         default:
1853                 break;
1854         }
1855
1856         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1857                   ip_block_version->funcs->name);
1858
1859         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1860
1861         return 0;
1862 }
1863
1864 /**
1865  * amdgpu_device_enable_virtual_display - enable virtual display feature
1866  *
1867  * @adev: amdgpu_device pointer
1868  *
1869  * Enabled the virtual display feature if the user has enabled it via
1870  * the module parameter virtual_display.  This feature provides a virtual
1871  * display hardware on headless boards or in virtualized environments.
1872  * This function parses and validates the configuration string specified by
1873  * the user and configues the virtual display configuration (number of
1874  * virtual connectors, crtcs, etc.) specified.
1875  */
1876 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1877 {
1878         adev->enable_virtual_display = false;
1879
1880         if (amdgpu_virtual_display) {
1881                 const char *pci_address_name = pci_name(adev->pdev);
1882                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1883
1884                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1885                 pciaddstr_tmp = pciaddstr;
1886                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1887                         pciaddname = strsep(&pciaddname_tmp, ",");
1888                         if (!strcmp("all", pciaddname)
1889                             || !strcmp(pci_address_name, pciaddname)) {
1890                                 long num_crtc;
1891                                 int res = -1;
1892
1893                                 adev->enable_virtual_display = true;
1894
1895                                 if (pciaddname_tmp)
1896                                         res = kstrtol(pciaddname_tmp, 10,
1897                                                       &num_crtc);
1898
1899                                 if (!res) {
1900                                         if (num_crtc < 1)
1901                                                 num_crtc = 1;
1902                                         if (num_crtc > 6)
1903                                                 num_crtc = 6;
1904                                         adev->mode_info.num_crtc = num_crtc;
1905                                 } else {
1906                                         adev->mode_info.num_crtc = 1;
1907                                 }
1908                                 break;
1909                         }
1910                 }
1911
1912                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1913                          amdgpu_virtual_display, pci_address_name,
1914                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1915
1916                 kfree(pciaddstr);
1917         }
1918 }
1919
1920 void amdgpu_device_set_sriov_virtual_display(struct amdgpu_device *adev)
1921 {
1922         if (amdgpu_sriov_vf(adev) && !adev->enable_virtual_display) {
1923                 adev->mode_info.num_crtc = 1;
1924                 adev->enable_virtual_display = true;
1925                 DRM_INFO("virtual_display:%d, num_crtc:%d\n",
1926                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1927         }
1928 }
1929
1930 /**
1931  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1932  *
1933  * @adev: amdgpu_device pointer
1934  *
1935  * Parses the asic configuration parameters specified in the gpu info
1936  * firmware and makes them availale to the driver for use in configuring
1937  * the asic.
1938  * Returns 0 on success, -EINVAL on failure.
1939  */
1940 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1941 {
1942         const char *chip_name;
1943         char fw_name[40];
1944         int err;
1945         const struct gpu_info_firmware_header_v1_0 *hdr;
1946
1947         adev->firmware.gpu_info_fw = NULL;
1948
1949         if (adev->mman.discovery_bin) {
1950                 /*
1951                  * FIXME: The bounding box is still needed by Navi12, so
1952                  * temporarily read it from gpu_info firmware. Should be dropped
1953                  * when DAL no longer needs it.
1954                  */
1955                 if (adev->asic_type != CHIP_NAVI12)
1956                         return 0;
1957         }
1958
1959         switch (adev->asic_type) {
1960         default:
1961                 return 0;
1962         case CHIP_VEGA10:
1963                 chip_name = "vega10";
1964                 break;
1965         case CHIP_VEGA12:
1966                 chip_name = "vega12";
1967                 break;
1968         case CHIP_RAVEN:
1969                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1970                         chip_name = "raven2";
1971                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1972                         chip_name = "picasso";
1973                 else
1974                         chip_name = "raven";
1975                 break;
1976         case CHIP_ARCTURUS:
1977                 chip_name = "arcturus";
1978                 break;
1979         case CHIP_NAVI12:
1980                 chip_name = "navi12";
1981                 break;
1982         }
1983
1984         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1985         err = amdgpu_ucode_request(adev, &adev->firmware.gpu_info_fw, fw_name);
1986         if (err) {
1987                 dev_err(adev->dev,
1988                         "Failed to get gpu_info firmware \"%s\"\n",
1989                         fw_name);
1990                 goto out;
1991         }
1992
1993         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1994         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1995
1996         switch (hdr->version_major) {
1997         case 1:
1998         {
1999                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
2000                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
2001                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2002
2003                 /*
2004                  * Should be droped when DAL no longer needs it.
2005                  */
2006                 if (adev->asic_type == CHIP_NAVI12)
2007                         goto parse_soc_bounding_box;
2008
2009                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
2010                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
2011                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
2012                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
2013                 adev->gfx.config.max_texture_channel_caches =
2014                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
2015                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
2016                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
2017                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
2018                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
2019                 adev->gfx.config.double_offchip_lds_buf =
2020                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
2021                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
2022                 adev->gfx.cu_info.max_waves_per_simd =
2023                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
2024                 adev->gfx.cu_info.max_scratch_slots_per_cu =
2025                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
2026                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
2027                 if (hdr->version_minor >= 1) {
2028                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
2029                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
2030                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2031                         adev->gfx.config.num_sc_per_sh =
2032                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
2033                         adev->gfx.config.num_packer_per_sc =
2034                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
2035                 }
2036
2037 parse_soc_bounding_box:
2038                 /*
2039                  * soc bounding box info is not integrated in disocovery table,
2040                  * we always need to parse it from gpu info firmware if needed.
2041                  */
2042                 if (hdr->version_minor == 2) {
2043                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
2044                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
2045                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
2046                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
2047                 }
2048                 break;
2049         }
2050         default:
2051                 dev_err(adev->dev,
2052                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
2053                 err = -EINVAL;
2054                 goto out;
2055         }
2056 out:
2057         return err;
2058 }
2059
2060 /**
2061  * amdgpu_device_ip_early_init - run early init for hardware IPs
2062  *
2063  * @adev: amdgpu_device pointer
2064  *
2065  * Early initialization pass for hardware IPs.  The hardware IPs that make
2066  * up each asic are discovered each IP's early_init callback is run.  This
2067  * is the first stage in initializing the asic.
2068  * Returns 0 on success, negative error code on failure.
2069  */
2070 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
2071 {
2072         struct drm_device *dev = adev_to_drm(adev);
2073         struct pci_dev *parent;
2074         int i, r;
2075         bool total;
2076
2077         amdgpu_device_enable_virtual_display(adev);
2078
2079         if (amdgpu_sriov_vf(adev)) {
2080                 r = amdgpu_virt_request_full_gpu(adev, true);
2081                 if (r)
2082                         return r;
2083         }
2084
2085         switch (adev->asic_type) {
2086 #ifdef CONFIG_DRM_AMDGPU_SI
2087         case CHIP_VERDE:
2088         case CHIP_TAHITI:
2089         case CHIP_PITCAIRN:
2090         case CHIP_OLAND:
2091         case CHIP_HAINAN:
2092                 adev->family = AMDGPU_FAMILY_SI;
2093                 r = si_set_ip_blocks(adev);
2094                 if (r)
2095                         return r;
2096                 break;
2097 #endif
2098 #ifdef CONFIG_DRM_AMDGPU_CIK
2099         case CHIP_BONAIRE:
2100         case CHIP_HAWAII:
2101         case CHIP_KAVERI:
2102         case CHIP_KABINI:
2103         case CHIP_MULLINS:
2104                 if (adev->flags & AMD_IS_APU)
2105                         adev->family = AMDGPU_FAMILY_KV;
2106                 else
2107                         adev->family = AMDGPU_FAMILY_CI;
2108
2109                 r = cik_set_ip_blocks(adev);
2110                 if (r)
2111                         return r;
2112                 break;
2113 #endif
2114         case CHIP_TOPAZ:
2115         case CHIP_TONGA:
2116         case CHIP_FIJI:
2117         case CHIP_POLARIS10:
2118         case CHIP_POLARIS11:
2119         case CHIP_POLARIS12:
2120         case CHIP_VEGAM:
2121         case CHIP_CARRIZO:
2122         case CHIP_STONEY:
2123                 if (adev->flags & AMD_IS_APU)
2124                         adev->family = AMDGPU_FAMILY_CZ;
2125                 else
2126                         adev->family = AMDGPU_FAMILY_VI;
2127
2128                 r = vi_set_ip_blocks(adev);
2129                 if (r)
2130                         return r;
2131                 break;
2132         default:
2133                 r = amdgpu_discovery_set_ip_blocks(adev);
2134                 if (r)
2135                         return r;
2136                 break;
2137         }
2138
2139         if (amdgpu_has_atpx() &&
2140             (amdgpu_is_atpx_hybrid() ||
2141              amdgpu_has_atpx_dgpu_power_cntl()) &&
2142             ((adev->flags & AMD_IS_APU) == 0) &&
2143             !pci_is_thunderbolt_attached(to_pci_dev(dev->dev)))
2144                 adev->flags |= AMD_IS_PX;
2145
2146         if (!(adev->flags & AMD_IS_APU)) {
2147                 parent = pci_upstream_bridge(adev->pdev);
2148                 adev->has_pr3 = parent ? pci_pr3_present(parent) : false;
2149         }
2150
2151
2152         adev->pm.pp_feature = amdgpu_pp_feature_mask;
2153         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
2154                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
2155         if (amdgpu_sriov_vf(adev) && adev->asic_type == CHIP_SIENNA_CICHLID)
2156                 adev->pm.pp_feature &= ~PP_OVERDRIVE_MASK;
2157
2158         total = true;
2159         for (i = 0; i < adev->num_ip_blocks; i++) {
2160                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
2161                         DRM_WARN("disabled ip block: %d <%s>\n",
2162                                   i, adev->ip_blocks[i].version->funcs->name);
2163                         adev->ip_blocks[i].status.valid = false;
2164                 } else {
2165                         if (adev->ip_blocks[i].version->funcs->early_init) {
2166                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
2167                                 if (r == -ENOENT) {
2168                                         adev->ip_blocks[i].status.valid = false;
2169                                 } else if (r) {
2170                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
2171                                                   adev->ip_blocks[i].version->funcs->name, r);
2172                                         total = false;
2173                                 } else {
2174                                         adev->ip_blocks[i].status.valid = true;
2175                                 }
2176                         } else {
2177                                 adev->ip_blocks[i].status.valid = true;
2178                         }
2179                 }
2180                 /* get the vbios after the asic_funcs are set up */
2181                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2182                         r = amdgpu_device_parse_gpu_info_fw(adev);
2183                         if (r)
2184                                 return r;
2185
2186                         /* Read BIOS */
2187                         if (amdgpu_device_read_bios(adev)) {
2188                                 if (!amdgpu_get_bios(adev))
2189                                         return -EINVAL;
2190
2191                                 r = amdgpu_atombios_init(adev);
2192                                 if (r) {
2193                                         dev_err(adev->dev, "amdgpu_atombios_init failed\n");
2194                                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
2195                                         return r;
2196                                 }
2197                         }
2198
2199                         /*get pf2vf msg info at it's earliest time*/
2200                         if (amdgpu_sriov_vf(adev))
2201                                 amdgpu_virt_init_data_exchange(adev);
2202
2203                 }
2204         }
2205         if (!total)
2206                 return -ENODEV;
2207
2208         amdgpu_amdkfd_device_probe(adev);
2209         adev->cg_flags &= amdgpu_cg_mask;
2210         adev->pg_flags &= amdgpu_pg_mask;
2211
2212         return 0;
2213 }
2214
2215 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
2216 {
2217         int i, r;
2218
2219         for (i = 0; i < adev->num_ip_blocks; i++) {
2220                 if (!adev->ip_blocks[i].status.sw)
2221                         continue;
2222                 if (adev->ip_blocks[i].status.hw)
2223                         continue;
2224                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2225                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
2226                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2227                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2228                         if (r) {
2229                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2230                                           adev->ip_blocks[i].version->funcs->name, r);
2231                                 return r;
2232                         }
2233                         adev->ip_blocks[i].status.hw = true;
2234                 }
2235         }
2236
2237         return 0;
2238 }
2239
2240 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
2241 {
2242         int i, r;
2243
2244         for (i = 0; i < adev->num_ip_blocks; i++) {
2245                 if (!adev->ip_blocks[i].status.sw)
2246                         continue;
2247                 if (adev->ip_blocks[i].status.hw)
2248                         continue;
2249                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2250                 if (r) {
2251                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2252                                   adev->ip_blocks[i].version->funcs->name, r);
2253                         return r;
2254                 }
2255                 adev->ip_blocks[i].status.hw = true;
2256         }
2257
2258         return 0;
2259 }
2260
2261 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
2262 {
2263         int r = 0;
2264         int i;
2265         uint32_t smu_version;
2266
2267         if (adev->asic_type >= CHIP_VEGA10) {
2268                 for (i = 0; i < adev->num_ip_blocks; i++) {
2269                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
2270                                 continue;
2271
2272                         if (!adev->ip_blocks[i].status.sw)
2273                                 continue;
2274
2275                         /* no need to do the fw loading again if already done*/
2276                         if (adev->ip_blocks[i].status.hw == true)
2277                                 break;
2278
2279                         if (amdgpu_in_reset(adev) || adev->in_suspend) {
2280                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2281                                 if (r) {
2282                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2283                                                           adev->ip_blocks[i].version->funcs->name, r);
2284                                         return r;
2285                                 }
2286                         } else {
2287                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
2288                                 if (r) {
2289                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
2290                                                           adev->ip_blocks[i].version->funcs->name, r);
2291                                         return r;
2292                                 }
2293                         }
2294
2295                         adev->ip_blocks[i].status.hw = true;
2296                         break;
2297                 }
2298         }
2299
2300         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
2301                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
2302
2303         return r;
2304 }
2305
2306 static int amdgpu_device_init_schedulers(struct amdgpu_device *adev)
2307 {
2308         long timeout;
2309         int r, i;
2310
2311         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
2312                 struct amdgpu_ring *ring = adev->rings[i];
2313
2314                 /* No need to setup the GPU scheduler for rings that don't need it */
2315                 if (!ring || ring->no_scheduler)
2316                         continue;
2317
2318                 switch (ring->funcs->type) {
2319                 case AMDGPU_RING_TYPE_GFX:
2320                         timeout = adev->gfx_timeout;
2321                         break;
2322                 case AMDGPU_RING_TYPE_COMPUTE:
2323                         timeout = adev->compute_timeout;
2324                         break;
2325                 case AMDGPU_RING_TYPE_SDMA:
2326                         timeout = adev->sdma_timeout;
2327                         break;
2328                 default:
2329                         timeout = adev->video_timeout;
2330                         break;
2331                 }
2332
2333                 r = drm_sched_init(&ring->sched, &amdgpu_sched_ops,
2334                                    ring->num_hw_submission, 0,
2335                                    timeout, adev->reset_domain->wq,
2336                                    ring->sched_score, ring->name,
2337                                    adev->dev);
2338                 if (r) {
2339                         DRM_ERROR("Failed to create scheduler on ring %s.\n",
2340                                   ring->name);
2341                         return r;
2342                 }
2343         }
2344
2345         amdgpu_xcp_update_partition_sched_list(adev);
2346
2347         return 0;
2348 }
2349
2350
2351 /**
2352  * amdgpu_device_ip_init - run init for hardware IPs
2353  *
2354  * @adev: amdgpu_device pointer
2355  *
2356  * Main initialization pass for hardware IPs.  The list of all the hardware
2357  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
2358  * are run.  sw_init initializes the software state associated with each IP
2359  * and hw_init initializes the hardware associated with each IP.
2360  * Returns 0 on success, negative error code on failure.
2361  */
2362 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
2363 {
2364         int i, r;
2365
2366         r = amdgpu_ras_init(adev);
2367         if (r)
2368                 return r;
2369
2370         for (i = 0; i < adev->num_ip_blocks; i++) {
2371                 if (!adev->ip_blocks[i].status.valid)
2372                         continue;
2373                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2374                 if (r) {
2375                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2376                                   adev->ip_blocks[i].version->funcs->name, r);
2377                         goto init_failed;
2378                 }
2379                 adev->ip_blocks[i].status.sw = true;
2380
2381                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
2382                         /* need to do common hw init early so everything is set up for gmc */
2383                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2384                         if (r) {
2385                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2386                                 goto init_failed;
2387                         }
2388                         adev->ip_blocks[i].status.hw = true;
2389                 } else if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2390                         /* need to do gmc hw init early so we can allocate gpu mem */
2391                         /* Try to reserve bad pages early */
2392                         if (amdgpu_sriov_vf(adev))
2393                                 amdgpu_virt_exchange_data(adev);
2394
2395                         r = amdgpu_device_mem_scratch_init(adev);
2396                         if (r) {
2397                                 DRM_ERROR("amdgpu_mem_scratch_init failed %d\n", r);
2398                                 goto init_failed;
2399                         }
2400                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2401                         if (r) {
2402                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2403                                 goto init_failed;
2404                         }
2405                         r = amdgpu_device_wb_init(adev);
2406                         if (r) {
2407                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2408                                 goto init_failed;
2409                         }
2410                         adev->ip_blocks[i].status.hw = true;
2411
2412                         /* right after GMC hw init, we create CSA */
2413                         if (adev->gfx.mcbp) {
2414                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2415                                                                AMDGPU_GEM_DOMAIN_VRAM |
2416                                                                AMDGPU_GEM_DOMAIN_GTT,
2417                                                                AMDGPU_CSA_SIZE);
2418                                 if (r) {
2419                                         DRM_ERROR("allocate CSA failed %d\n", r);
2420                                         goto init_failed;
2421                                 }
2422                         }
2423                 }
2424         }
2425
2426         if (amdgpu_sriov_vf(adev))
2427                 amdgpu_virt_init_data_exchange(adev);
2428
2429         r = amdgpu_ib_pool_init(adev);
2430         if (r) {
2431                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2432                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2433                 goto init_failed;
2434         }
2435
2436         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2437         if (r)
2438                 goto init_failed;
2439
2440         r = amdgpu_device_ip_hw_init_phase1(adev);
2441         if (r)
2442                 goto init_failed;
2443
2444         r = amdgpu_device_fw_loading(adev);
2445         if (r)
2446                 goto init_failed;
2447
2448         r = amdgpu_device_ip_hw_init_phase2(adev);
2449         if (r)
2450                 goto init_failed;
2451
2452         /*
2453          * retired pages will be loaded from eeprom and reserved here,
2454          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2455          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2456          * for I2C communication which only true at this point.
2457          *
2458          * amdgpu_ras_recovery_init may fail, but the upper only cares the
2459          * failure from bad gpu situation and stop amdgpu init process
2460          * accordingly. For other failed cases, it will still release all
2461          * the resource and print error message, rather than returning one
2462          * negative value to upper level.
2463          *
2464          * Note: theoretically, this should be called before all vram allocations
2465          * to protect retired page from abusing
2466          */
2467         r = amdgpu_ras_recovery_init(adev);
2468         if (r)
2469                 goto init_failed;
2470
2471         /**
2472          * In case of XGMI grab extra reference for reset domain for this device
2473          */
2474         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2475                 if (amdgpu_xgmi_add_device(adev) == 0) {
2476                         if (!amdgpu_sriov_vf(adev)) {
2477                                 struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
2478
2479                                 if (WARN_ON(!hive)) {
2480                                         r = -ENOENT;
2481                                         goto init_failed;
2482                                 }
2483
2484                                 if (!hive->reset_domain ||
2485                                     !amdgpu_reset_get_reset_domain(hive->reset_domain)) {
2486                                         r = -ENOENT;
2487                                         amdgpu_put_xgmi_hive(hive);
2488                                         goto init_failed;
2489                                 }
2490
2491                                 /* Drop the early temporary reset domain we created for device */
2492                                 amdgpu_reset_put_reset_domain(adev->reset_domain);
2493                                 adev->reset_domain = hive->reset_domain;
2494                                 amdgpu_put_xgmi_hive(hive);
2495                         }
2496                 }
2497         }
2498
2499         r = amdgpu_device_init_schedulers(adev);
2500         if (r)
2501                 goto init_failed;
2502
2503         /* Don't init kfd if whole hive need to be reset during init */
2504         if (!adev->gmc.xgmi.pending_reset) {
2505                 kgd2kfd_init_zone_device(adev);
2506                 amdgpu_amdkfd_device_init(adev);
2507         }
2508
2509         amdgpu_fru_get_product_info(adev);
2510
2511 init_failed:
2512
2513         return r;
2514 }
2515
2516 /**
2517  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2518  *
2519  * @adev: amdgpu_device pointer
2520  *
2521  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2522  * this function before a GPU reset.  If the value is retained after a
2523  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2524  */
2525 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2526 {
2527         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2528 }
2529
2530 /**
2531  * amdgpu_device_check_vram_lost - check if vram is valid
2532  *
2533  * @adev: amdgpu_device pointer
2534  *
2535  * Checks the reset magic value written to the gart pointer in VRAM.
2536  * The driver calls this after a GPU reset to see if the contents of
2537  * VRAM is lost or now.
2538  * returns true if vram is lost, false if not.
2539  */
2540 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2541 {
2542         if (memcmp(adev->gart.ptr, adev->reset_magic,
2543                         AMDGPU_RESET_MAGIC_NUM))
2544                 return true;
2545
2546         if (!amdgpu_in_reset(adev))
2547                 return false;
2548
2549         /*
2550          * For all ASICs with baco/mode1 reset, the VRAM is
2551          * always assumed to be lost.
2552          */
2553         switch (amdgpu_asic_reset_method(adev)) {
2554         case AMD_RESET_METHOD_BACO:
2555         case AMD_RESET_METHOD_MODE1:
2556                 return true;
2557         default:
2558                 return false;
2559         }
2560 }
2561
2562 /**
2563  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2564  *
2565  * @adev: amdgpu_device pointer
2566  * @state: clockgating state (gate or ungate)
2567  *
2568  * The list of all the hardware IPs that make up the asic is walked and the
2569  * set_clockgating_state callbacks are run.
2570  * Late initialization pass enabling clockgating for hardware IPs.
2571  * Fini or suspend, pass disabling clockgating for hardware IPs.
2572  * Returns 0 on success, negative error code on failure.
2573  */
2574
2575 int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2576                                enum amd_clockgating_state state)
2577 {
2578         int i, j, r;
2579
2580         if (amdgpu_emu_mode == 1)
2581                 return 0;
2582
2583         for (j = 0; j < adev->num_ip_blocks; j++) {
2584                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2585                 if (!adev->ip_blocks[i].status.late_initialized)
2586                         continue;
2587                 /* skip CG for GFX, SDMA on S0ix */
2588                 if (adev->in_s0ix &&
2589                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2590                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2591                         continue;
2592                 /* skip CG for VCE/UVD, it's handled specially */
2593                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2594                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2595                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2596                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2597                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2598                         /* enable clockgating to save power */
2599                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2600                                                                                      state);
2601                         if (r) {
2602                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2603                                           adev->ip_blocks[i].version->funcs->name, r);
2604                                 return r;
2605                         }
2606                 }
2607         }
2608
2609         return 0;
2610 }
2611
2612 int amdgpu_device_set_pg_state(struct amdgpu_device *adev,
2613                                enum amd_powergating_state state)
2614 {
2615         int i, j, r;
2616
2617         if (amdgpu_emu_mode == 1)
2618                 return 0;
2619
2620         for (j = 0; j < adev->num_ip_blocks; j++) {
2621                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2622                 if (!adev->ip_blocks[i].status.late_initialized)
2623                         continue;
2624                 /* skip PG for GFX, SDMA on S0ix */
2625                 if (adev->in_s0ix &&
2626                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
2627                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
2628                         continue;
2629                 /* skip CG for VCE/UVD, it's handled specially */
2630                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2631                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2632                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2633                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2634                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2635                         /* enable powergating to save power */
2636                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2637                                                                                         state);
2638                         if (r) {
2639                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2640                                           adev->ip_blocks[i].version->funcs->name, r);
2641                                 return r;
2642                         }
2643                 }
2644         }
2645         return 0;
2646 }
2647
2648 static int amdgpu_device_enable_mgpu_fan_boost(void)
2649 {
2650         struct amdgpu_gpu_instance *gpu_ins;
2651         struct amdgpu_device *adev;
2652         int i, ret = 0;
2653
2654         mutex_lock(&mgpu_info.mutex);
2655
2656         /*
2657          * MGPU fan boost feature should be enabled
2658          * only when there are two or more dGPUs in
2659          * the system
2660          */
2661         if (mgpu_info.num_dgpu < 2)
2662                 goto out;
2663
2664         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2665                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2666                 adev = gpu_ins->adev;
2667                 if (!(adev->flags & AMD_IS_APU) &&
2668                     !gpu_ins->mgpu_fan_enabled) {
2669                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2670                         if (ret)
2671                                 break;
2672
2673                         gpu_ins->mgpu_fan_enabled = 1;
2674                 }
2675         }
2676
2677 out:
2678         mutex_unlock(&mgpu_info.mutex);
2679
2680         return ret;
2681 }
2682
2683 /**
2684  * amdgpu_device_ip_late_init - run late init for hardware IPs
2685  *
2686  * @adev: amdgpu_device pointer
2687  *
2688  * Late initialization pass for hardware IPs.  The list of all the hardware
2689  * IPs that make up the asic is walked and the late_init callbacks are run.
2690  * late_init covers any special initialization that an IP requires
2691  * after all of the have been initialized or something that needs to happen
2692  * late in the init process.
2693  * Returns 0 on success, negative error code on failure.
2694  */
2695 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2696 {
2697         struct amdgpu_gpu_instance *gpu_instance;
2698         int i = 0, r;
2699
2700         for (i = 0; i < adev->num_ip_blocks; i++) {
2701                 if (!adev->ip_blocks[i].status.hw)
2702                         continue;
2703                 if (adev->ip_blocks[i].version->funcs->late_init) {
2704                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2705                         if (r) {
2706                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2707                                           adev->ip_blocks[i].version->funcs->name, r);
2708                                 return r;
2709                         }
2710                 }
2711                 adev->ip_blocks[i].status.late_initialized = true;
2712         }
2713
2714         r = amdgpu_ras_late_init(adev);
2715         if (r) {
2716                 DRM_ERROR("amdgpu_ras_late_init failed %d", r);
2717                 return r;
2718         }
2719
2720         amdgpu_ras_set_error_query_ready(adev, true);
2721
2722         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2723         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2724
2725         amdgpu_device_fill_reset_magic(adev);
2726
2727         r = amdgpu_device_enable_mgpu_fan_boost();
2728         if (r)
2729                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2730
2731         /* For passthrough configuration on arcturus and aldebaran, enable special handling SBR */
2732         if (amdgpu_passthrough(adev) &&
2733             ((adev->asic_type == CHIP_ARCTURUS && adev->gmc.xgmi.num_physical_nodes > 1) ||
2734              adev->asic_type == CHIP_ALDEBARAN))
2735                 amdgpu_dpm_handle_passthrough_sbr(adev, true);
2736
2737         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2738                 mutex_lock(&mgpu_info.mutex);
2739
2740                 /*
2741                  * Reset device p-state to low as this was booted with high.
2742                  *
2743                  * This should be performed only after all devices from the same
2744                  * hive get initialized.
2745                  *
2746                  * However, it's unknown how many device in the hive in advance.
2747                  * As this is counted one by one during devices initializations.
2748                  *
2749                  * So, we wait for all XGMI interlinked devices initialized.
2750                  * This may bring some delays as those devices may come from
2751                  * different hives. But that should be OK.
2752                  */
2753                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2754                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2755                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2756                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2757                                         continue;
2758
2759                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2760                                                 AMDGPU_XGMI_PSTATE_MIN);
2761                                 if (r) {
2762                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2763                                         break;
2764                                 }
2765                         }
2766                 }
2767
2768                 mutex_unlock(&mgpu_info.mutex);
2769         }
2770
2771         return 0;
2772 }
2773
2774 /**
2775  * amdgpu_device_smu_fini_early - smu hw_fini wrapper
2776  *
2777  * @adev: amdgpu_device pointer
2778  *
2779  * For ASICs need to disable SMC first
2780  */
2781 static void amdgpu_device_smu_fini_early(struct amdgpu_device *adev)
2782 {
2783         int i, r;
2784
2785         if (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))
2786                 return;
2787
2788         for (i = 0; i < adev->num_ip_blocks; i++) {
2789                 if (!adev->ip_blocks[i].status.hw)
2790                         continue;
2791                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2792                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2793                         /* XXX handle errors */
2794                         if (r) {
2795                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2796                                           adev->ip_blocks[i].version->funcs->name, r);
2797                         }
2798                         adev->ip_blocks[i].status.hw = false;
2799                         break;
2800                 }
2801         }
2802 }
2803
2804 static int amdgpu_device_ip_fini_early(struct amdgpu_device *adev)
2805 {
2806         int i, r;
2807
2808         for (i = 0; i < adev->num_ip_blocks; i++) {
2809                 if (!adev->ip_blocks[i].version->funcs->early_fini)
2810                         continue;
2811
2812                 r = adev->ip_blocks[i].version->funcs->early_fini((void *)adev);
2813                 if (r) {
2814                         DRM_DEBUG("early_fini of IP block <%s> failed %d\n",
2815                                   adev->ip_blocks[i].version->funcs->name, r);
2816                 }
2817         }
2818
2819         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2820         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2821
2822         amdgpu_amdkfd_suspend(adev, false);
2823
2824         /* Workaroud for ASICs need to disable SMC first */
2825         amdgpu_device_smu_fini_early(adev);
2826
2827         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2828                 if (!adev->ip_blocks[i].status.hw)
2829                         continue;
2830
2831                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2832                 /* XXX handle errors */
2833                 if (r) {
2834                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2835                                   adev->ip_blocks[i].version->funcs->name, r);
2836                 }
2837
2838                 adev->ip_blocks[i].status.hw = false;
2839         }
2840
2841         if (amdgpu_sriov_vf(adev)) {
2842                 if (amdgpu_virt_release_full_gpu(adev, false))
2843                         DRM_ERROR("failed to release exclusive mode on fini\n");
2844         }
2845
2846         return 0;
2847 }
2848
2849 /**
2850  * amdgpu_device_ip_fini - run fini for hardware IPs
2851  *
2852  * @adev: amdgpu_device pointer
2853  *
2854  * Main teardown pass for hardware IPs.  The list of all the hardware
2855  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2856  * are run.  hw_fini tears down the hardware associated with each IP
2857  * and sw_fini tears down any software state associated with each IP.
2858  * Returns 0 on success, negative error code on failure.
2859  */
2860 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2861 {
2862         int i, r;
2863
2864         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2865                 amdgpu_virt_release_ras_err_handler_data(adev);
2866
2867         if (adev->gmc.xgmi.num_physical_nodes > 1)
2868                 amdgpu_xgmi_remove_device(adev);
2869
2870         amdgpu_amdkfd_device_fini_sw(adev);
2871
2872         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2873                 if (!adev->ip_blocks[i].status.sw)
2874                         continue;
2875
2876                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2877                         amdgpu_ucode_free_bo(adev);
2878                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2879                         amdgpu_device_wb_fini(adev);
2880                         amdgpu_device_mem_scratch_fini(adev);
2881                         amdgpu_ib_pool_fini(adev);
2882                 }
2883
2884                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2885                 /* XXX handle errors */
2886                 if (r) {
2887                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2888                                   adev->ip_blocks[i].version->funcs->name, r);
2889                 }
2890                 adev->ip_blocks[i].status.sw = false;
2891                 adev->ip_blocks[i].status.valid = false;
2892         }
2893
2894         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2895                 if (!adev->ip_blocks[i].status.late_initialized)
2896                         continue;
2897                 if (adev->ip_blocks[i].version->funcs->late_fini)
2898                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2899                 adev->ip_blocks[i].status.late_initialized = false;
2900         }
2901
2902         amdgpu_ras_fini(adev);
2903
2904         return 0;
2905 }
2906
2907 /**
2908  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2909  *
2910  * @work: work_struct.
2911  */
2912 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2913 {
2914         struct amdgpu_device *adev =
2915                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2916         int r;
2917
2918         r = amdgpu_ib_ring_tests(adev);
2919         if (r)
2920                 DRM_ERROR("ib ring test failed (%d).\n", r);
2921 }
2922
2923 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2924 {
2925         struct amdgpu_device *adev =
2926                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2927
2928         WARN_ON_ONCE(adev->gfx.gfx_off_state);
2929         WARN_ON_ONCE(adev->gfx.gfx_off_req_count);
2930
2931         if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2932                 adev->gfx.gfx_off_state = true;
2933 }
2934
2935 /**
2936  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2937  *
2938  * @adev: amdgpu_device pointer
2939  *
2940  * Main suspend function for hardware IPs.  The list of all the hardware
2941  * IPs that make up the asic is walked, clockgating is disabled and the
2942  * suspend callbacks are run.  suspend puts the hardware and software state
2943  * in each IP into a state suitable for suspend.
2944  * Returns 0 on success, negative error code on failure.
2945  */
2946 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2947 {
2948         int i, r;
2949
2950         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2951         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2952
2953         /*
2954          * Per PMFW team's suggestion, driver needs to handle gfxoff
2955          * and df cstate features disablement for gpu reset(e.g. Mode1Reset)
2956          * scenario. Add the missing df cstate disablement here.
2957          */
2958         if (amdgpu_dpm_set_df_cstate(adev, DF_CSTATE_DISALLOW))
2959                 dev_warn(adev->dev, "Failed to disallow df cstate");
2960
2961         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2962                 if (!adev->ip_blocks[i].status.valid)
2963                         continue;
2964
2965                 /* displays are handled separately */
2966                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2967                         continue;
2968
2969                 /* XXX handle errors */
2970                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2971                 /* XXX handle errors */
2972                 if (r) {
2973                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2974                                   adev->ip_blocks[i].version->funcs->name, r);
2975                         return r;
2976                 }
2977
2978                 adev->ip_blocks[i].status.hw = false;
2979         }
2980
2981         return 0;
2982 }
2983
2984 /**
2985  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2986  *
2987  * @adev: amdgpu_device pointer
2988  *
2989  * Main suspend function for hardware IPs.  The list of all the hardware
2990  * IPs that make up the asic is walked, clockgating is disabled and the
2991  * suspend callbacks are run.  suspend puts the hardware and software state
2992  * in each IP into a state suitable for suspend.
2993  * Returns 0 on success, negative error code on failure.
2994  */
2995 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2996 {
2997         int i, r;
2998
2999         if (adev->in_s0ix)
3000                 amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D3Entry);
3001
3002         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
3003                 if (!adev->ip_blocks[i].status.valid)
3004                         continue;
3005                 /* displays are handled in phase1 */
3006                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
3007                         continue;
3008                 /* PSP lost connection when err_event_athub occurs */
3009                 if (amdgpu_ras_intr_triggered() &&
3010                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3011                         adev->ip_blocks[i].status.hw = false;
3012                         continue;
3013                 }
3014
3015                 /* skip unnecessary suspend if we do not initialize them yet */
3016                 if (adev->gmc.xgmi.pending_reset &&
3017                     !(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3018                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC ||
3019                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3020                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH)) {
3021                         adev->ip_blocks[i].status.hw = false;
3022                         continue;
3023                 }
3024
3025                 /* skip suspend of gfx/mes and psp for S0ix
3026                  * gfx is in gfxoff state, so on resume it will exit gfxoff just
3027                  * like at runtime. PSP is also part of the always on hardware
3028                  * so no need to suspend it.
3029                  */
3030                 if (adev->in_s0ix &&
3031                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP ||
3032                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GFX ||
3033                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_MES))
3034                         continue;
3035
3036                 /* SDMA 5.x+ is part of GFX power domain so it's covered by GFXOFF */
3037                 if (adev->in_s0ix &&
3038                     (adev->ip_versions[SDMA0_HWIP][0] >= IP_VERSION(5, 0, 0)) &&
3039                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SDMA))
3040                         continue;
3041
3042                 /* Once swPSP provides the IMU, RLC FW binaries to TOS during cold-boot.
3043                  * These are in TMR, hence are expected to be reused by PSP-TOS to reload
3044                  * from this location and RLC Autoload automatically also gets loaded
3045                  * from here based on PMFW -> PSP message during re-init sequence.
3046                  * Therefore, the psp suspend & resume should be skipped to avoid destroy
3047                  * the TMR and reload FWs again for IMU enabled APU ASICs.
3048                  */
3049                 if (amdgpu_in_reset(adev) &&
3050                     (adev->flags & AMD_IS_APU) && adev->gfx.imu.funcs &&
3051                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3052                         continue;
3053
3054                 /* XXX handle errors */
3055                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
3056                 /* XXX handle errors */
3057                 if (r) {
3058                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
3059                                   adev->ip_blocks[i].version->funcs->name, r);
3060                 }
3061                 adev->ip_blocks[i].status.hw = false;
3062                 /* handle putting the SMC in the appropriate state */
3063                 if (!amdgpu_sriov_vf(adev)) {
3064                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
3065                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
3066                                 if (r) {
3067                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
3068                                                         adev->mp1_state, r);
3069                                         return r;
3070                                 }
3071                         }
3072                 }
3073         }
3074
3075         return 0;
3076 }
3077
3078 /**
3079  * amdgpu_device_ip_suspend - run suspend for hardware IPs
3080  *
3081  * @adev: amdgpu_device pointer
3082  *
3083  * Main suspend function for hardware IPs.  The list of all the hardware
3084  * IPs that make up the asic is walked, clockgating is disabled and the
3085  * suspend callbacks are run.  suspend puts the hardware and software state
3086  * in each IP into a state suitable for suspend.
3087  * Returns 0 on success, negative error code on failure.
3088  */
3089 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
3090 {
3091         int r;
3092
3093         if (amdgpu_sriov_vf(adev)) {
3094                 amdgpu_virt_fini_data_exchange(adev);
3095                 amdgpu_virt_request_full_gpu(adev, false);
3096         }
3097
3098         r = amdgpu_device_ip_suspend_phase1(adev);
3099         if (r)
3100                 return r;
3101         r = amdgpu_device_ip_suspend_phase2(adev);
3102
3103         if (amdgpu_sriov_vf(adev))
3104                 amdgpu_virt_release_full_gpu(adev, false);
3105
3106         return r;
3107 }
3108
3109 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
3110 {
3111         int i, r;
3112
3113         static enum amd_ip_block_type ip_order[] = {
3114                 AMD_IP_BLOCK_TYPE_COMMON,
3115                 AMD_IP_BLOCK_TYPE_GMC,
3116                 AMD_IP_BLOCK_TYPE_PSP,
3117                 AMD_IP_BLOCK_TYPE_IH,
3118         };
3119
3120         for (i = 0; i < adev->num_ip_blocks; i++) {
3121                 int j;
3122                 struct amdgpu_ip_block *block;
3123
3124                 block = &adev->ip_blocks[i];
3125                 block->status.hw = false;
3126
3127                 for (j = 0; j < ARRAY_SIZE(ip_order); j++) {
3128
3129                         if (block->version->type != ip_order[j] ||
3130                                 !block->status.valid)
3131                                 continue;
3132
3133                         r = block->version->funcs->hw_init(adev);
3134                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3135                         if (r)
3136                                 return r;
3137                         block->status.hw = true;
3138                 }
3139         }
3140
3141         return 0;
3142 }
3143
3144 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
3145 {
3146         int i, r;
3147
3148         static enum amd_ip_block_type ip_order[] = {
3149                 AMD_IP_BLOCK_TYPE_SMC,
3150                 AMD_IP_BLOCK_TYPE_DCE,
3151                 AMD_IP_BLOCK_TYPE_GFX,
3152                 AMD_IP_BLOCK_TYPE_SDMA,
3153                 AMD_IP_BLOCK_TYPE_MES,
3154                 AMD_IP_BLOCK_TYPE_UVD,
3155                 AMD_IP_BLOCK_TYPE_VCE,
3156                 AMD_IP_BLOCK_TYPE_VCN,
3157                 AMD_IP_BLOCK_TYPE_JPEG
3158         };
3159
3160         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
3161                 int j;
3162                 struct amdgpu_ip_block *block;
3163
3164                 for (j = 0; j < adev->num_ip_blocks; j++) {
3165                         block = &adev->ip_blocks[j];
3166
3167                         if (block->version->type != ip_order[i] ||
3168                                 !block->status.valid ||
3169                                 block->status.hw)
3170                                 continue;
3171
3172                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
3173                                 r = block->version->funcs->resume(adev);
3174                         else
3175                                 r = block->version->funcs->hw_init(adev);
3176
3177                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
3178                         if (r)
3179                                 return r;
3180                         block->status.hw = true;
3181                 }
3182         }
3183
3184         return 0;
3185 }
3186
3187 /**
3188  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
3189  *
3190  * @adev: amdgpu_device pointer
3191  *
3192  * First resume function for hardware IPs.  The list of all the hardware
3193  * IPs that make up the asic is walked and the resume callbacks are run for
3194  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
3195  * after a suspend and updates the software state as necessary.  This
3196  * function is also used for restoring the GPU after a GPU reset.
3197  * Returns 0 on success, negative error code on failure.
3198  */
3199 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
3200 {
3201         int i, r;
3202
3203         for (i = 0; i < adev->num_ip_blocks; i++) {
3204                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3205                         continue;
3206                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3207                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3208                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3209                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP && amdgpu_sriov_vf(adev))) {
3210
3211                         r = adev->ip_blocks[i].version->funcs->resume(adev);
3212                         if (r) {
3213                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
3214                                           adev->ip_blocks[i].version->funcs->name, r);
3215                                 return r;
3216                         }
3217                         adev->ip_blocks[i].status.hw = true;
3218                 }
3219         }
3220
3221         return 0;
3222 }
3223
3224 /**
3225  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
3226  *
3227  * @adev: amdgpu_device pointer
3228  *
3229  * First resume function for hardware IPs.  The list of all the hardware
3230  * IPs that make up the asic is walked and the resume callbacks are run for
3231  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
3232  * functional state after a suspend and updates the software state as
3233  * necessary.  This function is also used for restoring the GPU after a GPU
3234  * reset.
3235  * Returns 0 on success, negative error code on failure.
3236  */
3237 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
3238 {
3239         int i, r;
3240
3241         for (i = 0; i < adev->num_ip_blocks; i++) {
3242                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
3243                         continue;
3244                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3245                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3246                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3247                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
3248                         continue;
3249                 r = adev->ip_blocks[i].version->funcs->resume(adev);
3250                 if (r) {
3251                         DRM_ERROR("resume of IP block <%s> failed %d\n",
3252                                   adev->ip_blocks[i].version->funcs->name, r);
3253                         return r;
3254                 }
3255                 adev->ip_blocks[i].status.hw = true;
3256         }
3257
3258         return 0;
3259 }
3260
3261 /**
3262  * amdgpu_device_ip_resume - run resume for hardware IPs
3263  *
3264  * @adev: amdgpu_device pointer
3265  *
3266  * Main resume function for hardware IPs.  The hardware IPs
3267  * are split into two resume functions because they are
3268  * are also used in in recovering from a GPU reset and some additional
3269  * steps need to be take between them.  In this case (S3/S4) they are
3270  * run sequentially.
3271  * Returns 0 on success, negative error code on failure.
3272  */
3273 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
3274 {
3275         int r;
3276
3277         if (!adev->in_s0ix) {
3278                 r = amdgpu_amdkfd_resume_iommu(adev);
3279                 if (r)
3280                         return r;
3281         }
3282
3283         r = amdgpu_device_ip_resume_phase1(adev);
3284         if (r)
3285                 return r;
3286
3287         r = amdgpu_device_fw_loading(adev);
3288         if (r)
3289                 return r;
3290
3291         r = amdgpu_device_ip_resume_phase2(adev);
3292
3293         return r;
3294 }
3295
3296 /**
3297  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
3298  *
3299  * @adev: amdgpu_device pointer
3300  *
3301  * Query the VBIOS data tables to determine if the board supports SR-IOV.
3302  */
3303 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
3304 {
3305         if (amdgpu_sriov_vf(adev)) {
3306                 if (adev->is_atom_fw) {
3307                         if (amdgpu_atomfirmware_gpu_virtualization_supported(adev))
3308                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3309                 } else {
3310                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
3311                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
3312                 }
3313
3314                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
3315                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
3316         }
3317 }
3318
3319 /**
3320  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
3321  *
3322  * @asic_type: AMD asic type
3323  *
3324  * Check if there is DC (new modesetting infrastructre) support for an asic.
3325  * returns true if DC has support, false if not.
3326  */
3327 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
3328 {
3329         switch (asic_type) {
3330 #ifdef CONFIG_DRM_AMDGPU_SI
3331         case CHIP_HAINAN:
3332 #endif
3333         case CHIP_TOPAZ:
3334                 /* chips with no display hardware */
3335                 return false;
3336 #if defined(CONFIG_DRM_AMD_DC)
3337         case CHIP_TAHITI:
3338         case CHIP_PITCAIRN:
3339         case CHIP_VERDE:
3340         case CHIP_OLAND:
3341                 /*
3342                  * We have systems in the wild with these ASICs that require
3343                  * LVDS and VGA support which is not supported with DC.
3344                  *
3345                  * Fallback to the non-DC driver here by default so as not to
3346                  * cause regressions.
3347                  */
3348 #if defined(CONFIG_DRM_AMD_DC_SI)
3349                 return amdgpu_dc > 0;
3350 #else
3351                 return false;
3352 #endif
3353         case CHIP_BONAIRE:
3354         case CHIP_KAVERI:
3355         case CHIP_KABINI:
3356         case CHIP_MULLINS:
3357                 /*
3358                  * We have systems in the wild with these ASICs that require
3359                  * VGA support which is not supported with DC.
3360                  *
3361                  * Fallback to the non-DC driver here by default so as not to
3362                  * cause regressions.
3363                  */
3364                 return amdgpu_dc > 0;
3365         default:
3366                 return amdgpu_dc != 0;
3367 #else
3368         default:
3369                 if (amdgpu_dc > 0)
3370                         DRM_INFO_ONCE("Display Core has been requested via kernel parameter "
3371                                          "but isn't supported by ASIC, ignoring\n");
3372                 return false;
3373 #endif
3374         }
3375 }
3376
3377 /**
3378  * amdgpu_device_has_dc_support - check if dc is supported
3379  *
3380  * @adev: amdgpu_device pointer
3381  *
3382  * Returns true for supported, false for not supported
3383  */
3384 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
3385 {
3386         if (adev->enable_virtual_display ||
3387             (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
3388                 return false;
3389
3390         return amdgpu_device_asic_has_dc_support(adev->asic_type);
3391 }
3392
3393 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
3394 {
3395         struct amdgpu_device *adev =
3396                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
3397         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev);
3398
3399         /* It's a bug to not have a hive within this function */
3400         if (WARN_ON(!hive))
3401                 return;
3402
3403         /*
3404          * Use task barrier to synchronize all xgmi reset works across the
3405          * hive. task_barrier_enter and task_barrier_exit will block
3406          * until all the threads running the xgmi reset works reach
3407          * those points. task_barrier_full will do both blocks.
3408          */
3409         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
3410
3411                 task_barrier_enter(&hive->tb);
3412                 adev->asic_reset_res = amdgpu_device_baco_enter(adev_to_drm(adev));
3413
3414                 if (adev->asic_reset_res)
3415                         goto fail;
3416
3417                 task_barrier_exit(&hive->tb);
3418                 adev->asic_reset_res = amdgpu_device_baco_exit(adev_to_drm(adev));
3419
3420                 if (adev->asic_reset_res)
3421                         goto fail;
3422
3423                 if (adev->mmhub.ras && adev->mmhub.ras->ras_block.hw_ops &&
3424                     adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
3425                         adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(adev);
3426         } else {
3427
3428                 task_barrier_full(&hive->tb);
3429                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
3430         }
3431
3432 fail:
3433         if (adev->asic_reset_res)
3434                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
3435                          adev->asic_reset_res, adev_to_drm(adev)->unique);
3436         amdgpu_put_xgmi_hive(hive);
3437 }
3438
3439 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
3440 {
3441         char *input = amdgpu_lockup_timeout;
3442         char *timeout_setting = NULL;
3443         int index = 0;
3444         long timeout;
3445         int ret = 0;
3446
3447         /*
3448          * By default timeout for non compute jobs is 10000
3449          * and 60000 for compute jobs.
3450          * In SR-IOV or passthrough mode, timeout for compute
3451          * jobs are 60000 by default.
3452          */
3453         adev->gfx_timeout = msecs_to_jiffies(10000);
3454         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3455         if (amdgpu_sriov_vf(adev))
3456                 adev->compute_timeout = amdgpu_sriov_is_pp_one_vf(adev) ?
3457                                         msecs_to_jiffies(60000) : msecs_to_jiffies(10000);
3458         else
3459                 adev->compute_timeout =  msecs_to_jiffies(60000);
3460
3461         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3462                 while ((timeout_setting = strsep(&input, ",")) &&
3463                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
3464                         ret = kstrtol(timeout_setting, 0, &timeout);
3465                         if (ret)
3466                                 return ret;
3467
3468                         if (timeout == 0) {
3469                                 index++;
3470                                 continue;
3471                         } else if (timeout < 0) {
3472                                 timeout = MAX_SCHEDULE_TIMEOUT;
3473                                 dev_warn(adev->dev, "lockup timeout disabled");
3474                                 add_taint(TAINT_SOFTLOCKUP, LOCKDEP_STILL_OK);
3475                         } else {
3476                                 timeout = msecs_to_jiffies(timeout);
3477                         }
3478
3479                         switch (index++) {
3480                         case 0:
3481                                 adev->gfx_timeout = timeout;
3482                                 break;
3483                         case 1:
3484                                 adev->compute_timeout = timeout;
3485                                 break;
3486                         case 2:
3487                                 adev->sdma_timeout = timeout;
3488                                 break;
3489                         case 3:
3490                                 adev->video_timeout = timeout;
3491                                 break;
3492                         default:
3493                                 break;
3494                         }
3495                 }
3496                 /*
3497                  * There is only one value specified and
3498                  * it should apply to all non-compute jobs.
3499                  */
3500                 if (index == 1) {
3501                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
3502                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
3503                                 adev->compute_timeout = adev->gfx_timeout;
3504                 }
3505         }
3506
3507         return ret;
3508 }
3509
3510 /**
3511  * amdgpu_device_check_iommu_direct_map - check if RAM direct mapped to GPU
3512  *
3513  * @adev: amdgpu_device pointer
3514  *
3515  * RAM direct mapped to GPU if IOMMU is not enabled or is pass through mode
3516  */
3517 static void amdgpu_device_check_iommu_direct_map(struct amdgpu_device *adev)
3518 {
3519         struct iommu_domain *domain;
3520
3521         domain = iommu_get_domain_for_dev(adev->dev);
3522         if (!domain || domain->type == IOMMU_DOMAIN_IDENTITY)
3523                 adev->ram_is_direct_mapped = true;
3524 }
3525
/*
 * Device-level sysfs attributes.  amdgpu_device_init() hands this
 * NULL-terminated table to sysfs_create_files() for the device's kobject.
 */
3526 static const struct attribute *amdgpu_dev_attributes[] = {
3527         &dev_attr_product_name.attr,
3528         &dev_attr_product_number.attr,
3529         &dev_attr_serial_number.attr,
3530         &dev_attr_pcie_replay_count.attr,
3531         NULL
3532 };
3533
3534 static void amdgpu_device_set_mcbp(struct amdgpu_device *adev)
3535 {
3536         if (amdgpu_mcbp == 1)
3537                 adev->gfx.mcbp = true;
3538
3539         if ((adev->ip_versions[GC_HWIP][0] >= IP_VERSION(9, 0, 0)) &&
3540             (adev->ip_versions[GC_HWIP][0] < IP_VERSION(10, 0, 0)) &&
3541             adev->gfx.num_gfx_rings)
3542                 adev->gfx.mcbp = true;
3543
3544         if (amdgpu_sriov_vf(adev))
3545                 adev->gfx.mcbp = true;
3546
3547         if (adev->gfx.mcbp)
3548                 DRM_INFO("MCBP is enabled\n");
3549 }
3550
3551 /**
3552  * amdgpu_device_init - initialize the driver
3553  *
3554  * @adev: amdgpu_device pointer
3555  * @flags: driver flags
3556  *
3557  * Initializes the driver info and hw (all asics).
3558  * Returns 0 for success or an error on failure.
3559  * Called at driver startup.
3560  */
3561 int amdgpu_device_init(struct amdgpu_device *adev,
3562                        uint32_t flags)
3563 {
3564         struct drm_device *ddev = adev_to_drm(adev);
3565         struct pci_dev *pdev = adev->pdev;
3566         int r, i;
3567         bool px = false;
3568         u32 max_MBps;
3569         int tmp;
3570
3571         adev->shutdown = false;
3572         adev->flags = flags;
3573
3574         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
3575                 adev->asic_type = amdgpu_force_asic_type;
3576         else
3577                 adev->asic_type = flags & AMD_ASIC_MASK;
3578
3579         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3580         if (amdgpu_emu_mode == 1)
3581                 adev->usec_timeout *= 10;
3582         adev->gmc.gart_size = 512 * 1024 * 1024;
3583         adev->accel_working = false;
3584         adev->num_rings = 0;
3585         RCU_INIT_POINTER(adev->gang_submit, dma_fence_get_stub());
3586         adev->mman.buffer_funcs = NULL;
3587         adev->mman.buffer_funcs_ring = NULL;
3588         adev->vm_manager.vm_pte_funcs = NULL;
3589         adev->vm_manager.vm_pte_num_scheds = 0;
3590         adev->gmc.gmc_funcs = NULL;
3591         adev->harvest_ip_mask = 0x0;
3592         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3593         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3594
3595         adev->smc_rreg = &amdgpu_invalid_rreg;
3596         adev->smc_wreg = &amdgpu_invalid_wreg;
3597         adev->pcie_rreg = &amdgpu_invalid_rreg;
3598         adev->pcie_wreg = &amdgpu_invalid_wreg;
3599         adev->pcie_rreg_ext = &amdgpu_invalid_rreg_ext;
3600         adev->pcie_wreg_ext = &amdgpu_invalid_wreg_ext;
3601         adev->pciep_rreg = &amdgpu_invalid_rreg;
3602         adev->pciep_wreg = &amdgpu_invalid_wreg;
3603         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3604         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3605         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3606         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3607         adev->didt_rreg = &amdgpu_invalid_rreg;
3608         adev->didt_wreg = &amdgpu_invalid_wreg;
3609         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3610         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3611         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3612         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3613
3614         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3615                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3616                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3617
3618         /* mutex initialization are all done here so we
3619          * can recall function without having locking issues */
3620         mutex_init(&adev->firmware.mutex);
3621         mutex_init(&adev->pm.mutex);
3622         mutex_init(&adev->gfx.gpu_clock_mutex);
3623         mutex_init(&adev->srbm_mutex);
3624         mutex_init(&adev->gfx.pipe_reserve_mutex);
3625         mutex_init(&adev->gfx.gfx_off_mutex);
3626         mutex_init(&adev->gfx.partition_mutex);
3627         mutex_init(&adev->grbm_idx_mutex);
3628         mutex_init(&adev->mn_lock);
3629         mutex_init(&adev->virt.vf_errors.lock);
3630         hash_init(adev->mn_hash);
3631         mutex_init(&adev->psp.mutex);
3632         mutex_init(&adev->notifier_lock);
3633         mutex_init(&adev->pm.stable_pstate_ctx_lock);
3634         mutex_init(&adev->benchmark_mutex);
3635
3636         amdgpu_device_init_apu_flags(adev);
3637
3638         r = amdgpu_device_check_arguments(adev);
3639         if (r)
3640                 return r;
3641
3642         spin_lock_init(&adev->mmio_idx_lock);
3643         spin_lock_init(&adev->smc_idx_lock);
3644         spin_lock_init(&adev->pcie_idx_lock);
3645         spin_lock_init(&adev->uvd_ctx_idx_lock);
3646         spin_lock_init(&adev->didt_idx_lock);
3647         spin_lock_init(&adev->gc_cac_idx_lock);
3648         spin_lock_init(&adev->se_cac_idx_lock);
3649         spin_lock_init(&adev->audio_endpt_idx_lock);
3650         spin_lock_init(&adev->mm_stats.lock);
3651
3652         INIT_LIST_HEAD(&adev->shadow_list);
3653         mutex_init(&adev->shadow_list_lock);
3654
3655         INIT_LIST_HEAD(&adev->reset_list);
3656
3657         INIT_LIST_HEAD(&adev->ras_list);
3658
3659         INIT_DELAYED_WORK(&adev->delayed_init_work,
3660                           amdgpu_device_delayed_init_work_handler);
3661         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3662                           amdgpu_device_delay_enable_gfx_off);
3663
3664         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3665
3666         adev->gfx.gfx_off_req_count = 1;
3667         adev->gfx.gfx_off_residency = 0;
3668         adev->gfx.gfx_off_entrycount = 0;
3669         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3670
3671         atomic_set(&adev->throttling_logging_enabled, 1);
3672         /*
3673          * If throttling continues, logging will be performed every minute
3674          * to avoid log flooding. "-1" is subtracted since the thermal
3675          * throttling interrupt comes every second. Thus, the total logging
3676          * interval is 59 seconds(ratelimited printk interval) + 1(waiting
3677          * for throttling interrupt) = 60 seconds.
3678          */
3679         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3680         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3681
3682         /* Registers mapping */
3683         /* TODO: block userspace mapping of io register */
3684         if (adev->asic_type >= CHIP_BONAIRE) {
3685                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3686                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3687         } else {
3688                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3689                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3690         }
3691
3692         for (i = 0; i < AMD_IP_BLOCK_TYPE_NUM; i++)
3693                 atomic_set(&adev->pm.pwr_state[i], POWER_STATE_UNKNOWN);
3694
3695         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3696         if (adev->rmmio == NULL) {
3697                 return -ENOMEM;
3698         }
3699         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3700         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3701
3702         /*
3703          * Reset domain needs to be present early, before XGMI hive discovered
3704          * (if any) and initialized to use reset sem and in_gpu reset flag
3705          * early on during init and before calling to RREG32.
3706          */
3707         adev->reset_domain = amdgpu_reset_create_reset_domain(SINGLE_DEVICE, "amdgpu-reset-dev");
3708         if (!adev->reset_domain)
3709                 return -ENOMEM;
3710
3711         /* detect hw virtualization here */
3712         amdgpu_detect_virtualization(adev);
3713
3714         amdgpu_device_get_pcie_info(adev);
3715
3716         r = amdgpu_device_get_job_timeout_settings(adev);
3717         if (r) {
3718                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3719                 return r;
3720         }
3721
3722         /* early init functions */
3723         r = amdgpu_device_ip_early_init(adev);
3724         if (r)
3725                 return r;
3726
3727         amdgpu_device_set_mcbp(adev);
3728
3729         /* Get rid of things like offb */
3730         r = drm_aperture_remove_conflicting_pci_framebuffers(adev->pdev, &amdgpu_kms_driver);
3731         if (r)
3732                 return r;
3733
3734         /* Enable TMZ based on IP_VERSION */
3735         amdgpu_gmc_tmz_set(adev);
3736
3737         amdgpu_gmc_noretry_set(adev);
3738         /* Need to get xgmi info early to decide the reset behavior*/
3739         if (adev->gmc.xgmi.supported) {
3740                 r = adev->gfxhub.funcs->get_xgmi_info(adev);
3741                 if (r)
3742                         return r;
3743         }
3744
3745         /* enable PCIE atomic ops */
3746         if (amdgpu_sriov_vf(adev)) {
3747                 if (adev->virt.fw_reserve.p_pf2vf)
3748                         adev->have_atomics_support = ((struct amd_sriov_msg_pf2vf_info *)
3749                                                       adev->virt.fw_reserve.p_pf2vf)->pcie_atomic_ops_support_flags ==
3750                                 (PCI_EXP_DEVCAP2_ATOMIC_COMP32 | PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3751         /* APUs w/ gfx9 onwards don't rely on PCIe atomics; rather, an
3752          * internal path natively supports atomics, so set have_atomics_support to true.
3753          */
3754         } else if ((adev->flags & AMD_IS_APU) &&
3755                    (adev->ip_versions[GC_HWIP][0] > IP_VERSION(9, 0, 0))) {
3756                 adev->have_atomics_support = true;
3757         } else {
3758                 adev->have_atomics_support =
3759                         !pci_enable_atomic_ops_to_root(adev->pdev,
3760                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3761                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3762         }
3763
3764         if (!adev->have_atomics_support)
3765                 dev_info(adev->dev, "PCIE atomic ops is not supported\n");
3766
3767         /* doorbell bar mapping and doorbell index init*/
3768         amdgpu_doorbell_init(adev);
3769
3770         if (amdgpu_emu_mode == 1) {
3771                 /* post the asic on emulation mode */
3772                 emu_soc_asic_init(adev);
3773                 goto fence_driver_init;
3774         }
3775
3776         amdgpu_reset_init(adev);
3777
3778         /* detect if we are with an SRIOV vbios */
3779         if (adev->bios)
3780                 amdgpu_device_detect_sriov_bios(adev);
3781
3782         /* check if we need to reset the asic
3783          *  E.g., driver was not cleanly unloaded previously, etc.
3784          */
3785         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3786                 if (adev->gmc.xgmi.num_physical_nodes) {
3787                         dev_info(adev->dev, "Pending hive reset.\n");
3788                         adev->gmc.xgmi.pending_reset = true;
3789                         /* Only need to init necessary block for SMU to handle the reset */
3790                         for (i = 0; i < adev->num_ip_blocks; i++) {
3791                                 if (!adev->ip_blocks[i].status.valid)
3792                                         continue;
3793                                 if (!(adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
3794                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
3795                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
3796                                       adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC)) {
3797                                         DRM_DEBUG("IP %s disabled for hw_init.\n",
3798                                                 adev->ip_blocks[i].version->funcs->name);
3799                                         adev->ip_blocks[i].status.hw = true;
3800                                 }
3801                         }
3802                 } else {
3803                         tmp = amdgpu_reset_method;
3804                         /* It should do a default reset when loading or reloading the driver,
3805                          * regardless of the module parameter reset_method.
3806                          */
3807                         amdgpu_reset_method = AMD_RESET_METHOD_NONE;
3808                         r = amdgpu_asic_reset(adev);
3809                         amdgpu_reset_method = tmp;
3810                         if (r) {
3811                                 dev_err(adev->dev, "asic reset on init failed\n");
3812                                 goto failed;
3813                         }
3814                 }
3815         }
3816
3817         /* Post card if necessary */
3818         if (amdgpu_device_need_post(adev)) {
3819                 if (!adev->bios) {
3820                         dev_err(adev->dev, "no vBIOS found\n");
3821                         r = -EINVAL;
3822                         goto failed;
3823                 }
3824                 DRM_INFO("GPU posting now...\n");
3825                 r = amdgpu_device_asic_init(adev);
3826                 if (r) {
3827                         dev_err(adev->dev, "gpu post error!\n");
3828                         goto failed;
3829                 }
3830         }
3831
3832         if (adev->bios) {
3833                 if (adev->is_atom_fw) {
3834                         /* Initialize clocks */
3835                         r = amdgpu_atomfirmware_get_clock_info(adev);
3836                         if (r) {
3837                                 dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3838                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3839                                 goto failed;
3840                         }
3841                 } else {
3842                         /* Initialize clocks */
3843                         r = amdgpu_atombios_get_clock_info(adev);
3844                         if (r) {
3845                                 dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3846                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3847                                 goto failed;
3848                         }
3849                         /* init i2c buses */
3850                         if (!amdgpu_device_has_dc_support(adev))
3851                                 amdgpu_atombios_i2c_init(adev);
3852                 }
3853         }
3854
3855 fence_driver_init:
3856         /* Fence driver */
3857         r = amdgpu_fence_driver_sw_init(adev);
3858         if (r) {
3859                 dev_err(adev->dev, "amdgpu_fence_driver_sw_init failed\n");
3860                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3861                 goto failed;
3862         }
3863
3864         /* init the mode config */
3865         drm_mode_config_init(adev_to_drm(adev));
3866
3867         r = amdgpu_device_ip_init(adev);
3868         if (r) {
3869                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3870                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3871                 goto release_ras_con;
3872         }
3873
3874         amdgpu_fence_driver_hw_init(adev);
3875
3876         dev_info(adev->dev,
3877                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3878                         adev->gfx.config.max_shader_engines,
3879                         adev->gfx.config.max_sh_per_se,
3880                         adev->gfx.config.max_cu_per_sh,
3881                         adev->gfx.cu_info.number);
3882
3883         adev->accel_working = true;
3884
3885         amdgpu_vm_check_compute_bug(adev);
3886
3887         /* Initialize the buffer migration limit. */
3888         if (amdgpu_moverate >= 0)
3889                 max_MBps = amdgpu_moverate;
3890         else
3891                 max_MBps = 8; /* Allow 8 MB/s. */
3892         /* Get a log2 for easy divisions. */
3893         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3894
3895         r = amdgpu_atombios_sysfs_init(adev);
3896         if (r)
3897                 drm_err(&adev->ddev,
3898                         "registering atombios sysfs failed (%d).\n", r);
3899
3900         r = amdgpu_pm_sysfs_init(adev);
3901         if (r)
3902                 DRM_ERROR("registering pm sysfs failed (%d).\n", r);
3903
3904         r = amdgpu_ucode_sysfs_init(adev);
3905         if (r) {
3906                 adev->ucode_sysfs_en = false;
3907                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3908         } else
3909                 adev->ucode_sysfs_en = true;
3910
3911         /*
3912          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3913          * Otherwise the mgpu fan boost feature will be skipped due to the
3914          * gpu instance is counted less.
3915          */
3916         amdgpu_register_gpu_instance(adev);
3917
3918         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3919          * explicit gating rather than handling it automatically.
3920          */
3921         if (!adev->gmc.xgmi.pending_reset) {
3922                 r = amdgpu_device_ip_late_init(adev);
3923                 if (r) {
3924                         dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3925                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3926                         goto release_ras_con;
3927                 }
3928                 /* must succeed. */
3929                 amdgpu_ras_resume(adev);
3930                 queue_delayed_work(system_wq, &adev->delayed_init_work,
3931                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3932         }
3933
3934         if (amdgpu_sriov_vf(adev)) {
3935                 amdgpu_virt_release_full_gpu(adev, true);
3936                 flush_delayed_work(&adev->delayed_init_work);
3937         }
3938
3939         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3940         if (r)
3941                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3942
3943         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3944                 r = amdgpu_pmu_init(adev);
3945         if (r)
3946                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3947
3948         /* Have stored pci confspace at hand for restore in sudden PCI error */
3949         if (amdgpu_device_cache_pci_state(adev->pdev))
3950                 pci_restore_state(pdev);
3951
3952         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3953         /* this will fail for cards that aren't VGA class devices, just
3954          * ignore it */
3955         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
3956                 vga_client_register(adev->pdev, amdgpu_device_vga_set_decode);
3957
3958         px = amdgpu_device_supports_px(ddev);
3959
3960         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
3961                                 apple_gmux_detect(NULL, NULL)))
3962                 vga_switcheroo_register_client(adev->pdev,
3963                                                &amdgpu_switcheroo_ops, px);
3964
3965         if (px)
3966                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3967
3968         if (adev->gmc.xgmi.pending_reset)
3969                 queue_delayed_work(system_wq, &mgpu_info.delayed_reset_work,
3970                                    msecs_to_jiffies(AMDGPU_RESUME_MS));
3971
3972         amdgpu_device_check_iommu_direct_map(adev);
3973
3974         return 0;
3975
3976 release_ras_con:
3977         if (amdgpu_sriov_vf(adev))
3978                 amdgpu_virt_release_full_gpu(adev, true);
3979
3980         /* failed in exclusive mode due to timeout */
3981         if (amdgpu_sriov_vf(adev) &&
3982                 !amdgpu_sriov_runtime(adev) &&
3983                 amdgpu_virt_mmio_blocked(adev) &&
3984                 !amdgpu_virt_wait_reset(adev)) {
3985                 dev_err(adev->dev, "VF exclusive mode timeout\n");
3986                 /* Don't send request since VF is inactive. */
3987                 adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3988                 adev->virt.ops = NULL;
3989                 r = -EAGAIN;
3990         }
3991         amdgpu_release_ras_context(adev);
3992
3993 failed:
3994         amdgpu_vf_error_trans_all(adev);
3995
3996         return r;
3997 }
3998
3999 static void amdgpu_device_unmap_mmio(struct amdgpu_device *adev)
4000 {
4001
4002         /* Clear all CPU mappings pointing to this device */
4003         unmap_mapping_range(adev->ddev.anon_inode->i_mapping, 0, 0, 1);
4004
4005         /* Unmap all mapped bars - Doorbell, registers and VRAM */
4006         amdgpu_doorbell_fini(adev);
4007
4008         iounmap(adev->rmmio);
4009         adev->rmmio = NULL;
4010         if (adev->mman.aper_base_kaddr)
4011                 iounmap(adev->mman.aper_base_kaddr);
4012         adev->mman.aper_base_kaddr = NULL;
4013
4014         /* Memory manager related */
4015         if (!adev->gmc.xgmi.connected_to_cpu && !adev->gmc.is_app_apu) {
4016                 arch_phys_wc_del(adev->gmc.vram_mtrr);
4017                 arch_io_free_memtype_wc(adev->gmc.aper_base, adev->gmc.aper_size);
4018         }
4019 }
4020
4021 /**
4022  * amdgpu_device_fini_hw - tear down the driver
4023  *
4024  * @adev: amdgpu_device pointer
4025  *
4026  * Tear down the driver info (all asics).
4027  * Called at driver shutdown.
4028  */
4029 void amdgpu_device_fini_hw(struct amdgpu_device *adev)
4030 {
4031         dev_info(adev->dev, "amdgpu: finishing device.\n");
4032         flush_delayed_work(&adev->delayed_init_work);
4033         adev->shutdown = true;
4034
4035         /* make sure IB test finished before entering exclusive mode
4036          * to avoid preemption on IB test
4037          * */
4038         if (amdgpu_sriov_vf(adev)) {
4039                 amdgpu_virt_request_full_gpu(adev, false);
4040                 amdgpu_virt_fini_data_exchange(adev);
4041         }
4042
4043         /* disable all interrupts */
4044         amdgpu_irq_disable_all(adev);
4045         if (adev->mode_info.mode_config_initialized) {
4046                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(adev)))
4047                         drm_helper_force_disable_all(adev_to_drm(adev));
4048                 else
4049                         drm_atomic_helper_shutdown(adev_to_drm(adev));
4050         }
4051         amdgpu_fence_driver_hw_fini(adev);
4052
4053         if (adev->mman.initialized)
4054                 drain_workqueue(adev->mman.bdev.wq);
4055
4056         if (adev->pm.sysfs_initialized)
4057                 amdgpu_pm_sysfs_fini(adev);
4058         if (adev->ucode_sysfs_en)
4059                 amdgpu_ucode_sysfs_fini(adev);
4060         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
4061
4062         /* disable ras feature must before hw fini */
4063         amdgpu_ras_pre_fini(adev);
4064
4065         amdgpu_device_ip_fini_early(adev);
4066
4067         amdgpu_irq_fini_hw(adev);
4068
4069         if (adev->mman.initialized)
4070                 ttm_device_clear_dma_mappings(&adev->mman.bdev);
4071
4072         amdgpu_gart_dummy_page_fini(adev);
4073
4074         if (drm_dev_is_unplugged(adev_to_drm(adev)))
4075                 amdgpu_device_unmap_mmio(adev);
4076
4077 }
4078
4079 void amdgpu_device_fini_sw(struct amdgpu_device *adev)
4080 {
4081         int idx;
4082         bool px;
4083
4084         amdgpu_fence_driver_sw_fini(adev);
4085         amdgpu_device_ip_fini(adev);
4086         amdgpu_ucode_release(&adev->firmware.gpu_info_fw);
4087         adev->accel_working = false;
4088         dma_fence_put(rcu_dereference_protected(adev->gang_submit, true));
4089
4090         amdgpu_reset_fini(adev);
4091
4092         /* free i2c buses */
4093         if (!amdgpu_device_has_dc_support(adev))
4094                 amdgpu_i2c_fini(adev);
4095
4096         if (amdgpu_emu_mode != 1)
4097                 amdgpu_atombios_fini(adev);
4098
4099         kfree(adev->bios);
4100         adev->bios = NULL;
4101
4102         px = amdgpu_device_supports_px(adev_to_drm(adev));
4103
4104         if (px || (!pci_is_thunderbolt_attached(adev->pdev) &&
4105                                 apple_gmux_detect(NULL, NULL)))
4106                 vga_switcheroo_unregister_client(adev->pdev);
4107
4108         if (px)
4109                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
4110
4111         if ((adev->pdev->class >> 8) == PCI_CLASS_DISPLAY_VGA)
4112                 vga_client_unregister(adev->pdev);
4113
4114         if (drm_dev_enter(adev_to_drm(adev), &idx)) {
4115
4116                 iounmap(adev->rmmio);
4117                 adev->rmmio = NULL;
4118                 amdgpu_doorbell_fini(adev);
4119                 drm_dev_exit(idx);
4120         }
4121
4122         if (IS_ENABLED(CONFIG_PERF_EVENTS))
4123                 amdgpu_pmu_fini(adev);
4124         if (adev->mman.discovery_bin)
4125                 amdgpu_discovery_fini(adev);
4126
4127         amdgpu_reset_put_reset_domain(adev->reset_domain);
4128         adev->reset_domain = NULL;
4129
4130         kfree(adev->pci_state);
4131
4132 }
4133
4134 /**
4135  * amdgpu_device_evict_resources - evict device resources
4136  * @adev: amdgpu device object
4137  *
4138  * Evicts all ttm device resources(vram BOs, gart table) from the lru list
4139  * of the vram memory type. Mainly used for evicting device resources
4140  * at suspend time.
4141  *
4142  */
4143 static int amdgpu_device_evict_resources(struct amdgpu_device *adev)
4144 {
4145         int ret;
4146
4147         /* No need to evict vram on APUs for suspend to ram or s2idle */
4148         if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU))
4149                 return 0;
4150
4151         ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM);
4152         if (ret)
4153                 DRM_WARN("evicting device resources failed\n");
4154         return ret;
4155 }
4156
4157 /*
4158  * Suspend & resume.
4159  */
4160 /**
4161  * amdgpu_device_suspend - initiate device suspend
4162  *
4163  * @dev: drm dev pointer
4164  * @fbcon : notify the fbdev of suspend
4165  *
4166  * Puts the hw in the suspend state (all asics).
4167  * Returns 0 for success or an error on failure.
4168  * Called at driver suspend.
4169  */
4170 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
4171 {
4172         struct amdgpu_device *adev = drm_to_adev(dev);
4173         int r = 0;
4174
4175         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
4176                 return 0;
4177
4178         adev->in_suspend = true;
4179
4180         /* Evict the majority of BOs before grabbing the full access */
4181         r = amdgpu_device_evict_resources(adev);
4182         if (r)
4183                 return r;
4184
4185         if (amdgpu_sriov_vf(adev)) {
4186                 amdgpu_virt_fini_data_exchange(adev);
4187                 r = amdgpu_virt_request_full_gpu(adev, false);
4188                 if (r)
4189                         return r;
4190         }
4191
4192         if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D3))
4193                 DRM_WARN("smart shift update failed\n");
4194
4195         if (fbcon)
4196                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, true);
4197
4198         cancel_delayed_work_sync(&adev->delayed_init_work);
4199
4200         amdgpu_ras_suspend(adev);
4201
4202         amdgpu_device_ip_suspend_phase1(adev);
4203
4204         if (!adev->in_s0ix)
4205                 amdgpu_amdkfd_suspend(adev, adev->in_runpm);
4206
4207         r = amdgpu_device_evict_resources(adev);
4208         if (r)
4209                 return r;
4210
4211         amdgpu_fence_driver_hw_fini(adev);
4212
4213         amdgpu_device_ip_suspend_phase2(adev);
4214
4215         if (amdgpu_sriov_vf(adev))
4216                 amdgpu_virt_release_full_gpu(adev, false);
4217
4218         return 0;
4219 }
4220
/**
 * amdgpu_device_resume - initiate device resume
 *
 * @dev: drm dev pointer
 * @fbcon: notify the fbdev of resume
 *
 * Bring the hw back to operating state (all asics): post the card if
 * needed, resume the IP blocks, restart the fence driver hardware side,
 * run the late init, queue the delayed init work and resume KFD (except
 * in s0ix). On success the delayed init work is flushed, RAS is resumed
 * and a hotplug event is generated when the device has CRTCs.
 * Returns 0 for success or an error on failure.
 * Called at driver resume.
 */
int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
{
        struct amdgpu_device *adev = drm_to_adev(dev);
        int r = 0;

        /* Under SR-IOV the whole bring-up runs with full GPU access. */
        if (amdgpu_sriov_vf(adev)) {
                r = amdgpu_virt_request_full_gpu(adev, true);
                if (r)
                        return r;
        }

        /*
         * NOTE(review): this early return is taken after the SR-IOV request
         * above, and no matching amdgpu_virt_release_full_gpu() is issued on
         * this path - confirm the two conditions cannot both hold.
         */
        if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
                return 0;

        if (adev->in_s0ix)
                amdgpu_dpm_gfx_state_change(adev, sGpuChangeState_D0Entry);

        /* post card */
        if (amdgpu_device_need_post(adev)) {
                r = amdgpu_device_asic_init(adev);
                /* Only logged: r is overwritten by the IP resume below. */
                if (r)
                        dev_err(adev->dev, "amdgpu asic init failed\n");
        }

        r = amdgpu_device_ip_resume(adev);

        if (r) {
                dev_err(adev->dev, "amdgpu_device_ip_resume failed (%d).\n", r);
                goto exit;
        }
        amdgpu_fence_driver_hw_init(adev);

        r = amdgpu_device_ip_late_init(adev);
        if (r)
                goto exit;

        /* Queued here, flushed after the exit label on the success path. */
        queue_delayed_work(system_wq, &adev->delayed_init_work,
                           msecs_to_jiffies(AMDGPU_RESUME_MS));

        if (!adev->in_s0ix) {
                r = amdgpu_amdkfd_resume(adev, adev->in_runpm);
                if (r)
                        goto exit;
        }

        /* Reached on success and on failure alike. */
exit:
        if (amdgpu_sriov_vf(adev)) {
                amdgpu_virt_init_data_exchange(adev);
                amdgpu_virt_release_full_gpu(adev, true);
        }

        /* Everything below only runs when the bring-up succeeded. */
        if (r)
                return r;

        /* Make sure IB tests flushed */
        flush_delayed_work(&adev->delayed_init_work);

        if (fbcon)
                drm_fb_helper_set_suspend_unlocked(adev_to_drm(adev)->fb_helper, false);

        amdgpu_ras_resume(adev);

        if (adev->mode_info.num_crtc) {
                /*
                 * Most of the connector probing functions try to acquire runtime pm
                 * refs to ensure that the GPU is powered on when connector polling is
                 * performed. Since we're calling this from a runtime PM callback,
                 * trying to acquire rpm refs will cause us to deadlock.
                 *
                 * Since we're guaranteed to be holding the rpm lock, it's safe to
                 * temporarily disable the rpm helpers so this doesn't deadlock us.
                 */
#ifdef CONFIG_PM
                dev->dev->power.disable_depth++;
#endif
                /* Without DC use the HPD irq helper, with DC the KMS hotplug event. */
                if (!adev->dc_enabled)
                        drm_helper_hpd_irq_event(dev);
                else
                        drm_kms_helper_hotplug_event(dev);
#ifdef CONFIG_PM
                dev->dev->power.disable_depth--;
#endif
        }
        adev->in_suspend = false;

        if (adev->enable_mes)
                amdgpu_mes_self_test(adev);

        /* A failed smart shift update is reported but does not fail the resume. */
        if (amdgpu_acpi_smart_shift_update(dev, AMDGPU_SS_DEV_D0))
                DRM_WARN("smart shift update failed\n");

        return 0;
}
4324
4325 /**
4326  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
4327  *
4328  * @adev: amdgpu_device pointer
4329  *
4330  * The list of all the hardware IPs that make up the asic is walked and
4331  * the check_soft_reset callbacks are run.  check_soft_reset determines
4332  * if the asic is still hung or not.
4333  * Returns true if any of the IPs are still in a hung state, false if not.
4334  */
4335 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
4336 {
4337         int i;
4338         bool asic_hang = false;
4339
4340         if (amdgpu_sriov_vf(adev))
4341                 return true;
4342
4343         if (amdgpu_asic_need_full_reset(adev))
4344                 return true;
4345
4346         for (i = 0; i < adev->num_ip_blocks; i++) {
4347                 if (!adev->ip_blocks[i].status.valid)
4348                         continue;
4349                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
4350                         adev->ip_blocks[i].status.hang =
4351                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
4352                 if (adev->ip_blocks[i].status.hang) {
4353                         dev_info(adev->dev, "IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
4354                         asic_hang = true;
4355                 }
4356         }
4357         return asic_hang;
4358 }
4359
4360 /**
4361  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
4362  *
4363  * @adev: amdgpu_device pointer
4364  *
4365  * The list of all the hardware IPs that make up the asic is walked and the
4366  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
4367  * handles any IP specific hardware or software state changes that are
4368  * necessary for a soft reset to succeed.
4369  * Returns 0 on success, negative error code on failure.
4370  */
4371 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
4372 {
4373         int i, r = 0;
4374
4375         for (i = 0; i < adev->num_ip_blocks; i++) {
4376                 if (!adev->ip_blocks[i].status.valid)
4377                         continue;
4378                 if (adev->ip_blocks[i].status.hang &&
4379                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
4380                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
4381                         if (r)
4382                                 return r;
4383                 }
4384         }
4385
4386         return 0;
4387 }
4388
4389 /**
4390  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
4391  *
4392  * @adev: amdgpu_device pointer
4393  *
4394  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
4395  * reset is necessary to recover.
4396  * Returns true if a full asic reset is required, false if not.
4397  */
4398 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
4399 {
4400         int i;
4401
4402         if (amdgpu_asic_need_full_reset(adev))
4403                 return true;
4404
4405         for (i = 0; i < adev->num_ip_blocks; i++) {
4406                 if (!adev->ip_blocks[i].status.valid)
4407                         continue;
4408                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
4409                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
4410                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
4411                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
4412                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
4413                         if (adev->ip_blocks[i].status.hang) {
4414                                 dev_info(adev->dev, "Some block need full reset!\n");
4415                                 return true;
4416                         }
4417                 }
4418         }
4419         return false;
4420 }
4421
4422 /**
4423  * amdgpu_device_ip_soft_reset - do a soft reset
4424  *
4425  * @adev: amdgpu_device pointer
4426  *
4427  * The list of all the hardware IPs that make up the asic is walked and the
4428  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
4429  * IP specific hardware or software state changes that are necessary to soft
4430  * reset the IP.
4431  * Returns 0 on success, negative error code on failure.
4432  */
4433 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
4434 {
4435         int i, r = 0;
4436
4437         for (i = 0; i < adev->num_ip_blocks; i++) {
4438                 if (!adev->ip_blocks[i].status.valid)
4439                         continue;
4440                 if (adev->ip_blocks[i].status.hang &&
4441                     adev->ip_blocks[i].version->funcs->soft_reset) {
4442                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
4443                         if (r)
4444                                 return r;
4445                 }
4446         }
4447
4448         return 0;
4449 }
4450
4451 /**
4452  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
4453  *
4454  * @adev: amdgpu_device pointer
4455  *
4456  * The list of all the hardware IPs that make up the asic is walked and the
4457  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
4458  * handles any IP specific hardware or software state changes that are
4459  * necessary after the IP has been soft reset.
4460  * Returns 0 on success, negative error code on failure.
4461  */
4462 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
4463 {
4464         int i, r = 0;
4465
4466         for (i = 0; i < adev->num_ip_blocks; i++) {
4467                 if (!adev->ip_blocks[i].status.valid)
4468                         continue;
4469                 if (adev->ip_blocks[i].status.hang &&
4470                     adev->ip_blocks[i].version->funcs->post_soft_reset)
4471                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
4472                 if (r)
4473                         return r;
4474         }
4475
4476         return 0;
4477 }
4478
/**
 * amdgpu_device_recover_vram - Recover some VRAM contents
 *
 * @adev: amdgpu_device pointer
 *
 * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
 * restore things like GPUVM page tables after a GPU reset where
 * the contents of VRAM might be lost.
 *
 * The shadow list is walked under shadow_list_lock. One timeout value is
 * shared by all fence waits: each wait's return value is stored back and
 * used as the timeout of the next wait.
 *
 * Returns:
 * 0 on success, negative error code on failure.
 */
static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
{
        struct dma_fence *fence = NULL, *next = NULL;
        struct amdgpu_bo *shadow;
        struct amdgpu_bo_vm *vmbo;
        /* r starts positive so a walk that restores nothing is not a failure */
        long r = 1, tmo;

        /* 8000 ms when amdgpu_sriov_runtime() is true, 100 ms otherwise */
        if (amdgpu_sriov_runtime(adev))
                tmo = msecs_to_jiffies(8000);
        else
                tmo = msecs_to_jiffies(100);

        dev_info(adev->dev, "recover vram bo from shadow start\n");
        mutex_lock(&adev->shadow_list_lock);
        list_for_each_entry(vmbo, &adev->shadow_list, shadow_list) {
                /* If vm is compute context or adev is APU, shadow will be NULL */
                if (!vmbo->shadow)
                        continue;
                shadow = vmbo->shadow;

                /* No need to recover an evicted BO */
                if (shadow->tbo.resource->mem_type != TTM_PL_TT ||
                    shadow->tbo.resource->start == AMDGPU_BO_INVALID_OFFSET ||
                    shadow->parent->tbo.resource->mem_type != TTM_PL_VRAM)
                        continue;

                /* Issue this restore; its fence comes back in next */
                r = amdgpu_bo_restore_shadow(shadow, &next);
                if (r)
                        break;

                /*
                 * Wait for the fence of the previous restore (if any), then
                 * make the fence just obtained the one to wait for next.
                 */
                if (fence) {
                        tmo = dma_fence_wait_timeout(fence, false, tmo);
                        dma_fence_put(fence);
                        fence = next;
                        if (tmo == 0) {
                                /* a return value of 0 is treated as a timeout */
                                r = -ETIMEDOUT;
                                break;
                        } else if (tmo < 0) {
                                /* a negative return value is passed on as the error */
                                r = tmo;
                                break;
                        }
                } else {
                        fence = next;
                }
        }
        mutex_unlock(&adev->shadow_list_lock);

        /* Wait for the last outstanding fence, then drop its reference */
        if (fence)
                tmo = dma_fence_wait_timeout(fence, false, tmo);
        dma_fence_put(fence);

        /* Every failure, whatever its cause, is reported to the caller as -EIO */
        if (r < 0 || tmo <= 0) {
                dev_err(adev->dev, "recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
                return -EIO;
        }

        dev_info(adev->dev, "recover vram bo from shadow done\n");
        return 0;
}
4550
4551
4552 /**
4553  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
4554  *
4555  * @adev: amdgpu_device pointer
4556  * @from_hypervisor: request from hypervisor
4557  *
4558  * do VF FLR and reinitialize Asic
4559  * return 0 means succeeded otherwise failed
4560  */
4561 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
4562                                      bool from_hypervisor)
4563 {
4564         int r;
4565         struct amdgpu_hive_info *hive = NULL;
4566         int retry_limit = 0;
4567
4568 retry:
4569         amdgpu_amdkfd_pre_reset(adev);
4570
4571         if (from_hypervisor)
4572                 r = amdgpu_virt_request_full_gpu(adev, true);
4573         else
4574                 r = amdgpu_virt_reset_gpu(adev);
4575         if (r)
4576                 return r;
4577
4578         /* some sw clean up VF needs to do before recover */
4579         amdgpu_virt_post_reset(adev);
4580
4581         /* Resume IP prior to SMC */
4582         r = amdgpu_device_ip_reinit_early_sriov(adev);
4583         if (r)
4584                 goto error;
4585
4586         amdgpu_virt_init_data_exchange(adev);
4587
4588         r = amdgpu_device_fw_loading(adev);
4589         if (r)
4590                 return r;
4591
4592         /* now we are okay to resume SMC/CP/SDMA */
4593         r = amdgpu_device_ip_reinit_late_sriov(adev);
4594         if (r)
4595                 goto error;
4596
4597         hive = amdgpu_get_xgmi_hive(adev);
4598         /* Update PSP FW topology after reset */
4599         if (hive && adev->gmc.xgmi.num_physical_nodes > 1)
4600                 r = amdgpu_xgmi_update_topology(hive, adev);
4601
4602         if (hive)
4603                 amdgpu_put_xgmi_hive(hive);
4604
4605         if (!r) {
4606                 amdgpu_irq_gpu_reset_resume_helper(adev);
4607                 r = amdgpu_ib_ring_tests(adev);
4608
4609                 amdgpu_amdkfd_post_reset(adev);
4610         }
4611
4612 error:
4613         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
4614                 amdgpu_inc_vram_lost(adev);
4615                 r = amdgpu_device_recover_vram(adev);
4616         }
4617         amdgpu_virt_release_full_gpu(adev, true);
4618
4619         if (AMDGPU_RETRY_SRIOV_RESET(r)) {
4620                 if (retry_limit < AMDGPU_MAX_RETRY_LIMIT) {
4621                         retry_limit++;
4622                         goto retry;
4623                 } else
4624                         DRM_ERROR("GPU reset retry is beyond the retry limit\n");
4625         }
4626
4627         return r;
4628 }
4629
4630 /**
4631  * amdgpu_device_has_job_running - check if there is any job in mirror list
4632  *
4633  * @adev: amdgpu_device pointer
4634  *
4635  * check if there is any job in mirror list
4636  */
4637 bool amdgpu_device_has_job_running(struct amdgpu_device *adev)
4638 {
4639         int i;
4640         struct drm_sched_job *job;
4641
4642         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4643                 struct amdgpu_ring *ring = adev->rings[i];
4644
4645                 if (!ring || !ring->sched.thread)
4646                         continue;
4647
4648                 spin_lock(&ring->sched.job_list_lock);
4649                 job = list_first_entry_or_null(&ring->sched.pending_list,
4650                                                struct drm_sched_job, list);
4651                 spin_unlock(&ring->sched.job_list_lock);
4652                 if (job)
4653                         return true;
4654         }
4655         return false;
4656 }
4657
4658 /**
4659  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
4660  *
4661  * @adev: amdgpu_device pointer
4662  *
4663  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
4664  * a hung GPU.
4665  */
4666 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
4667 {
4668
4669         if (amdgpu_gpu_recovery == 0)
4670                 goto disabled;
4671
4672         /* Skip soft reset check in fatal error mode */
4673         if (!amdgpu_ras_is_poison_mode_supported(adev))
4674                 return true;
4675
4676         if (amdgpu_sriov_vf(adev))
4677                 return true;
4678
4679         if (amdgpu_gpu_recovery == -1) {
4680                 switch (adev->asic_type) {
4681 #ifdef CONFIG_DRM_AMDGPU_SI
4682                 case CHIP_VERDE:
4683                 case CHIP_TAHITI:
4684                 case CHIP_PITCAIRN:
4685                 case CHIP_OLAND:
4686                 case CHIP_HAINAN:
4687 #endif
4688 #ifdef CONFIG_DRM_AMDGPU_CIK
4689                 case CHIP_KAVERI:
4690                 case CHIP_KABINI:
4691                 case CHIP_MULLINS:
4692 #endif
4693                 case CHIP_CARRIZO:
4694                 case CHIP_STONEY:
4695                 case CHIP_CYAN_SKILLFISH:
4696                         goto disabled;
4697                 default:
4698                         break;
4699                 }
4700         }
4701
4702         return true;
4703
4704 disabled:
4705                 dev_info(adev->dev, "GPU recovery disabled.\n");
4706                 return false;
4707 }
4708
4709 int amdgpu_device_mode1_reset(struct amdgpu_device *adev)
4710 {
4711         u32 i;
4712         int ret = 0;
4713
4714         amdgpu_atombios_scratch_regs_engine_hung(adev, true);
4715
4716         dev_info(adev->dev, "GPU mode1 reset\n");
4717
4718         /* disable BM */
4719         pci_clear_master(adev->pdev);
4720
4721         amdgpu_device_cache_pci_state(adev->pdev);
4722
4723         if (amdgpu_dpm_is_mode1_reset_supported(adev)) {
4724                 dev_info(adev->dev, "GPU smu mode1 reset\n");
4725                 ret = amdgpu_dpm_mode1_reset(adev);
4726         } else {
4727                 dev_info(adev->dev, "GPU psp mode1 reset\n");
4728                 ret = psp_gpu_reset(adev);
4729         }
4730
4731         if (ret)
4732                 dev_err(adev->dev, "GPU mode1 reset failed\n");
4733
4734         amdgpu_device_load_pci_state(adev->pdev);
4735
4736         /* wait for asic to come out of reset */
4737         for (i = 0; i < adev->usec_timeout; i++) {
4738                 u32 memsize = adev->nbio.funcs->get_memsize(adev);
4739
4740                 if (memsize != 0xffffffff)
4741                         break;
4742                 udelay(1);
4743         }
4744
4745         amdgpu_atombios_scratch_regs_engine_hung(adev, false);
4746         return ret;
4747 }
4748
4749 int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
4750                                  struct amdgpu_reset_context *reset_context)
4751 {
4752         int i, r = 0;
4753         struct amdgpu_job *job = NULL;
4754         bool need_full_reset =
4755                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4756
4757         if (reset_context->reset_req_dev == adev)
4758                 job = reset_context->job;
4759
4760         if (amdgpu_sriov_vf(adev)) {
4761                 /* stop the data exchange thread */
4762                 amdgpu_virt_fini_data_exchange(adev);
4763         }
4764
4765         amdgpu_fence_driver_isr_toggle(adev, true);
4766
4767         /* block all schedulers and reset given job's ring */
4768         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4769                 struct amdgpu_ring *ring = adev->rings[i];
4770
4771                 if (!ring || !ring->sched.thread)
4772                         continue;
4773
4774                 /*clear job fence from fence drv to avoid force_completion
4775                  *leave NULL and vm flush fence in fence drv */
4776                 amdgpu_fence_driver_clear_job_fences(ring);
4777
4778                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
4779                 amdgpu_fence_driver_force_completion(ring);
4780         }
4781
4782         amdgpu_fence_driver_isr_toggle(adev, false);
4783
4784         if (job && job->vm)
4785                 drm_sched_increase_karma(&job->base);
4786
4787         r = amdgpu_reset_prepare_hwcontext(adev, reset_context);
4788         /* If reset handler not implemented, continue; otherwise return */
4789         if (r == -ENOSYS)
4790                 r = 0;
4791         else
4792                 return r;
4793
4794         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
4795         if (!amdgpu_sriov_vf(adev)) {
4796
4797                 if (!need_full_reset)
4798                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
4799
4800                 if (!need_full_reset && amdgpu_gpu_recovery &&
4801                     amdgpu_device_ip_check_soft_reset(adev)) {
4802                         amdgpu_device_ip_pre_soft_reset(adev);
4803                         r = amdgpu_device_ip_soft_reset(adev);
4804                         amdgpu_device_ip_post_soft_reset(adev);
4805                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4806                                 dev_info(adev->dev, "soft reset failed, will fallback to full reset!\n");
4807                                 need_full_reset = true;
4808                         }
4809                 }
4810
4811                 if (need_full_reset)
4812                         r = amdgpu_device_ip_suspend(adev);
4813                 if (need_full_reset)
4814                         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4815                 else
4816                         clear_bit(AMDGPU_NEED_FULL_RESET,
4817                                   &reset_context->flags);
4818         }
4819
4820         return r;
4821 }
4822
4823 static int amdgpu_reset_reg_dumps(struct amdgpu_device *adev)
4824 {
4825         int i;
4826
4827         lockdep_assert_held(&adev->reset_domain->sem);
4828
4829         for (i = 0; i < adev->num_regs; i++) {
4830                 adev->reset_dump_reg_value[i] = RREG32(adev->reset_dump_reg_list[i]);
4831                 trace_amdgpu_reset_reg_dumps(adev->reset_dump_reg_list[i],
4832                                              adev->reset_dump_reg_value[i]);
4833         }
4834
4835         return 0;
4836 }
4837
4838 #ifdef CONFIG_DEV_COREDUMP
4839 static ssize_t amdgpu_devcoredump_read(char *buffer, loff_t offset,
4840                 size_t count, void *data, size_t datalen)
4841 {
4842         struct drm_printer p;
4843         struct amdgpu_device *adev = data;
4844         struct drm_print_iterator iter;
4845         int i;
4846
4847         iter.data = buffer;
4848         iter.offset = 0;
4849         iter.start = offset;
4850         iter.remain = count;
4851
4852         p = drm_coredump_printer(&iter);
4853
4854         drm_printf(&p, "**** AMDGPU Device Coredump ****\n");
4855         drm_printf(&p, "kernel: " UTS_RELEASE "\n");
4856         drm_printf(&p, "module: " KBUILD_MODNAME "\n");
4857         drm_printf(&p, "time: %lld.%09ld\n", adev->reset_time.tv_sec, adev->reset_time.tv_nsec);
4858         if (adev->reset_task_info.pid)
4859                 drm_printf(&p, "process_name: %s PID: %d\n",
4860                            adev->reset_task_info.process_name,
4861                            adev->reset_task_info.pid);
4862
4863         if (adev->reset_vram_lost)
4864                 drm_printf(&p, "VRAM is lost due to GPU reset!\n");
4865         if (adev->num_regs) {
4866                 drm_printf(&p, "AMDGPU register dumps:\nOffset:     Value:\n");
4867
4868                 for (i = 0; i < adev->num_regs; i++)
4869                         drm_printf(&p, "0x%08x: 0x%08x\n",
4870                                    adev->reset_dump_reg_list[i],
4871                                    adev->reset_dump_reg_value[i]);
4872         }
4873
4874         return count - iter.remain;
4875 }
4876
/*
 * devcoredump free callback.
 *
 * @data is the amdgpu_device handed to dev_coredumpm() by
 * amdgpu_reset_capture_coredumpm(); no separate dump buffer is
 * allocated there, so there is nothing to release here.
 */
static void amdgpu_devcoredump_free(void *data)
{
        /* Intentionally empty. */
}
4880
4881 static void amdgpu_reset_capture_coredumpm(struct amdgpu_device *adev)
4882 {
4883         struct drm_device *dev = adev_to_drm(adev);
4884
4885         ktime_get_ts64(&adev->reset_time);
4886         dev_coredumpm(dev->dev, THIS_MODULE, adev, 0, GFP_KERNEL,
4887                       amdgpu_devcoredump_read, amdgpu_devcoredump_free);
4888 }
4889 #endif
4890
4891 int amdgpu_do_asic_reset(struct list_head *device_list_handle,
4892                          struct amdgpu_reset_context *reset_context)
4893 {
4894         struct amdgpu_device *tmp_adev = NULL;
4895         bool need_full_reset, skip_hw_reset, vram_lost = false;
4896         int r = 0;
4897         bool gpu_reset_for_dev_remove = 0;
4898
4899         /* Try reset handler method first */
4900         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
4901                                     reset_list);
4902         amdgpu_reset_reg_dumps(tmp_adev);
4903
4904         reset_context->reset_device_list = device_list_handle;
4905         r = amdgpu_reset_perform_reset(tmp_adev, reset_context);
4906         /* If reset handler not implemented, continue; otherwise return */
4907         if (r == -ENOSYS)
4908                 r = 0;
4909         else
4910                 return r;
4911
4912         /* Reset handler not implemented, use the default method */
4913         need_full_reset =
4914                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4915         skip_hw_reset = test_bit(AMDGPU_SKIP_HW_RESET, &reset_context->flags);
4916
4917         gpu_reset_for_dev_remove =
4918                 test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
4919                         test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
4920
4921         /*
4922          * ASIC reset has to be done on all XGMI hive nodes ASAP
4923          * to allow proper links negotiation in FW (within 1 sec)
4924          */
4925         if (!skip_hw_reset && need_full_reset) {
4926                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4927                         /* For XGMI run all resets in parallel to speed up the process */
4928                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4929                                 tmp_adev->gmc.xgmi.pending_reset = false;
4930                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4931                                         r = -EALREADY;
4932                         } else
4933                                 r = amdgpu_asic_reset(tmp_adev);
4934
4935                         if (r) {
4936                                 dev_err(tmp_adev->dev, "ASIC reset failed with error, %d for drm dev, %s",
4937                                          r, adev_to_drm(tmp_adev)->unique);
4938                                 break;
4939                         }
4940                 }
4941
4942                 /* For XGMI wait for all resets to complete before proceed */
4943                 if (!r) {
4944                         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4945                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4946                                         flush_work(&tmp_adev->xgmi_reset_work);
4947                                         r = tmp_adev->asic_reset_res;
4948                                         if (r)
4949                                                 break;
4950                                 }
4951                         }
4952                 }
4953         }
4954
4955         if (!r && amdgpu_ras_intr_triggered()) {
4956                 list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4957                         if (tmp_adev->mmhub.ras && tmp_adev->mmhub.ras->ras_block.hw_ops &&
4958                             tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count)
4959                                 tmp_adev->mmhub.ras->ras_block.hw_ops->reset_ras_error_count(tmp_adev);
4960                 }
4961
4962                 amdgpu_ras_intr_cleared();
4963         }
4964
4965         /* Since the mode1 reset affects base ip blocks, the
4966          * phase1 ip blocks need to be resumed. Otherwise there
4967          * will be a BIOS signature error and the psp bootloader
4968          * can't load kdb on the next amdgpu install.
4969          */
4970         if (gpu_reset_for_dev_remove) {
4971                 list_for_each_entry(tmp_adev, device_list_handle, reset_list)
4972                         amdgpu_device_ip_resume_phase1(tmp_adev);
4973
4974                 goto end;
4975         }
4976
4977         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
4978                 if (need_full_reset) {
4979                         /* post card */
4980                         r = amdgpu_device_asic_init(tmp_adev);
4981                         if (r) {
4982                                 dev_warn(tmp_adev->dev, "asic atom init failed!");
4983                         } else {
4984                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4985                                 r = amdgpu_amdkfd_resume_iommu(tmp_adev);
4986                                 if (r)
4987                                         goto out;
4988
4989                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4990                                 if (r)
4991                                         goto out;
4992
4993                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4994 #ifdef CONFIG_DEV_COREDUMP
4995                                 tmp_adev->reset_vram_lost = vram_lost;
4996                                 memset(&tmp_adev->reset_task_info, 0,
4997                                                 sizeof(tmp_adev->reset_task_info));
4998                                 if (reset_context->job && reset_context->job->vm)
4999                                         tmp_adev->reset_task_info =
5000                                                 reset_context->job->vm->task_info;
5001                                 amdgpu_reset_capture_coredumpm(tmp_adev);
5002 #endif
5003                                 if (vram_lost) {
5004                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
5005                                         amdgpu_inc_vram_lost(tmp_adev);
5006                                 }
5007
5008                                 r = amdgpu_device_fw_loading(tmp_adev);
5009                                 if (r)
5010                                         return r;
5011
5012                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
5013                                 if (r)
5014                                         goto out;
5015
5016                                 if (vram_lost)
5017                                         amdgpu_device_fill_reset_magic(tmp_adev);
5018
5019                                 /*
5020                                  * Add this ASIC as tracked as reset was already
5021                                  * complete successfully.
5022                                  */
5023                                 amdgpu_register_gpu_instance(tmp_adev);
5024
5025                                 if (!reset_context->hive &&
5026                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5027                                         amdgpu_xgmi_add_device(tmp_adev);
5028
5029                                 r = amdgpu_device_ip_late_init(tmp_adev);
5030                                 if (r)
5031                                         goto out;
5032
5033                                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, false);
5034
5035                                 /*
5036                                  * The GPU enters bad state once faulty pages
5037                                  * by ECC has reached the threshold, and ras
5038                                  * recovery is scheduled next. So add one check
5039                                  * here to break recovery if it indeed exceeds
5040                                  * bad page threshold, and remind user to
5041                                  * retire this GPU or setting one bigger
5042                                  * bad_page_threshold value to fix this once
5043                                  * probing driver again.
5044                                  */
5045                                 if (!amdgpu_ras_eeprom_check_err_threshold(tmp_adev)) {
5046                                         /* must succeed. */
5047                                         amdgpu_ras_resume(tmp_adev);
5048                                 } else {
5049                                         r = -EINVAL;
5050                                         goto out;
5051                                 }
5052
5053                                 /* Update PSP FW topology after reset */
5054                                 if (reset_context->hive &&
5055                                     tmp_adev->gmc.xgmi.num_physical_nodes > 1)
5056                                         r = amdgpu_xgmi_update_topology(
5057                                                 reset_context->hive, tmp_adev);
5058                         }
5059                 }
5060
5061 out:
5062                 if (!r) {
5063                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
5064                         r = amdgpu_ib_ring_tests(tmp_adev);
5065                         if (r) {
5066                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
5067                                 need_full_reset = true;
5068                                 r = -EAGAIN;
5069                                 goto end;
5070                         }
5071                 }
5072
5073                 if (!r)
5074                         r = amdgpu_device_recover_vram(tmp_adev);
5075                 else
5076                         tmp_adev->asic_reset_res = r;
5077         }
5078
5079 end:
5080         if (need_full_reset)
5081                 set_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5082         else
5083                 clear_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5084         return r;
5085 }
5086
5087 static void amdgpu_device_set_mp1_state(struct amdgpu_device *adev)
5088 {
5089
5090         switch (amdgpu_asic_reset_method(adev)) {
5091         case AMD_RESET_METHOD_MODE1:
5092                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
5093                 break;
5094         case AMD_RESET_METHOD_MODE2:
5095                 adev->mp1_state = PP_MP1_STATE_RESET;
5096                 break;
5097         default:
5098                 adev->mp1_state = PP_MP1_STATE_NONE;
5099                 break;
5100         }
5101 }
5102
5103 static void amdgpu_device_unset_mp1_state(struct amdgpu_device *adev)
5104 {
5105         amdgpu_vf_error_trans_all(adev);
5106         adev->mp1_state = PP_MP1_STATE_NONE;
5107 }
5108
5109 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
5110 {
5111         struct pci_dev *p = NULL;
5112
5113         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5114                         adev->pdev->bus->number, 1);
5115         if (p) {
5116                 pm_runtime_enable(&(p->dev));
5117                 pm_runtime_resume(&(p->dev));
5118         }
5119
5120         pci_dev_put(p);
5121 }
5122
5123 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
5124 {
5125         enum amd_reset_method reset_method;
5126         struct pci_dev *p = NULL;
5127         u64 expires;
5128
5129         /*
5130          * For now, only BACO and mode1 reset are confirmed
5131          * to suffer the audio issue without proper suspended.
5132          */
5133         reset_method = amdgpu_asic_reset_method(adev);
5134         if ((reset_method != AMD_RESET_METHOD_BACO) &&
5135              (reset_method != AMD_RESET_METHOD_MODE1))
5136                 return -EINVAL;
5137
5138         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
5139                         adev->pdev->bus->number, 1);
5140         if (!p)
5141                 return -ENODEV;
5142
5143         expires = pm_runtime_autosuspend_expiration(&(p->dev));
5144         if (!expires)
5145                 /*
5146                  * If we cannot get the audio device autosuspend delay,
5147                  * a fixed 4S interval will be used. Considering 3S is
5148                  * the audio controller default autosuspend delay setting.
5149                  * 4S used here is guaranteed to cover that.
5150                  */
5151                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
5152
5153         while (!pm_runtime_status_suspended(&(p->dev))) {
5154                 if (!pm_runtime_suspend(&(p->dev)))
5155                         break;
5156
5157                 if (expires < ktime_get_mono_fast_ns()) {
5158                         dev_warn(adev->dev, "failed to suspend display audio\n");
5159                         pci_dev_put(p);
5160                         /* TODO: abort the succeeding gpu reset? */
5161                         return -ETIMEDOUT;
5162                 }
5163         }
5164
5165         pm_runtime_disable(&(p->dev));
5166
5167         pci_dev_put(p);
5168         return 0;
5169 }
5170
5171 static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev)
5172 {
5173         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
5174
5175 #if defined(CONFIG_DEBUG_FS)
5176         if (!amdgpu_sriov_vf(adev))
5177                 cancel_work(&adev->reset_work);
5178 #endif
5179
5180         if (adev->kfd.dev)
5181                 cancel_work(&adev->kfd.reset_work);
5182
5183         if (amdgpu_sriov_vf(adev))
5184                 cancel_work(&adev->virt.flr_work);
5185
5186         if (con && adev->ras_enabled)
5187                 cancel_work(&con->recovery_work);
5188
5189 }
5190
5191 /**
5192  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
5193  *
5194  * @adev: amdgpu_device pointer
5195  * @job: which job trigger hang
5196  * @reset_context: amdgpu reset context pointer
5197  *
5198  * Attempt to reset the GPU if it has hung (all asics).
5199  * Attempt to do soft-reset or full-reset and reinitialize Asic
5200  * Returns 0 for success or an error on failure.
5201  */
5202
5203 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
5204                               struct amdgpu_job *job,
5205                               struct amdgpu_reset_context *reset_context)
5206 {
5207         struct list_head device_list, *device_list_handle =  NULL;
5208         bool job_signaled = false;
5209         struct amdgpu_hive_info *hive = NULL;
5210         struct amdgpu_device *tmp_adev = NULL;
5211         int i, r = 0;
5212         bool need_emergency_restart = false;
5213         bool audio_suspended = false;
5214         bool gpu_reset_for_dev_remove = false;
5215
5216         gpu_reset_for_dev_remove =
5217                         test_bit(AMDGPU_RESET_FOR_DEVICE_REMOVE, &reset_context->flags) &&
5218                                 test_bit(AMDGPU_NEED_FULL_RESET, &reset_context->flags);
5219
5220         /*
5221          * Special case: RAS triggered and full reset isn't supported
5222          */
5223         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
5224
5225         /*
5226          * Flush RAM to disk so that after reboot
5227          * the user can read log and see why the system rebooted.
5228          */
5229         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
5230                 DRM_WARN("Emergency reboot.");
5231
5232                 ksys_sync_helper();
5233                 emergency_restart();
5234         }
5235
5236         dev_info(adev->dev, "GPU %s begin!\n",
5237                 need_emergency_restart ? "jobs stop":"reset");
5238
5239         if (!amdgpu_sriov_vf(adev))
5240                 hive = amdgpu_get_xgmi_hive(adev);
5241         if (hive)
5242                 mutex_lock(&hive->hive_lock);
5243
5244         reset_context->job = job;
5245         reset_context->hive = hive;
5246         /*
5247          * Build list of devices to reset.
5248          * In case we are in XGMI hive mode, resort the device list
5249          * to put adev in the 1st position.
5250          */
5251         INIT_LIST_HEAD(&device_list);
5252         if (!amdgpu_sriov_vf(adev) && (adev->gmc.xgmi.num_physical_nodes > 1)) {
5253                 list_for_each_entry(tmp_adev, &hive->device_list, gmc.xgmi.head) {
5254                         list_add_tail(&tmp_adev->reset_list, &device_list);
5255                         if (gpu_reset_for_dev_remove && adev->shutdown)
5256                                 tmp_adev->shutdown = true;
5257                 }
5258                 if (!list_is_first(&adev->reset_list, &device_list))
5259                         list_rotate_to_front(&adev->reset_list, &device_list);
5260                 device_list_handle = &device_list;
5261         } else {
5262                 list_add_tail(&adev->reset_list, &device_list);
5263                 device_list_handle = &device_list;
5264         }
5265
5266         /* We need to lock reset domain only once both for XGMI and single device */
5267         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5268                                     reset_list);
5269         amdgpu_device_lock_reset_domain(tmp_adev->reset_domain);
5270
5271         /* block all schedulers and reset given job's ring */
5272         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5273
5274                 amdgpu_device_set_mp1_state(tmp_adev);
5275
5276                 /*
5277                  * Try to put the audio codec into suspend state
5278                  * before gpu reset started.
5279                  *
5280                  * Due to the power domain of the graphics device
5281                  * is shared with AZ power domain. Without this,
5282                  * we may change the audio hardware from behind
5283                  * the audio driver's back. That will trigger
5284                  * some audio codec errors.
5285                  */
5286                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
5287                         audio_suspended = true;
5288
5289                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
5290
5291                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
5292
5293                 if (!amdgpu_sriov_vf(tmp_adev))
5294                         amdgpu_amdkfd_pre_reset(tmp_adev);
5295
5296                 /*
5297                  * Mark these ASICs to be reseted as untracked first
5298                  * And add them back after reset completed
5299                  */
5300                 amdgpu_unregister_gpu_instance(tmp_adev);
5301
5302                 drm_fb_helper_set_suspend_unlocked(adev_to_drm(tmp_adev)->fb_helper, true);
5303
5304                 /* disable ras on ALL IPs */
5305                 if (!need_emergency_restart &&
5306                       amdgpu_device_ip_need_full_reset(tmp_adev))
5307                         amdgpu_ras_suspend(tmp_adev);
5308
5309                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5310                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5311
5312                         if (!ring || !ring->sched.thread)
5313                                 continue;
5314
5315                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
5316
5317                         if (need_emergency_restart)
5318                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
5319                 }
5320                 atomic_inc(&tmp_adev->gpu_reset_counter);
5321         }
5322
5323         if (need_emergency_restart)
5324                 goto skip_sched_resume;
5325
5326         /*
5327          * Must check guilty signal here since after this point all old
5328          * HW fences are force signaled.
5329          *
5330          * job->base holds a reference to parent fence
5331          */
5332         if (job && dma_fence_is_signaled(&job->hw_fence)) {
5333                 job_signaled = true;
5334                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
5335                 goto skip_hw_reset;
5336         }
5337
5338 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
5339         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5340                 if (gpu_reset_for_dev_remove) {
5341                         /* Workaroud for ASICs need to disable SMC first */
5342                         amdgpu_device_smu_fini_early(tmp_adev);
5343                 }
5344                 r = amdgpu_device_pre_asic_reset(tmp_adev, reset_context);
5345                 /*TODO Should we stop ?*/
5346                 if (r) {
5347                         dev_err(tmp_adev->dev, "GPU pre asic reset failed with err, %d for drm dev, %s ",
5348                                   r, adev_to_drm(tmp_adev)->unique);
5349                         tmp_adev->asic_reset_res = r;
5350                 }
5351
5352                 /*
5353                  * Drop all pending non scheduler resets. Scheduler resets
5354                  * were already dropped during drm_sched_stop
5355                  */
5356                 amdgpu_device_stop_pending_resets(tmp_adev);
5357         }
5358
5359         /* Actual ASIC resets if needed.*/
5360         /* Host driver will handle XGMI hive reset for SRIOV */
5361         if (amdgpu_sriov_vf(adev)) {
5362                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
5363                 if (r)
5364                         adev->asic_reset_res = r;
5365
5366                 /* Aldebaran and gfx_11_0_3 support ras in SRIOV, so need resume ras during reset */
5367                 if (adev->ip_versions[GC_HWIP][0] == IP_VERSION(9, 4, 2) ||
5368                     adev->ip_versions[GC_HWIP][0] == IP_VERSION(11, 0, 3))
5369                         amdgpu_ras_resume(adev);
5370         } else {
5371                 r = amdgpu_do_asic_reset(device_list_handle, reset_context);
5372                 if (r && r == -EAGAIN)
5373                         goto retry;
5374
5375                 if (!r && gpu_reset_for_dev_remove)
5376                         goto recover_end;
5377         }
5378
5379 skip_hw_reset:
5380
5381         /* Post ASIC reset for all devs .*/
5382         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5383
5384                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5385                         struct amdgpu_ring *ring = tmp_adev->rings[i];
5386
5387                         if (!ring || !ring->sched.thread)
5388                                 continue;
5389
5390                         drm_sched_start(&ring->sched, true);
5391                 }
5392
5393                 if (adev->enable_mes && adev->ip_versions[GC_HWIP][0] != IP_VERSION(11, 0, 3))
5394                         amdgpu_mes_self_test(tmp_adev);
5395
5396                 if (!drm_drv_uses_atomic_modeset(adev_to_drm(tmp_adev)) && !job_signaled) {
5397                         drm_helper_resume_force_mode(adev_to_drm(tmp_adev));
5398                 }
5399
5400                 if (tmp_adev->asic_reset_res)
5401                         r = tmp_adev->asic_reset_res;
5402
5403                 tmp_adev->asic_reset_res = 0;
5404
5405                 if (r) {
5406                         /* bad news, how to tell it to userspace ? */
5407                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
5408                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
5409                 } else {
5410                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
5411                         if (amdgpu_acpi_smart_shift_update(adev_to_drm(tmp_adev), AMDGPU_SS_DEV_D0))
5412                                 DRM_WARN("smart shift update failed\n");
5413                 }
5414         }
5415
5416 skip_sched_resume:
5417         list_for_each_entry(tmp_adev, device_list_handle, reset_list) {
5418                 /* unlock kfd: SRIOV would do it separately */
5419                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
5420                         amdgpu_amdkfd_post_reset(tmp_adev);
5421
5422                 /* kfd_post_reset will do nothing if kfd device is not initialized,
5423                  * need to bring up kfd here if it's not be initialized before
5424                  */
5425                 if (!adev->kfd.init_complete)
5426                         amdgpu_amdkfd_device_init(adev);
5427
5428                 if (audio_suspended)
5429                         amdgpu_device_resume_display_audio(tmp_adev);
5430
5431                 amdgpu_device_unset_mp1_state(tmp_adev);
5432
5433                 amdgpu_ras_set_error_query_ready(tmp_adev, true);
5434         }
5435
5436 recover_end:
5437         tmp_adev = list_first_entry(device_list_handle, struct amdgpu_device,
5438                                             reset_list);
5439         amdgpu_device_unlock_reset_domain(tmp_adev->reset_domain);
5440
5441         if (hive) {
5442                 mutex_unlock(&hive->hive_lock);
5443                 amdgpu_put_xgmi_hive(hive);
5444         }
5445
5446         if (r)
5447                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
5448
5449         atomic_set(&adev->reset_domain->reset_res, r);
5450         return r;
5451 }
5452
/**
 * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
 *
 * @adev: amdgpu_device pointer
 *
 * Fetches and stores in the driver the PCIE capabilities (gen speed
 * and lanes) of the slot the device is in. Handles APUs and
 * virtualized environments where PCIE config space may not be available.
 */
5462 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
5463 {
5464         struct pci_dev *pdev;
5465         enum pci_bus_speed speed_cap, platform_speed_cap;
5466         enum pcie_link_width platform_link_width;
5467
5468         if (amdgpu_pcie_gen_cap)
5469                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
5470
5471         if (amdgpu_pcie_lane_cap)
5472                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
5473
5474         /* covers APUs as well */
5475         if (pci_is_root_bus(adev->pdev->bus) && !amdgpu_passthrough(adev)) {
5476                 if (adev->pm.pcie_gen_mask == 0)
5477                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
5478                 if (adev->pm.pcie_mlw_mask == 0)
5479                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
5480                 return;
5481         }
5482
5483         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
5484                 return;
5485
5486         pcie_bandwidth_available(adev->pdev, NULL,
5487                                  &platform_speed_cap, &platform_link_width);
5488
5489         if (adev->pm.pcie_gen_mask == 0) {
5490                 /* asic caps */
5491                 pdev = adev->pdev;
5492                 speed_cap = pcie_get_speed_cap(pdev);
5493                 if (speed_cap == PCI_SPEED_UNKNOWN) {
5494                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5495                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5496                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5497                 } else {
5498                         if (speed_cap == PCIE_SPEED_32_0GT)
5499                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5500                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5501                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5502                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5503                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN5);
5504                         else if (speed_cap == PCIE_SPEED_16_0GT)
5505                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5506                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5507                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5508                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
5509                         else if (speed_cap == PCIE_SPEED_8_0GT)
5510                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5511                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5512                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
5513                         else if (speed_cap == PCIE_SPEED_5_0GT)
5514                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5515                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
5516                         else
5517                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
5518                 }
5519                 /* platform caps */
5520                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
5521                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5522                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5523                 } else {
5524                         if (platform_speed_cap == PCIE_SPEED_32_0GT)
5525                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5526                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5527                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5528                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4 |
5529                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN5);
5530                         else if (platform_speed_cap == PCIE_SPEED_16_0GT)
5531                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5532                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5533                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
5534                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
5535                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
5536                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5537                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
5538                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
5539                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
5540                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
5541                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
5542                         else
5543                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
5544
5545                 }
5546         }
5547         if (adev->pm.pcie_mlw_mask == 0) {
5548                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
5549                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
5550                 } else {
5551                         switch (platform_link_width) {
5552                         case PCIE_LNK_X32:
5553                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
5554                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5555                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5556                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5557                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5558                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5559                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5560                                 break;
5561                         case PCIE_LNK_X16:
5562                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
5563                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5564                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5565                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5566                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5567                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5568                                 break;
5569                         case PCIE_LNK_X12:
5570                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
5571                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5572                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5573                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5574                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5575                                 break;
5576                         case PCIE_LNK_X8:
5577                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
5578                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5579                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5580                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5581                                 break;
5582                         case PCIE_LNK_X4:
5583                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
5584                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5585                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5586                                 break;
5587                         case PCIE_LNK_X2:
5588                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
5589                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
5590                                 break;
5591                         case PCIE_LNK_X1:
5592                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
5593                                 break;
5594                         default:
5595                                 break;
5596                         }
5597                 }
5598         }
5599 }
5600
/**
 * amdgpu_device_is_peer_accessible - Check peer access through PCIe BAR
 *
 * @adev: amdgpu_device pointer
 * @peer_adev: amdgpu_device pointer for peer device trying to access @adev
 *
 * Return true if @peer_adev can access (DMA) @adev through the PCIe
 * BAR, i.e. @adev is "large BAR" (all of VRAM is CPU visible) and the
 * whole BAR aperture lies inside the DMA mask of @peer_adev. A peer
 * without a DMA mask is treated as limited to 32 bit addressing.
 *
 * Always returns false when CONFIG_HSA_AMD_P2P is not enabled.
 */
bool amdgpu_device_is_peer_accessible(struct amdgpu_device *adev,
				      struct amdgpu_device *peer_adev)
{
#ifdef CONFIG_HSA_AMD_P2P
	uint64_t unreachable_bits;
	resource_size_t bar_first = adev->gmc.aper_base;
	resource_size_t bar_last =
		adev->gmc.aper_base + adev->gmc.aper_size - 1;
	bool routable;

	/* Address bits the peer is unable to generate */
	if (peer_adev->dev->dma_mask)
		unreachable_bits = ~*peer_adev->dev->dma_mask;
	else
		unreachable_bits = ~((1ULL << 32) - 1);

	/* The P2P distance is only queried when not connected to the CPU */
	routable = !adev->gmc.xgmi.connected_to_cpu &&
		   pci_p2pdma_distance(adev->pdev, peer_adev->dev, false) >= 0;

	if (!pcie_p2p || !routable)
		return false;

	/* "Large BAR": visible VRAM is non-zero and covers all of VRAM */
	if (!adev->gmc.visible_vram_size ||
	    adev->gmc.real_vram_size != adev->gmc.visible_vram_size)
		return false;

	/* Both ends of the aperture must be addressable by the peer */
	return !((bar_first | bar_last) & unreachable_bits);
#else
	return false;
#endif
}
5631
5632 int amdgpu_device_baco_enter(struct drm_device *dev)
5633 {
5634         struct amdgpu_device *adev = drm_to_adev(dev);
5635         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5636
5637         if (!amdgpu_device_supports_baco(dev))
5638                 return -ENOTSUPP;
5639
5640         if (ras && adev->ras_enabled &&
5641             adev->nbio.funcs->enable_doorbell_interrupt)
5642                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
5643
5644         return amdgpu_dpm_baco_enter(adev);
5645 }
5646
5647 int amdgpu_device_baco_exit(struct drm_device *dev)
5648 {
5649         struct amdgpu_device *adev = drm_to_adev(dev);
5650         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
5651         int ret = 0;
5652
5653         if (!amdgpu_device_supports_baco(dev))
5654                 return -ENOTSUPP;
5655
5656         ret = amdgpu_dpm_baco_exit(adev);
5657         if (ret)
5658                 return ret;
5659
5660         if (ras && adev->ras_enabled &&
5661             adev->nbio.funcs->enable_doorbell_interrupt)
5662                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
5663
5664         if (amdgpu_passthrough(adev) &&
5665             adev->nbio.funcs->clear_doorbell_interrupt)
5666                 adev->nbio.funcs->clear_doorbell_interrupt(adev);
5667
5668         return 0;
5669 }
5670
5671 /**
5672  * amdgpu_pci_error_detected - Called when a PCI error is detected.
5673  * @pdev: PCI device struct
5674  * @state: PCI channel state
5675  *
5676  * Description: Called when a PCI error is detected.
5677  *
5678  * Return: PCI_ERS_RESULT_NEED_RESET or PCI_ERS_RESULT_DISCONNECT.
5679  */
5680 pci_ers_result_t amdgpu_pci_error_detected(struct pci_dev *pdev, pci_channel_state_t state)
5681 {
5682         struct drm_device *dev = pci_get_drvdata(pdev);
5683         struct amdgpu_device *adev = drm_to_adev(dev);
5684         int i;
5685
5686         DRM_INFO("PCI error: detected callback, state(%d)!!\n", state);
5687
5688         if (adev->gmc.xgmi.num_physical_nodes > 1) {
5689                 DRM_WARN("No support for XGMI hive yet...");
5690                 return PCI_ERS_RESULT_DISCONNECT;
5691         }
5692
5693         adev->pci_channel_state = state;
5694
5695         switch (state) {
5696         case pci_channel_io_normal:
5697                 return PCI_ERS_RESULT_CAN_RECOVER;
5698         /* Fatal error, prepare for slot reset */
5699         case pci_channel_io_frozen:
5700                 /*
5701                  * Locking adev->reset_domain->sem will prevent any external access
5702                  * to GPU during PCI error recovery
5703                  */
5704                 amdgpu_device_lock_reset_domain(adev->reset_domain);
5705                 amdgpu_device_set_mp1_state(adev);
5706
5707                 /*
5708                  * Block any work scheduling as we do for regular GPU reset
5709                  * for the duration of the recovery
5710                  */
5711                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5712                         struct amdgpu_ring *ring = adev->rings[i];
5713
5714                         if (!ring || !ring->sched.thread)
5715                                 continue;
5716
5717                         drm_sched_stop(&ring->sched, NULL);
5718                 }
5719                 atomic_inc(&adev->gpu_reset_counter);
5720                 return PCI_ERS_RESULT_NEED_RESET;
5721         case pci_channel_io_perm_failure:
5722                 /* Permanent error, prepare for device removal */
5723                 return PCI_ERS_RESULT_DISCONNECT;
5724         }
5725
5726         return PCI_ERS_RESULT_NEED_RESET;
5727 }
5728
5729 /**
5730  * amdgpu_pci_mmio_enabled - Enable MMIO and dump debug registers
5731  * @pdev: pointer to PCI device
5732  */
5733 pci_ers_result_t amdgpu_pci_mmio_enabled(struct pci_dev *pdev)
5734 {
5735
5736         DRM_INFO("PCI error: mmio enabled callback!!\n");
5737
5738         /* TODO - dump whatever for debugging purposes */
5739
5740         /* This called only if amdgpu_pci_error_detected returns
5741          * PCI_ERS_RESULT_CAN_RECOVER. Read/write to the device still
5742          * works, no need to reset slot.
5743          */
5744
5745         return PCI_ERS_RESULT_RECOVERED;
5746 }
5747
5748 /**
5749  * amdgpu_pci_slot_reset - Called when PCI slot has been reset.
5750  * @pdev: PCI device struct
5751  *
5752  * Description: This routine is called by the pci error recovery
5753  * code after the PCI slot has been reset, just before we
5754  * should resume normal operations.
5755  */
5756 pci_ers_result_t amdgpu_pci_slot_reset(struct pci_dev *pdev)
5757 {
5758         struct drm_device *dev = pci_get_drvdata(pdev);
5759         struct amdgpu_device *adev = drm_to_adev(dev);
5760         int r, i;
5761         struct amdgpu_reset_context reset_context;
5762         u32 memsize;
5763         struct list_head device_list;
5764
5765         DRM_INFO("PCI error: slot reset callback!!\n");
5766
5767         memset(&reset_context, 0, sizeof(reset_context));
5768
5769         INIT_LIST_HEAD(&device_list);
5770         list_add_tail(&adev->reset_list, &device_list);
5771
5772         /* wait for asic to come out of reset */
5773         msleep(500);
5774
5775         /* Restore PCI confspace */
5776         amdgpu_device_load_pci_state(pdev);
5777
5778         /* confirm  ASIC came out of reset */
5779         for (i = 0; i < adev->usec_timeout; i++) {
5780                 memsize = amdgpu_asic_get_config_memsize(adev);
5781
5782                 if (memsize != 0xffffffff)
5783                         break;
5784                 udelay(1);
5785         }
5786         if (memsize == 0xffffffff) {
5787                 r = -ETIME;
5788                 goto out;
5789         }
5790
5791         reset_context.method = AMD_RESET_METHOD_NONE;
5792         reset_context.reset_req_dev = adev;
5793         set_bit(AMDGPU_NEED_FULL_RESET, &reset_context.flags);
5794         set_bit(AMDGPU_SKIP_HW_RESET, &reset_context.flags);
5795
5796         adev->no_hw_access = true;
5797         r = amdgpu_device_pre_asic_reset(adev, &reset_context);
5798         adev->no_hw_access = false;
5799         if (r)
5800                 goto out;
5801
5802         r = amdgpu_do_asic_reset(&device_list, &reset_context);
5803
5804 out:
5805         if (!r) {
5806                 if (amdgpu_device_cache_pci_state(adev->pdev))
5807                         pci_restore_state(adev->pdev);
5808
5809                 DRM_INFO("PCIe error recovery succeeded\n");
5810         } else {
5811                 DRM_ERROR("PCIe error recovery failed, err:%d", r);
5812                 amdgpu_device_unset_mp1_state(adev);
5813                 amdgpu_device_unlock_reset_domain(adev->reset_domain);
5814         }
5815
5816         return r ? PCI_ERS_RESULT_DISCONNECT : PCI_ERS_RESULT_RECOVERED;
5817 }
5818
5819 /**
5820  * amdgpu_pci_resume() - resume normal ops after PCI reset
5821  * @pdev: pointer to PCI device
5822  *
5823  * Called when the error recovery driver tells us that its
5824  * OK to resume normal operation.
5825  */
5826 void amdgpu_pci_resume(struct pci_dev *pdev)
5827 {
5828         struct drm_device *dev = pci_get_drvdata(pdev);
5829         struct amdgpu_device *adev = drm_to_adev(dev);
5830         int i;
5831
5832
5833         DRM_INFO("PCI error: resume callback!!\n");
5834
5835         /* Only continue execution for the case of pci_channel_io_frozen */
5836         if (adev->pci_channel_state != pci_channel_io_frozen)
5837                 return;
5838
5839         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
5840                 struct amdgpu_ring *ring = adev->rings[i];
5841
5842                 if (!ring || !ring->sched.thread)
5843                         continue;
5844
5845                 drm_sched_start(&ring->sched, true);
5846         }
5847
5848         amdgpu_device_unset_mp1_state(adev);
5849         amdgpu_device_unlock_reset_domain(adev->reset_domain);
5850 }
5851
5852 bool amdgpu_device_cache_pci_state(struct pci_dev *pdev)
5853 {
5854         struct drm_device *dev = pci_get_drvdata(pdev);
5855         struct amdgpu_device *adev = drm_to_adev(dev);
5856         int r;
5857
5858         r = pci_save_state(pdev);
5859         if (!r) {
5860                 kfree(adev->pci_state);
5861
5862                 adev->pci_state = pci_store_saved_state(pdev);
5863
5864                 if (!adev->pci_state) {
5865                         DRM_ERROR("Failed to store PCI saved state");
5866                         return false;
5867                 }
5868         } else {
5869                 DRM_WARN("Failed to save PCI state, err:%d\n", r);
5870                 return false;
5871         }
5872
5873         return true;
5874 }
5875
5876 bool amdgpu_device_load_pci_state(struct pci_dev *pdev)
5877 {
5878         struct drm_device *dev = pci_get_drvdata(pdev);
5879         struct amdgpu_device *adev = drm_to_adev(dev);
5880         int r;
5881
5882         if (!adev->pci_state)
5883                 return false;
5884
5885         r = pci_load_saved_state(pdev, adev->pci_state);
5886
5887         if (!r) {
5888                 pci_restore_state(pdev);
5889         } else {
5890                 DRM_WARN("Failed to load PCI state, err:%d\n", r);
5891                 return false;
5892         }
5893
5894         return true;
5895 }
5896
5897 void amdgpu_device_flush_hdp(struct amdgpu_device *adev,
5898                 struct amdgpu_ring *ring)
5899 {
5900 #ifdef CONFIG_X86_64
5901         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5902                 return;
5903 #endif
5904         if (adev->gmc.xgmi.connected_to_cpu)
5905                 return;
5906
5907         if (ring && ring->funcs->emit_hdp_flush)
5908                 amdgpu_ring_emit_hdp_flush(ring);
5909         else
5910                 amdgpu_asic_flush_hdp(adev, ring);
5911 }
5912
5913 void amdgpu_device_invalidate_hdp(struct amdgpu_device *adev,
5914                 struct amdgpu_ring *ring)
5915 {
5916 #ifdef CONFIG_X86_64
5917         if ((adev->flags & AMD_IS_APU) && !amdgpu_passthrough(adev))
5918                 return;
5919 #endif
5920         if (adev->gmc.xgmi.connected_to_cpu)
5921                 return;
5922
5923         amdgpu_asic_invalidate_hdp(adev, ring);
5924 }
5925
5926 int amdgpu_in_reset(struct amdgpu_device *adev)
5927 {
5928         return atomic_read(&adev->reset_domain->in_gpu_reset);
5929 }
5930
5931 /**
5932  * amdgpu_device_halt() - bring hardware to some kind of halt state
5933  *
5934  * @adev: amdgpu_device pointer
5935  *
5936  * Bring hardware to some kind of halt state so that no one can touch it
5937  * any more. It will help to maintain error context when error occurred.
5938  * Compare to a simple hang, the system will keep stable at least for SSH
5939  * access. Then it should be trivial to inspect the hardware state and
5940  * see what's going on. Implemented as following:
5941  *
5942  * 1. drm_dev_unplug() makes device inaccessible to user space(IOCTLs, etc),
5943  *    clears all CPU mappings to device, disallows remappings through page faults
5944  * 2. amdgpu_irq_disable_all() disables all interrupts
5945  * 3. amdgpu_fence_driver_hw_fini() signals all HW fences
5946  * 4. set adev->no_hw_access to avoid potential crashes after setp 5
5947  * 5. amdgpu_device_unmap_mmio() clears all MMIO mappings
5948  * 6. pci_disable_device() and pci_wait_for_pending_transaction()
5949  *    flush any in flight DMA operations
5950  */
5951 void amdgpu_device_halt(struct amdgpu_device *adev)
5952 {
5953         struct pci_dev *pdev = adev->pdev;
5954         struct drm_device *ddev = adev_to_drm(adev);
5955
5956         amdgpu_xcp_dev_unplug(adev);
5957         drm_dev_unplug(ddev);
5958
5959         amdgpu_irq_disable_all(adev);
5960
5961         amdgpu_fence_driver_hw_fini(adev);
5962
5963         adev->no_hw_access = true;
5964
5965         amdgpu_device_unmap_mmio(adev);
5966
5967         pci_disable_device(pdev);
5968         pci_wait_for_pending_transaction(pdev);
5969 }
5970
5971 u32 amdgpu_device_pcie_port_rreg(struct amdgpu_device *adev,
5972                                 u32 reg)
5973 {
5974         unsigned long flags, address, data;
5975         u32 r;
5976
5977         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5978         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5979
5980         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5981         WREG32(address, reg * 4);
5982         (void)RREG32(address);
5983         r = RREG32(data);
5984         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
5985         return r;
5986 }
5987
5988 void amdgpu_device_pcie_port_wreg(struct amdgpu_device *adev,
5989                                 u32 reg, u32 v)
5990 {
5991         unsigned long flags, address, data;
5992
5993         address = adev->nbio.funcs->get_pcie_port_index_offset(adev);
5994         data = adev->nbio.funcs->get_pcie_port_data_offset(adev);
5995
5996         spin_lock_irqsave(&adev->pcie_idx_lock, flags);
5997         WREG32(address, reg * 4);
5998         (void)RREG32(address);
5999         WREG32(data, v);
6000         (void)RREG32(data);
6001         spin_unlock_irqrestore(&adev->pcie_idx_lock, flags);
6002 }
6003
6004 /**
6005  * amdgpu_device_switch_gang - switch to a new gang
6006  * @adev: amdgpu_device pointer
6007  * @gang: the gang to switch to
6008  *
6009  * Try to switch to a new gang.
6010  * Returns: NULL if we switched to the new gang or a reference to the current
6011  * gang leader.
6012  */
6013 struct dma_fence *amdgpu_device_switch_gang(struct amdgpu_device *adev,
6014                                             struct dma_fence *gang)
6015 {
6016         struct dma_fence *old = NULL;
6017
6018         do {
6019                 dma_fence_put(old);
6020                 rcu_read_lock();
6021                 old = dma_fence_get_rcu_safe(&adev->gang_submit);
6022                 rcu_read_unlock();
6023
6024                 if (old == gang)
6025                         break;
6026
6027                 if (!dma_fence_is_signaled(old))
6028                         return old;
6029
6030         } while (cmpxchg((struct dma_fence __force **)&adev->gang_submit,
6031                          old, gang) != old);
6032
6033         dma_fence_put(old);
6034         return NULL;
6035 }
6036
6037 bool amdgpu_device_has_display_hardware(struct amdgpu_device *adev)
6038 {
6039         switch (adev->asic_type) {
6040 #ifdef CONFIG_DRM_AMDGPU_SI
6041         case CHIP_HAINAN:
6042 #endif
6043         case CHIP_TOPAZ:
6044                 /* chips with no display hardware */
6045                 return false;
6046 #ifdef CONFIG_DRM_AMDGPU_SI
6047         case CHIP_TAHITI:
6048         case CHIP_PITCAIRN:
6049         case CHIP_VERDE:
6050         case CHIP_OLAND:
6051 #endif
6052 #ifdef CONFIG_DRM_AMDGPU_CIK
6053         case CHIP_BONAIRE:
6054         case CHIP_HAWAII:
6055         case CHIP_KAVERI:
6056         case CHIP_KABINI:
6057         case CHIP_MULLINS:
6058 #endif
6059         case CHIP_TONGA:
6060         case CHIP_FIJI:
6061         case CHIP_POLARIS10:
6062         case CHIP_POLARIS11:
6063         case CHIP_POLARIS12:
6064         case CHIP_VEGAM:
6065         case CHIP_CARRIZO:
6066         case CHIP_STONEY:
6067                 /* chips with display hardware */
6068                 return true;
6069         default:
6070                 /* IP discovery */
6071                 if (!adev->ip_versions[DCE_HWIP][0] ||
6072                     (adev->harvest_ip_mask & AMD_HARVEST_IP_DMU_MASK))
6073                         return false;
6074                 return true;
6075         }
6076 }
6077
6078 uint32_t amdgpu_device_wait_on_rreg(struct amdgpu_device *adev,
6079                 uint32_t inst, uint32_t reg_addr, char reg_name[],
6080                 uint32_t expected_value, uint32_t mask)
6081 {
6082         uint32_t ret = 0;
6083         uint32_t old_ = 0;
6084         uint32_t tmp_ = RREG32(reg_addr);
6085         uint32_t loop = adev->usec_timeout;
6086
6087         while ((tmp_ & (mask)) != (expected_value)) {
6088                 if (old_ != tmp_) {
6089                         loop = adev->usec_timeout;
6090                         old_ = tmp_;
6091                 } else
6092                         udelay(1);
6093                 tmp_ = RREG32(reg_addr);
6094                 loop--;
6095                 if (!loop) {
6096                         DRM_WARN("Register(%d) [%s] failed to reach value 0x%08x != 0x%08xn",
6097                                   inst, reg_name, (uint32_t)expected_value,
6098                                   (uint32_t)(tmp_ & (mask)));
6099                         ret = -ETIMEDOUT;
6100                         break;
6101                 }
6102         }
6103         return ret;
6104 }