drivers/gpu/drm/amd/amdgpu/amdgpu_device.c

   1 /*
   2  * Copyright 2008 Advanced Micro Devices, Inc.
   3  * Copyright 2008 Red Hat Inc.
   4  * Copyright 2009 Jerome Glisse.
   5  *
   6  * Permission is hereby granted, free of charge, to any person obtaining a
   7  * copy of this software and associated documentation files (the "Software"),
   8  * to deal in the Software without restriction, including without limitation
   9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
  10  * and/or sell copies of the Software, and to permit persons to whom the
  11  * Software is furnished to do so, subject to the following conditions:
  12  *
  13  * The above copyright notice and this permission notice shall be included in
  14  * all copies or substantial portions of the Software.
  15  *
  16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  22  * OTHER DEALINGS IN THE SOFTWARE.
  23  *
  24  * Authors: Dave Airlie
  25  *          Alex Deucher
  26  *          Jerome Glisse
  27  */
  28 #include <linux/power_supply.h>
  29 #include <linux/kthread.h>
  30 #include <linux/module.h>
  31 #include <linux/console.h>
  32 #include <linux/slab.h>
  33
  34 #include <drm/drm_atomic_helper.h>
  35 #include <drm/drm_probe_helper.h>
  36 #include <drm/amdgpu_drm.h>
  37 #include <linux/vgaarb.h>
  38 #include <linux/vga_switcheroo.h>
  39 #include <linux/efi.h>
  40 #include "amdgpu.h"
  41 #include "amdgpu_trace.h"
  42 #include "amdgpu_i2c.h"
  43 #include "atom.h"
  44 #include "amdgpu_atombios.h"
  45 #include "amdgpu_atomfirmware.h"
  46 #include "amd_pcie.h"
  47 #ifdef CONFIG_DRM_AMDGPU_SI
  48 #include "si.h"
  49 #endif
  50 #ifdef CONFIG_DRM_AMDGPU_CIK
  51 #include "cik.h"
  52 #endif
  53 #include "vi.h"
  54 #include "soc15.h"
  55 #include "nv.h"
  56 #include "bif/bif_4_1_d.h"
  57 #include <linux/pci.h>
  58 #include <linux/firmware.h>
  59 #include "amdgpu_vf_error.h"
  60
  61 #include "amdgpu_amdkfd.h"
  62 #include "amdgpu_pm.h"
  63
  64 #include "amdgpu_xgmi.h"
  65 #include "amdgpu_ras.h"
  66 #include "amdgpu_pmu.h"
  67 #include "amdgpu_fru_eeprom.h"
  68
  69 #include <linux/suspend.h>
  70 #include <drm/task_barrier.h>
  71 #include <linux/pm_runtime.h>
  72
  73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
  74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
  75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
  76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
  77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
  78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
  79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
  80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
  81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
  82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
  83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
  84 MODULE_FIRMWARE("amdgpu/navy_flounder_gpu_info.bin");
  85
  86 #define AMDGPU_RESUME_MS                2000
  87
  88 const char *amdgpu_asic_name[] = {
  89         "TAHITI",
  90         "PITCAIRN",
  91         "VERDE",
  92         "OLAND",
  93         "HAINAN",
  94         "BONAIRE",
  95         "KAVERI",
  96         "KABINI",
  97         "HAWAII",
  98         "MULLINS",
  99         "TOPAZ",
 100         "TONGA",
 101         "FIJI",
 102         "CARRIZO",
 103         "STONEY",
 104         "POLARIS10",
 105         "POLARIS11",
 106         "POLARIS12",
 107         "VEGAM",
 108         "VEGA10",
 109         "VEGA12",
 110         "VEGA20",
 111         "RAVEN",
 112         "ARCTURUS",
 113         "RENOIR",
 114         "NAVI10",
 115         "NAVI14",
 116         "NAVI12",
 117         "SIENNA_CICHLID",
 118         "NAVY_FLOUNDER",
 119         "LAST",
 120 };
 121
 122 /**
 123  * DOC: pcie_replay_count
 124  *
 125  * The amdgpu driver provides a sysfs API for reporting the total number
 126  * of PCIe replays (NAKs)
 127  * The file pcie_replay_count is used for this and returns the total
 128  * number of replays as a sum of the NAKs generated and NAKs received
 129  */
 130
 131 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
 132                 struct device_attribute *attr, char *buf)
 133 {
 134         struct drm_device *ddev = dev_get_drvdata(dev);
 135         struct amdgpu_device *adev = ddev->dev_private;
 136         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
 137
 138         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
 139 }
 140
 141 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
 142                 amdgpu_device_get_pcie_replay_count, NULL);
 143
 144 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
 145
 146 /**
 147  * DOC: product_name
 148  *
 149  * The amdgpu driver provides a sysfs API for reporting the product name
 150  * for the device
  151  * The file product_name is used for this and returns the product name
 152  * as returned from the FRU.
 153  * NOTE: This is only available for certain server cards
 154  */
 155
 156 static ssize_t amdgpu_device_get_product_name(struct device *dev,
 157                 struct device_attribute *attr, char *buf)
 158 {
 159         struct drm_device *ddev = dev_get_drvdata(dev);
 160         struct amdgpu_device *adev = ddev->dev_private;
 161
 162         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
 163 }
 164
 165 static DEVICE_ATTR(product_name, S_IRUGO,
 166                 amdgpu_device_get_product_name, NULL);
 167
 168 /**
 169  * DOC: product_number
 170  *
 171  * The amdgpu driver provides a sysfs API for reporting the part number
 172  * for the device
  173  * The file product_number is used for this and returns the part number
 174  * as returned from the FRU.
 175  * NOTE: This is only available for certain server cards
 176  */
 177
 178 static ssize_t amdgpu_device_get_product_number(struct device *dev,
 179                 struct device_attribute *attr, char *buf)
 180 {
 181         struct drm_device *ddev = dev_get_drvdata(dev);
 182         struct amdgpu_device *adev = ddev->dev_private;
 183
 184         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
 185 }
 186
 187 static DEVICE_ATTR(product_number, S_IRUGO,
 188                 amdgpu_device_get_product_number, NULL);
 189
 190 /**
 191  * DOC: serial_number
 192  *
 193  * The amdgpu driver provides a sysfs API for reporting the serial number
 194  * for the device
 195  * The file serial_number is used for this and returns the serial number
 196  * as returned from the FRU.
 197  * NOTE: This is only available for certain server cards
 198  */
 199
 200 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
 201                 struct device_attribute *attr, char *buf)
 202 {
 203         struct drm_device *ddev = dev_get_drvdata(dev);
 204         struct amdgpu_device *adev = ddev->dev_private;
 205
 206         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
 207 }
 208
 209 static DEVICE_ATTR(serial_number, S_IRUGO,
 210                 amdgpu_device_get_serial_number, NULL);
 211
 212 /**
 213  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
 214  *
 215  * @dev: drm_device pointer
 216  *
 217  * Returns true if the device is a dGPU with HG/PX power control,
 218  * otherwise return false.
 219  */
 220 bool amdgpu_device_supports_boco(struct drm_device *dev)
 221 {
 222         struct amdgpu_device *adev = dev->dev_private;
 223
 224         if (adev->flags & AMD_IS_PX)
 225                 return true;
 226         return false;
 227 }
 228
 229 /**
 230  * amdgpu_device_supports_baco - Does the device support BACO
 231  *
 232  * @dev: drm_device pointer
 233  *
 234  * Returns true if the device supporte BACO,
 235  * otherwise return false.
 236  */
 237 bool amdgpu_device_supports_baco(struct drm_device *dev)
 238 {
 239         struct amdgpu_device *adev = dev->dev_private;
 240
 241         return amdgpu_asic_supports_baco(adev);
 242 }
 243
 244 /**
 245  * VRAM access helper functions.
 246  *
 247  * amdgpu_device_vram_access - read/write a buffer in vram
 248  *
 249  * @adev: amdgpu_device pointer
 250  * @pos: offset of the buffer in vram
 251  * @buf: virtual address of the buffer in system memory
 252  * @size: read/write size, sizeof(@buf) must > @size
 253  * @write: true - write to vram, otherwise - read from vram
 254  */
 255 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
 256                                uint32_t *buf, size_t size, bool write)
 257 {
 258         unsigned long flags;
 259         uint32_t hi = ~0;
 260         uint64_t last;
 261
 262
 263 #ifdef CONFIG_64BIT
 264         last = min(pos + size, adev->gmc.visible_vram_size);
 265         if (last > pos) {
 266                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
 267                 size_t count = last - pos;
 268
 269                 if (write) {
 270                         memcpy_toio(addr, buf, count);
 271                         mb();
 272                         amdgpu_asic_flush_hdp(adev, NULL);
 273                 } else {
 274                         amdgpu_asic_invalidate_hdp(adev, NULL);
 275                         mb();
 276                         memcpy_fromio(buf, addr, count);
 277                 }
 278
 279                 if (count == size)
 280                         return;
 281
 282                 pos += count;
 283                 buf += count / 4;
 284                 size -= count;
 285         }
 286 #endif
 287
 288         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 289         for (last = pos + size; pos < last; pos += 4) {
 290                 uint32_t tmp = pos >> 31;
 291
 292                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
 293                 if (tmp != hi) {
 294                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
 295                         hi = tmp;
 296                 }
 297                 if (write)
 298                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
 299                 else
 300                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
 301         }
 302         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 303 }
 304
 305 /*
 306  * MMIO register access helper functions.
 307  */
 308 /**
 309  * amdgpu_mm_rreg - read a memory mapped IO register
 310  *
 311  * @adev: amdgpu_device pointer
 312  * @reg: dword aligned register offset
 313  * @acc_flags: access flags which require special behavior
 314  *
 315  * Returns the 32 bit value from the offset specified.
 316  */
 317 uint32_t amdgpu_mm_rreg(struct amdgpu_device *adev, uint32_t reg,
 318                         uint32_t acc_flags)
 319 {
 320         uint32_t ret;
 321
 322         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 323                 return amdgpu_kiq_rreg(adev, reg);
 324
 325         if ((reg * 4) < adev->rmmio_size)
 326                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
 327         else {
 328                 unsigned long flags;
 329
 330                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 331                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
 332                 ret = readl(((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 333                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 334         }
 335         trace_amdgpu_mm_rreg(adev->pdev->device, reg, ret);
 336         return ret;
 337 }
 338
 339 /*
 340  * MMIO register read with bytes helper functions
 341  * @offset:bytes offset from MMIO start
 342  *
 343 */
 344
 345 /**
 346  * amdgpu_mm_rreg8 - read a memory mapped IO register
 347  *
 348  * @adev: amdgpu_device pointer
 349  * @offset: byte aligned register offset
 350  *
 351  * Returns the 8 bit value from the offset specified.
 352  */
 353 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
 354         if (offset < adev->rmmio_size)
 355                 return (readb(adev->rmmio + offset));
 356         BUG();
 357 }
 358
 359 /*
 360  * MMIO register write with bytes helper functions
 361  * @offset:bytes offset from MMIO start
 362  * @value: the value want to be written to the register
 363  *
 364 */
 365 /**
 366  * amdgpu_mm_wreg8 - read a memory mapped IO register
 367  *
 368  * @adev: amdgpu_device pointer
 369  * @offset: byte aligned register offset
 370  * @value: 8 bit value to write
 371  *
 372  * Writes the value specified to the offset specified.
 373  */
 374 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
 375         if (offset < adev->rmmio_size)
 376                 writeb(value, adev->rmmio + offset);
 377         else
 378                 BUG();
 379 }
 380
 381 void static inline amdgpu_mm_wreg_mmio(struct amdgpu_device *adev, uint32_t reg, uint32_t v, uint32_t acc_flags)
 382 {
 383         trace_amdgpu_mm_wreg(adev->pdev->device, reg, v);
 384
 385         if ((reg * 4) < adev->rmmio_size)
 386                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
 387         else {
 388                 unsigned long flags;
 389
 390                 spin_lock_irqsave(&adev->mmio_idx_lock, flags);
 391                 writel((reg * 4), ((void __iomem *)adev->rmmio) + (mmMM_INDEX * 4));
 392                 writel(v, ((void __iomem *)adev->rmmio) + (mmMM_DATA * 4));
 393                 spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
 394         }
 395 }
 396
 397 /**
 398  * amdgpu_mm_wreg - write to a memory mapped IO register
 399  *
 400  * @adev: amdgpu_device pointer
 401  * @reg: dword aligned register offset
 402  * @v: 32 bit value to write to the register
 403  * @acc_flags: access flags which require special behavior
 404  *
 405  * Writes the value specified to the offset specified.
 406  */
 407 void amdgpu_mm_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 408                     uint32_t acc_flags)
 409 {
 410         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
 411                 return amdgpu_kiq_wreg(adev, reg, v);
 412
 413         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 414 }
 415
 416 /*
 417  * amdgpu_mm_wreg_mmio_rlc -  write register either with mmio or with RLC path if in range
 418  *
 419  * this function is invoked only the debugfs register access
 420  * */
 421 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
 422                     uint32_t acc_flags)
 423 {
 424         if (amdgpu_sriov_fullaccess(adev) &&
 425                 adev->gfx.rlc.funcs &&
 426                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
 427
 428                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
 429                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
 430         }
 431
 432         amdgpu_mm_wreg_mmio(adev, reg, v, acc_flags);
 433 }
 434
 435 /**
 436  * amdgpu_io_rreg - read an IO register
 437  *
 438  * @adev: amdgpu_device pointer
 439  * @reg: dword aligned register offset
 440  *
 441  * Returns the 32 bit value from the offset specified.
 442  */
 443 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
 444 {
 445         if ((reg * 4) < adev->rio_mem_size)
 446                 return ioread32(adev->rio_mem + (reg * 4));
 447         else {
 448                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 449                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
 450         }
 451 }
 452
 453 /**
 454  * amdgpu_io_wreg - write to an IO register
 455  *
 456  * @adev: amdgpu_device pointer
 457  * @reg: dword aligned register offset
 458  * @v: 32 bit value to write to the register
 459  *
 460  * Writes the value specified to the offset specified.
 461  */
 462 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
 463 {
 464         if ((reg * 4) < adev->rio_mem_size)
 465                 iowrite32(v, adev->rio_mem + (reg * 4));
 466         else {
 467                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
 468                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
 469         }
 470 }
 471
 472 /**
 473  * amdgpu_mm_rdoorbell - read a doorbell dword
 474  *
 475  * @adev: amdgpu_device pointer
 476  * @index: doorbell index
 477  *
 478  * Returns the value in the doorbell aperture at the
 479  * requested doorbell index (CIK).
 480  */
 481 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
 482 {
 483         if (index < adev->doorbell.num_doorbells) {
 484                 return readl(adev->doorbell.ptr + index);
 485         } else {
 486                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 487                 return 0;
 488         }
 489 }
 490
 491 /**
 492  * amdgpu_mm_wdoorbell - write a doorbell dword
 493  *
 494  * @adev: amdgpu_device pointer
 495  * @index: doorbell index
 496  * @v: value to write
 497  *
 498  * Writes @v to the doorbell aperture at the
 499  * requested doorbell index (CIK).
 500  */
 501 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
 502 {
 503         if (index < adev->doorbell.num_doorbells) {
 504                 writel(v, adev->doorbell.ptr + index);
 505         } else {
 506                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 507         }
 508 }
 509
 510 /**
 511  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
 512  *
 513  * @adev: amdgpu_device pointer
 514  * @index: doorbell index
 515  *
 516  * Returns the value in the doorbell aperture at the
 517  * requested doorbell index (VEGA10+).
 518  */
 519 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
 520 {
 521         if (index < adev->doorbell.num_doorbells) {
 522                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
 523         } else {
 524                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
 525                 return 0;
 526         }
 527 }
 528
 529 /**
 530  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
 531  *
 532  * @adev: amdgpu_device pointer
 533  * @index: doorbell index
 534  * @v: value to write
 535  *
 536  * Writes @v to the doorbell aperture at the
 537  * requested doorbell index (VEGA10+).
 538  */
 539 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
 540 {
 541         if (index < adev->doorbell.num_doorbells) {
 542                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
 543         } else {
 544                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
 545         }
 546 }
 547
 548 /**
 549  * amdgpu_invalid_rreg - dummy reg read function
 550  *
 551  * @adev: amdgpu device pointer
 552  * @reg: offset of register
 553  *
 554  * Dummy register read function.  Used for register blocks
 555  * that certain asics don't have (all asics).
 556  * Returns the value in the register.
 557  */
 558 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
 559 {
 560         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
 561         BUG();
 562         return 0;
 563 }
 564
 565 /**
 566  * amdgpu_invalid_wreg - dummy reg write function
 567  *
 568  * @adev: amdgpu device pointer
 569  * @reg: offset of register
 570  * @v: value to write to the register
 571  *
 572  * Dummy register read function.  Used for register blocks
 573  * that certain asics don't have (all asics).
 574  */
 575 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
 576 {
 577         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
 578                   reg, v);
 579         BUG();
 580 }
 581
 582 /**
 583  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
 584  *
 585  * @adev: amdgpu device pointer
 586  * @reg: offset of register
 587  *
 588  * Dummy register read function.  Used for register blocks
 589  * that certain asics don't have (all asics).
 590  * Returns the value in the register.
 591  */
 592 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
 593 {
 594         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
 595         BUG();
 596         return 0;
 597 }
 598
 599 /**
 600  * amdgpu_invalid_wreg64 - dummy reg write function
 601  *
 602  * @adev: amdgpu device pointer
 603  * @reg: offset of register
 604  * @v: value to write to the register
 605  *
 606  * Dummy register read function.  Used for register blocks
 607  * that certain asics don't have (all asics).
 608  */
 609 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
 610 {
 611         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
 612                   reg, v);
 613         BUG();
 614 }
 615
 616 /**
 617  * amdgpu_block_invalid_rreg - dummy reg read function
 618  *
 619  * @adev: amdgpu device pointer
 620  * @block: offset of instance
 621  * @reg: offset of register
 622  *
 623  * Dummy register read function.  Used for register blocks
 624  * that certain asics don't have (all asics).
 625  * Returns the value in the register.
 626  */
 627 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
 628                                           uint32_t block, uint32_t reg)
 629 {
 630         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
 631                   reg, block);
 632         BUG();
 633         return 0;
 634 }
 635
 636 /**
 637  * amdgpu_block_invalid_wreg - dummy reg write function
 638  *
 639  * @adev: amdgpu device pointer
 640  * @block: offset of instance
 641  * @reg: offset of register
 642  * @v: value to write to the register
 643  *
 644  * Dummy register read function.  Used for register blocks
 645  * that certain asics don't have (all asics).
 646  */
 647 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
 648                                       uint32_t block,
 649                                       uint32_t reg, uint32_t v)
 650 {
 651         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
 652                   reg, block, v);
 653         BUG();
 654 }
 655
 656 /**
 657  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
 658  *
 659  * @adev: amdgpu device pointer
 660  *
 661  * Allocates a scratch page of VRAM for use by various things in the
 662  * driver.
 663  */
 664 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
 665 {
 666         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
 667                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
 668                                        &adev->vram_scratch.robj,
 669                                        &adev->vram_scratch.gpu_addr,
 670                                        (void **)&adev->vram_scratch.ptr);
 671 }
 672
 673 /**
 674  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
 675  *
 676  * @adev: amdgpu device pointer
 677  *
 678  * Frees the VRAM scratch page.
 679  */
 680 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
 681 {
 682         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
 683 }
 684
 685 /**
 686  * amdgpu_device_program_register_sequence - program an array of registers.
 687  *
 688  * @adev: amdgpu_device pointer
 689  * @registers: pointer to the register array
 690  * @array_size: size of the register array
 691  *
 692  * Programs an array or registers with and and or masks.
 693  * This is a helper for setting golden registers.
 694  */
 695 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
 696                                              const u32 *registers,
 697                                              const u32 array_size)
 698 {
 699         u32 tmp, reg, and_mask, or_mask;
 700         int i;
 701
 702         if (array_size % 3)
 703                 return;
 704
 705         for (i = 0; i < array_size; i +=3) {
 706                 reg = registers[i + 0];
 707                 and_mask = registers[i + 1];
 708                 or_mask = registers[i + 2];
 709
 710                 if (and_mask == 0xffffffff) {
 711                         tmp = or_mask;
 712                 } else {
 713                         tmp = RREG32(reg);
 714                         tmp &= ~and_mask;
 715                         if (adev->family >= AMDGPU_FAMILY_AI)
 716                                 tmp |= (or_mask & and_mask);
 717                         else
 718                                 tmp |= or_mask;
 719                 }
 720                 WREG32(reg, tmp);
 721         }
 722 }
 723
 724 /**
 725  * amdgpu_device_pci_config_reset - reset the GPU
 726  *
 727  * @adev: amdgpu_device pointer
 728  *
 729  * Resets the GPU using the pci config reset sequence.
 730  * Only applicable to asics prior to vega10.
 731  */
 732 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
 733 {
 734         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
 735 }
 736
 737 /*
 738  * GPU doorbell aperture helpers function.
 739  */
 740 /**
 741  * amdgpu_device_doorbell_init - Init doorbell driver information.
 742  *
 743  * @adev: amdgpu_device pointer
 744  *
 745  * Init doorbell driver information (CIK)
 746  * Returns 0 on success, error on failure.
 747  */
 748 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
 749 {
 750
 751         /* No doorbell on SI hardware generation */
 752         if (adev->asic_type < CHIP_BONAIRE) {
 753                 adev->doorbell.base = 0;
 754                 adev->doorbell.size = 0;
 755                 adev->doorbell.num_doorbells = 0;
 756                 adev->doorbell.ptr = NULL;
 757                 return 0;
 758         }
 759
 760         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
 761                 return -EINVAL;
 762
 763         amdgpu_asic_init_doorbell_index(adev);
 764
 765         /* doorbell bar mapping */
 766         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
 767         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
 768
 769         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
 770                                              adev->doorbell_index.max_assignment+1);
 771         if (adev->doorbell.num_doorbells == 0)
 772                 return -EINVAL;
 773
 774         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
 775          * paging queue doorbell use the second page. The
 776          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
 777          * doorbells are in the first page. So with paging queue enabled,
 778          * the max num_doorbells should + 1 page (0x400 in dword)
 779          */
 780         if (adev->asic_type >= CHIP_VEGA10)
 781                 adev->doorbell.num_doorbells += 0x400;
 782
 783         adev->doorbell.ptr = ioremap(adev->doorbell.base,
 784                                      adev->doorbell.num_doorbells *
 785                                      sizeof(u32));
 786         if (adev->doorbell.ptr == NULL)
 787                 return -ENOMEM;
 788
 789         return 0;
 790 }
 791
 792 /**
 793  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
 794  *
 795  * @adev: amdgpu_device pointer
 796  *
 797  * Tear down doorbell driver information (CIK)
 798  */
 799 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
 800 {
 801         iounmap(adev->doorbell.ptr);
 802         adev->doorbell.ptr = NULL;
 803 }
 804
 805
 806
 807 /*
 808  * amdgpu_device_wb_*()
 809  * Writeback is the method by which the GPU updates special pages in memory
 810  * with the status of certain GPU events (fences, ring pointers,etc.).
 811  */
 812
 813 /**
 814  * amdgpu_device_wb_fini - Disable Writeback and free memory
 815  *
 816  * @adev: amdgpu_device pointer
 817  *
 818  * Disables Writeback and frees the Writeback memory (all asics).
 819  * Used at driver shutdown.
 820  */
 821 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
 822 {
 823         if (adev->wb.wb_obj) {
 824                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
 825                                       &adev->wb.gpu_addr,
 826                                       (void **)&adev->wb.wb);
 827                 adev->wb.wb_obj = NULL;
 828         }
 829 }
 830
 831 /**
 832  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
 833  *
 834  * @adev: amdgpu_device pointer
 835  *
 836  * Initializes writeback and allocates writeback memory (all asics).
 837  * Used at driver startup.
 838  * Returns 0 on success or an -error on failure.
 839  */
 840 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
 841 {
 842         int r;
 843
 844         if (adev->wb.wb_obj == NULL) {
 845                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
 846                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
 847                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
 848                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
 849                                             (void **)&adev->wb.wb);
 850                 if (r) {
 851                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
 852                         return r;
 853                 }
 854
 855                 adev->wb.num_wb = AMDGPU_MAX_WB;
 856                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
 857
 858                 /* clear wb memory */
 859                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
 860         }
 861
 862         return 0;
 863 }
 864
 865 /**
 866  * amdgpu_device_wb_get - Allocate a wb entry
 867  *
 868  * @adev: amdgpu_device pointer
 869  * @wb: wb index
 870  *
 871  * Allocate a wb slot for use by the driver (all asics).
 872  * Returns 0 on success or -EINVAL on failure.
 873  */
 874 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
 875 {
 876         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
 877
 878         if (offset < adev->wb.num_wb) {
 879                 __set_bit(offset, adev->wb.used);
 880                 *wb = offset << 3; /* convert to dw offset */
 881                 return 0;
 882         } else {
 883                 return -EINVAL;
 884         }
 885 }
 886
 887 /**
 888  * amdgpu_device_wb_free - Free a wb entry
 889  *
 890  * @adev: amdgpu_device pointer
 891  * @wb: wb index
 892  *
 893  * Free a wb slot allocated for use by the driver (all asics)
 894  */
 895 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
 896 {
 897         wb >>= 3;
 898         if (wb < adev->wb.num_wb)
 899                 __clear_bit(wb, adev->wb.used);
 900 }
 901
 902 /**
 903  * amdgpu_device_resize_fb_bar - try to resize FB BAR
 904  *
 905  * @adev: amdgpu_device pointer
 906  *
 907  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
  908  * to fail, but if any of the BARs is not accessible after the resize we abort
 909  * driver loading by returning -ENODEV.
 910  */
 911 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
 912 {
 913         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
 914         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
 915         struct pci_bus *root;
 916         struct resource *res;
 917         unsigned i;
 918         u16 cmd;
 919         int r;
 920
 921         /* Bypass for VF */
 922         if (amdgpu_sriov_vf(adev))
 923                 return 0;
 924
 925         /* skip if the bios has already enabled large BAR */
 926         if (adev->gmc.real_vram_size &&
 927             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
 928                 return 0;
 929
 930         /* Check if the root BUS has 64bit memory resources */
 931         root = adev->pdev->bus;
 932         while (root->parent)
 933                 root = root->parent;
 934
 935         pci_bus_for_each_resource(root, res, i) {
 936                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
 937                     res->start > 0x100000000ull)
 938                         break;
 939         }
 940
 941         /* Trying to resize is pointless without a root hub window above 4GB */
 942         if (!res)
 943                 return 0;
 944
 945         /* Disable memory decoding while we change the BAR addresses and size */
 946         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
 947         pci_write_config_word(adev->pdev, PCI_COMMAND,
 948                               cmd & ~PCI_COMMAND_MEMORY);
 949
 950         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
 951         amdgpu_device_doorbell_fini(adev);
 952         if (adev->asic_type >= CHIP_BONAIRE)
 953                 pci_release_resource(adev->pdev, 2);
 954
 955         pci_release_resource(adev->pdev, 0);
 956
 957         r = pci_resize_resource(adev->pdev, 0, rbar_size);
 958         if (r == -ENOSPC)
 959                 DRM_INFO("Not enough PCI address space for a large BAR.");
 960         else if (r && r != -ENOTSUPP)
 961                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
 962
 963         pci_assign_unassigned_bus_resources(adev->pdev->bus);
 964
 965         /* When the doorbell or fb BAR isn't available we have no chance of
 966          * using the device.
 967          */
 968         r = amdgpu_device_doorbell_init(adev);
 969         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
 970                 return -ENODEV;
 971
 972         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
 973
 974         return 0;
 975 }
 976
/*
 * GPU helper functions.
 */
 980 /**
 981  * amdgpu_device_need_post - check if the hw need post or not
 982  *
 983  * @adev: amdgpu_device pointer
 984  *
 985  * Check if the asic has been initialized (all asics) at driver startup
 986  * or post is needed if  hw reset is performed.
 987  * Returns true if need or false if not.
 988  */
 989 bool amdgpu_device_need_post(struct amdgpu_device *adev)
 990 {
 991         uint32_t reg;
 992
 993         if (amdgpu_sriov_vf(adev))
 994                 return false;
 995
 996         if (amdgpu_passthrough(adev)) {
 997                 /* for FIJI: In whole GPU pass-through virtualization case, after VM reboot
 998                  * some old smc fw still need driver do vPost otherwise gpu hang, while
 999                  * those smc fw version above 22.15 doesn't have this flaw, so we force
1000                  * vpost executed for smc version below 22.15
1001                  */
1002                 if (adev->asic_type == CHIP_FIJI) {
1003                         int err;
1004                         uint32_t fw_ver;
1005                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
1006                         /* force vPost if error occured */
1007                         if (err)
1008                                 return true;
1009
1010                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
1011                         if (fw_ver < 0x00160e00)
1012                                 return true;
1013                 }
1014         }
1015
1016         if (adev->has_hw_reset) {
1017                 adev->has_hw_reset = false;
1018                 return true;
1019         }
1020
1021         /* bios scratch used on CIK+ */
1022         if (adev->asic_type >= CHIP_BONAIRE)
1023                 return amdgpu_atombios_scratch_need_asic_init(adev);
1024
1025         /* check MEM_SIZE for older asics */
1026         reg = amdgpu_asic_get_config_memsize(adev);
1027
1028         if ((reg != 0) && (reg != 0xffffffff))
1029                 return false;
1030
1031         return true;
1032 }
1033
1034 /* if we get transitioned to only one device, take VGA back */
1035 /**
1036  * amdgpu_device_vga_set_decode - enable/disable vga decode
1037  *
1038  * @cookie: amdgpu_device pointer
1039  * @state: enable/disable vga decode
1040  *
1041  * Enable/disable vga decode (all asics).
1042  * Returns VGA resource flags.
1043  */
1044 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1045 {
1046         struct amdgpu_device *adev = cookie;
1047         amdgpu_asic_set_vga_state(adev, state);
1048         if (state)
1049                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1050                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1051         else
1052                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1053 }
1054
1055 /**
1056  * amdgpu_device_check_block_size - validate the vm block size
1057  *
1058  * @adev: amdgpu_device pointer
1059  *
1060  * Validates the vm block size specified via module parameter.
1061  * The vm block size defines number of bits in page table versus page directory,
1062  * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1063  * page table and the remaining bits are in the page directory.
1064  */
1065 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1066 {
1067         /* defines number of bits in page table versus page directory,
1068          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1069          * page table and the remaining bits are in the page directory */
1070         if (amdgpu_vm_block_size == -1)
1071                 return;
1072
1073         if (amdgpu_vm_block_size < 9) {
1074                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1075                          amdgpu_vm_block_size);
1076                 amdgpu_vm_block_size = -1;
1077         }
1078 }
1079
1080 /**
1081  * amdgpu_device_check_vm_size - validate the vm size
1082  *
1083  * @adev: amdgpu_device pointer
1084  *
1085  * Validates the vm size in GB specified via module parameter.
1086  * The VM size is the size of the GPU virtual memory space in GB.
1087  */
1088 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1089 {
1090         /* no need to check the default value */
1091         if (amdgpu_vm_size == -1)
1092                 return;
1093
1094         if (amdgpu_vm_size < 1) {
1095                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1096                          amdgpu_vm_size);
1097                 amdgpu_vm_size = -1;
1098         }
1099 }
1100
1101 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1102 {
1103         struct sysinfo si;
1104         bool is_os_64 = (sizeof(void *) == 8);
1105         uint64_t total_memory;
1106         uint64_t dram_size_seven_GB = 0x1B8000000;
1107         uint64_t dram_size_three_GB = 0xB8000000;
1108
1109         if (amdgpu_smu_memory_pool_size == 0)
1110                 return;
1111
1112         if (!is_os_64) {
1113                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1114                 goto def_value;
1115         }
1116         si_meminfo(&si);
1117         total_memory = (uint64_t)si.totalram * si.mem_unit;
1118
1119         if ((amdgpu_smu_memory_pool_size == 1) ||
1120                 (amdgpu_smu_memory_pool_size == 2)) {
1121                 if (total_memory < dram_size_three_GB)
1122                         goto def_value1;
1123         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1124                 (amdgpu_smu_memory_pool_size == 8)) {
1125                 if (total_memory < dram_size_seven_GB)
1126                         goto def_value1;
1127         } else {
1128                 DRM_WARN("Smu memory pool size not supported\n");
1129                 goto def_value;
1130         }
1131         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1132
1133         return;
1134
1135 def_value1:
1136         DRM_WARN("No enough system memory\n");
1137 def_value:
1138         adev->pm.smu_prv_buffer_size = 0;
1139 }
1140
1141 /**
1142  * amdgpu_device_check_arguments - validate module params
1143  *
1144  * @adev: amdgpu_device pointer
1145  *
1146  * Validates certain module parameters and updates
1147  * the associated values used by the driver (all asics).
1148  */
1149 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1150 {
1151         if (amdgpu_sched_jobs < 4) {
1152                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1153                          amdgpu_sched_jobs);
1154                 amdgpu_sched_jobs = 4;
1155         } else if (!is_power_of_2(amdgpu_sched_jobs)){
1156                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1157                          amdgpu_sched_jobs);
1158                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1159         }
1160
1161         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1162                 /* gart size must be greater or equal to 32M */
1163                 dev_warn(adev->dev, "gart size (%d) too small\n",
1164                          amdgpu_gart_size);
1165                 amdgpu_gart_size = -1;
1166         }
1167
1168         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1169                 /* gtt size must be greater or equal to 32M */
1170                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1171                                  amdgpu_gtt_size);
1172                 amdgpu_gtt_size = -1;
1173         }
1174
1175         /* valid range is between 4 and 9 inclusive */
1176         if (amdgpu_vm_fragment_size != -1 &&
1177             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1178                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1179                 amdgpu_vm_fragment_size = -1;
1180         }
1181
1182         if (amdgpu_sched_hw_submission < 2) {
1183                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1184                          amdgpu_sched_hw_submission);
1185                 amdgpu_sched_hw_submission = 2;
1186         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1187                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1188                          amdgpu_sched_hw_submission);
1189                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1190         }
1191
1192         amdgpu_device_check_smu_prv_buffer_size(adev);
1193
1194         amdgpu_device_check_vm_size(adev);
1195
1196         amdgpu_device_check_block_size(adev);
1197
1198         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1199
1200         amdgpu_gmc_tmz_set(adev);
1201
1202         return 0;
1203 }
1204
1205 /**
1206  * amdgpu_switcheroo_set_state - set switcheroo state
1207  *
1208  * @pdev: pci dev pointer
1209  * @state: vga_switcheroo state
1210  *
1211  * Callback for the switcheroo driver.  Suspends or resumes the
1212  * the asics before or after it is powered up using ACPI methods.
1213  */
1214 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1215 {
1216         struct drm_device *dev = pci_get_drvdata(pdev);
1217         int r;
1218
1219         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1220                 return;
1221
1222         if (state == VGA_SWITCHEROO_ON) {
1223                 pr_info("switched on\n");
1224                 /* don't suspend or resume card normally */
1225                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1226
1227                 pci_set_power_state(dev->pdev, PCI_D0);
1228                 pci_restore_state(dev->pdev);
1229                 r = pci_enable_device(dev->pdev);
1230                 if (r)
1231                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1232                 amdgpu_device_resume(dev, true);
1233
1234                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1235                 drm_kms_helper_poll_enable(dev);
1236         } else {
1237                 pr_info("switched off\n");
1238                 drm_kms_helper_poll_disable(dev);
1239                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1240                 amdgpu_device_suspend(dev, true);
1241                 pci_save_state(dev->pdev);
1242                 /* Shut down the device */
1243                 pci_disable_device(dev->pdev);
1244                 pci_set_power_state(dev->pdev, PCI_D3cold);
1245                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1246         }
1247 }
1248
1249 /**
1250  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1251  *
1252  * @pdev: pci dev pointer
1253  *
1254  * Callback for the switcheroo driver.  Check of the switcheroo
1255  * state can be changed.
1256  * Returns true if the state can be changed, false if not.
1257  */
1258 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1259 {
1260         struct drm_device *dev = pci_get_drvdata(pdev);
1261
1262         /*
1263         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1264         * locking inversion with the driver load path. And the access here is
1265         * completely racy anyway. So don't bother with locking for now.
1266         */
1267         return atomic_read(&dev->open_count) == 0;
1268 }
1269
1270 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1271         .set_gpu_state = amdgpu_switcheroo_set_state,
1272         .reprobe = NULL,
1273         .can_switch = amdgpu_switcheroo_can_switch,
1274 };
1275
1276 /**
1277  * amdgpu_device_ip_set_clockgating_state - set the CG state
1278  *
1279  * @dev: amdgpu_device pointer
1280  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1281  * @state: clockgating state (gate or ungate)
1282  *
1283  * Sets the requested clockgating state for all instances of
1284  * the hardware IP specified.
1285  * Returns the error code from the last instance.
1286  */
1287 int amdgpu_device_ip_set_clockgating_state(void *dev,
1288                                            enum amd_ip_block_type block_type,
1289                                            enum amd_clockgating_state state)
1290 {
1291         struct amdgpu_device *adev = dev;
1292         int i, r = 0;
1293
1294         for (i = 0; i < adev->num_ip_blocks; i++) {
1295                 if (!adev->ip_blocks[i].status.valid)
1296                         continue;
1297                 if (adev->ip_blocks[i].version->type != block_type)
1298                         continue;
1299                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1300                         continue;
1301                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1302                         (void *)adev, state);
1303                 if (r)
1304                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1305                                   adev->ip_blocks[i].version->funcs->name, r);
1306         }
1307         return r;
1308 }
1309
1310 /**
1311  * amdgpu_device_ip_set_powergating_state - set the PG state
1312  *
1313  * @dev: amdgpu_device pointer
1314  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1315  * @state: powergating state (gate or ungate)
1316  *
1317  * Sets the requested powergating state for all instances of
1318  * the hardware IP specified.
1319  * Returns the error code from the last instance.
1320  */
1321 int amdgpu_device_ip_set_powergating_state(void *dev,
1322                                            enum amd_ip_block_type block_type,
1323                                            enum amd_powergating_state state)
1324 {
1325         struct amdgpu_device *adev = dev;
1326         int i, r = 0;
1327
1328         for (i = 0; i < adev->num_ip_blocks; i++) {
1329                 if (!adev->ip_blocks[i].status.valid)
1330                         continue;
1331                 if (adev->ip_blocks[i].version->type != block_type)
1332                         continue;
1333                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1334                         continue;
1335                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1336                         (void *)adev, state);
1337                 if (r)
1338                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1339                                   adev->ip_blocks[i].version->funcs->name, r);
1340         }
1341         return r;
1342 }
1343
1344 /**
1345  * amdgpu_device_ip_get_clockgating_state - get the CG state
1346  *
1347  * @adev: amdgpu_device pointer
1348  * @flags: clockgating feature flags
1349  *
1350  * Walks the list of IPs on the device and updates the clockgating
1351  * flags for each IP.
1352  * Updates @flags with the feature flags for each hardware IP where
1353  * clockgating is enabled.
1354  */
1355 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1356                                             u32 *flags)
1357 {
1358         int i;
1359
1360         for (i = 0; i < adev->num_ip_blocks; i++) {
1361                 if (!adev->ip_blocks[i].status.valid)
1362                         continue;
1363                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1364                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1365         }
1366 }
1367
1368 /**
1369  * amdgpu_device_ip_wait_for_idle - wait for idle
1370  *
1371  * @adev: amdgpu_device pointer
1372  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1373  *
1374  * Waits for the request hardware IP to be idle.
1375  * Returns 0 for success or a negative error code on failure.
1376  */
1377 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1378                                    enum amd_ip_block_type block_type)
1379 {
1380         int i, r;
1381
1382         for (i = 0; i < adev->num_ip_blocks; i++) {
1383                 if (!adev->ip_blocks[i].status.valid)
1384                         continue;
1385                 if (adev->ip_blocks[i].version->type == block_type) {
1386                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1387                         if (r)
1388                                 return r;
1389                         break;
1390                 }
1391         }
1392         return 0;
1393
1394 }
1395
1396 /**
1397  * amdgpu_device_ip_is_idle - is the hardware IP idle
1398  *
1399  * @adev: amdgpu_device pointer
1400  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1401  *
1402  * Check if the hardware IP is idle or not.
1403  * Returns true if it the IP is idle, false if not.
1404  */
1405 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1406                               enum amd_ip_block_type block_type)
1407 {
1408         int i;
1409
1410         for (i = 0; i < adev->num_ip_blocks; i++) {
1411                 if (!adev->ip_blocks[i].status.valid)
1412                         continue;
1413                 if (adev->ip_blocks[i].version->type == block_type)
1414                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1415         }
1416         return true;
1417
1418 }
1419
1420 /**
1421  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1422  *
1423  * @adev: amdgpu_device pointer
1424  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1425  *
1426  * Returns a pointer to the hardware IP block structure
1427  * if it exists for the asic, otherwise NULL.
1428  */
1429 struct amdgpu_ip_block *
1430 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1431                               enum amd_ip_block_type type)
1432 {
1433         int i;
1434
1435         for (i = 0; i < adev->num_ip_blocks; i++)
1436                 if (adev->ip_blocks[i].version->type == type)
1437                         return &adev->ip_blocks[i];
1438
1439         return NULL;
1440 }
1441
1442 /**
1443  * amdgpu_device_ip_block_version_cmp
1444  *
1445  * @adev: amdgpu_device pointer
1446  * @type: enum amd_ip_block_type
1447  * @major: major version
1448  * @minor: minor version
1449  *
1450  * return 0 if equal or greater
1451  * return 1 if smaller or the ip_block doesn't exist
1452  */
1453 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1454                                        enum amd_ip_block_type type,
1455                                        u32 major, u32 minor)
1456 {
1457         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1458
1459         if (ip_block && ((ip_block->version->major > major) ||
1460                         ((ip_block->version->major == major) &&
1461                         (ip_block->version->minor >= minor))))
1462                 return 0;
1463
1464         return 1;
1465 }
1466
1467 /**
1468  * amdgpu_device_ip_block_add
1469  *
1470  * @adev: amdgpu_device pointer
1471  * @ip_block_version: pointer to the IP to add
1472  *
1473  * Adds the IP block driver information to the collection of IPs
1474  * on the asic.
1475  */
1476 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1477                                const struct amdgpu_ip_block_version *ip_block_version)
1478 {
1479         if (!ip_block_version)
1480                 return -EINVAL;
1481
1482         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1483                   ip_block_version->funcs->name);
1484
1485         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1486
1487         return 0;
1488 }
1489
1490 /**
1491  * amdgpu_device_enable_virtual_display - enable virtual display feature
1492  *
1493  * @adev: amdgpu_device pointer
1494  *
1495  * Enabled the virtual display feature if the user has enabled it via
1496  * the module parameter virtual_display.  This feature provides a virtual
1497  * display hardware on headless boards or in virtualized environments.
1498  * This function parses and validates the configuration string specified by
1499  * the user and configues the virtual display configuration (number of
1500  * virtual connectors, crtcs, etc.) specified.
1501  */
1502 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1503 {
1504         adev->enable_virtual_display = false;
1505
1506         if (amdgpu_virtual_display) {
1507                 struct drm_device *ddev = adev->ddev;
1508                 const char *pci_address_name = pci_name(ddev->pdev);
1509                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1510
1511                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1512                 pciaddstr_tmp = pciaddstr;
1513                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1514                         pciaddname = strsep(&pciaddname_tmp, ",");
1515                         if (!strcmp("all", pciaddname)
1516                             || !strcmp(pci_address_name, pciaddname)) {
1517                                 long num_crtc;
1518                                 int res = -1;
1519
1520                                 adev->enable_virtual_display = true;
1521
1522                                 if (pciaddname_tmp)
1523                                         res = kstrtol(pciaddname_tmp, 10,
1524                                                       &num_crtc);
1525
1526                                 if (!res) {
1527                                         if (num_crtc < 1)
1528                                                 num_crtc = 1;
1529                                         if (num_crtc > 6)
1530                                                 num_crtc = 6;
1531                                         adev->mode_info.num_crtc = num_crtc;
1532                                 } else {
1533                                         adev->mode_info.num_crtc = 1;
1534                                 }
1535                                 break;
1536                         }
1537                 }
1538
1539                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1540                          amdgpu_virtual_display, pci_address_name,
1541                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1542
1543                 kfree(pciaddstr);
1544         }
1545 }
1546
1547 /**
1548  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1549  *
1550  * @adev: amdgpu_device pointer
1551  *
1552  * Parses the asic configuration parameters specified in the gpu info
1553  * firmware and makes them availale to the driver for use in configuring
1554  * the asic.
1555  * Returns 0 on success, -EINVAL on failure.
1556  */
1557 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1558 {
1559         const char *chip_name;
1560         char fw_name[40];
1561         int err;
1562         const struct gpu_info_firmware_header_v1_0 *hdr;
1563
1564         adev->firmware.gpu_info_fw = NULL;
1565
1566         if (adev->discovery_bin) {
1567                 amdgpu_discovery_get_gfx_info(adev);
1568
1569                 /*
1570                  * FIXME: The bounding box is still needed by Navi12, so
1571                  * temporarily read it from gpu_info firmware. Should be droped
1572                  * when DAL no longer needs it.
1573                  */
1574                 if (adev->asic_type != CHIP_NAVI12)
1575                         return 0;
1576         }
1577
1578         switch (adev->asic_type) {
1579 #ifdef CONFIG_DRM_AMDGPU_SI
1580         case CHIP_VERDE:
1581         case CHIP_TAHITI:
1582         case CHIP_PITCAIRN:
1583         case CHIP_OLAND:
1584         case CHIP_HAINAN:
1585 #endif
1586 #ifdef CONFIG_DRM_AMDGPU_CIK
1587         case CHIP_BONAIRE:
1588         case CHIP_HAWAII:
1589         case CHIP_KAVERI:
1590         case CHIP_KABINI:
1591         case CHIP_MULLINS:
1592 #endif
1593         case CHIP_TOPAZ:
1594         case CHIP_TONGA:
1595         case CHIP_FIJI:
1596         case CHIP_POLARIS10:
1597         case CHIP_POLARIS11:
1598         case CHIP_POLARIS12:
1599         case CHIP_VEGAM:
1600         case CHIP_CARRIZO:
1601         case CHIP_STONEY:
1602         case CHIP_VEGA20:
1603         default:
1604                 return 0;
1605         case CHIP_VEGA10:
1606                 chip_name = "vega10";
1607                 break;
1608         case CHIP_VEGA12:
1609                 chip_name = "vega12";
1610                 break;
1611         case CHIP_RAVEN:
1612                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1613                         chip_name = "raven2";
1614                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1615                         chip_name = "picasso";
1616                 else
1617                         chip_name = "raven";
1618                 break;
1619         case CHIP_ARCTURUS:
1620                 chip_name = "arcturus";
1621                 break;
1622         case CHIP_RENOIR:
1623                 chip_name = "renoir";
1624                 break;
1625         case CHIP_NAVI10:
1626                 chip_name = "navi10";
1627                 break;
1628         case CHIP_NAVI14:
1629                 chip_name = "navi14";
1630                 break;
1631         case CHIP_NAVI12:
1632                 chip_name = "navi12";
1633                 break;
1634         case CHIP_SIENNA_CICHLID:
1635                 chip_name = "sienna_cichlid";
1636                 break;
1637         case CHIP_NAVY_FLOUNDER:
1638                 chip_name = "navy_flounder";
1639                 break;
1640         }
1641
1642         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1643         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1644         if (err) {
1645                 dev_err(adev->dev,
1646                         "Failed to load gpu_info firmware \"%s\"\n",
1647                         fw_name);
1648                 goto out;
1649         }
1650         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1651         if (err) {
1652                 dev_err(adev->dev,
1653                         "Failed to validate gpu_info firmware \"%s\"\n",
1654                         fw_name);
1655                 goto out;
1656         }
1657
1658         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1659         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1660
1661         switch (hdr->version_major) {
1662         case 1:
1663         {
1664                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1665                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1666                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1667
1668                 /*
1669                  * Should be droped when DAL no longer needs it.
1670                  */
1671                 if (adev->asic_type == CHIP_NAVI12)
1672                         goto parse_soc_bounding_box;
1673
1674                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1675                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1676                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1677                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1678                 adev->gfx.config.max_texture_channel_caches =
1679                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1680                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1681                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1682                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1683                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1684                 adev->gfx.config.double_offchip_lds_buf =
1685                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1686                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1687                 adev->gfx.cu_info.max_waves_per_simd =
1688                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1689                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1690                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1691                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1692                 if (hdr->version_minor >= 1) {
1693                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1694                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1695                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1696                         adev->gfx.config.num_sc_per_sh =
1697                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1698                         adev->gfx.config.num_packer_per_sc =
1699                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1700                 }
1701
1702 parse_soc_bounding_box:
1703                 /*
1704                  * soc bounding box info is not integrated in disocovery table,
1705                  * we always need to parse it from gpu info firmware if needed.
1706                  */
1707                 if (hdr->version_minor == 2) {
1708                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1709                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1710                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1711                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1712                 }
1713                 break;
1714         }
1715         default:
1716                 dev_err(adev->dev,
1717                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1718                 err = -EINVAL;
1719                 goto out;
1720         }
1721 out:
1722         return err;
1723 }
1724
1725 /**
1726  * amdgpu_device_ip_early_init - run early init for hardware IPs
1727  *
1728  * @adev: amdgpu_device pointer
1729  *
1730  * Early initialization pass for hardware IPs.  The hardware IPs that make
1731  * up each asic are discovered each IP's early_init callback is run.  This
1732  * is the first stage in initializing the asic.
1733  * Returns 0 on success, negative error code on failure.
1734  */
1735 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1736 {
1737         int i, r;
1738
1739         amdgpu_device_enable_virtual_display(adev);
1740
1741         if (amdgpu_sriov_vf(adev)) {
1742                 r = amdgpu_virt_request_full_gpu(adev, true);
1743                 if (r)
1744                         return r;
1745         }
1746
1747         switch (adev->asic_type) {
1748 #ifdef CONFIG_DRM_AMDGPU_SI
1749         case CHIP_VERDE:
1750         case CHIP_TAHITI:
1751         case CHIP_PITCAIRN:
1752         case CHIP_OLAND:
1753         case CHIP_HAINAN:
1754                 adev->family = AMDGPU_FAMILY_SI;
1755                 r = si_set_ip_blocks(adev);
1756                 if (r)
1757                         return r;
1758                 break;
1759 #endif
1760 #ifdef CONFIG_DRM_AMDGPU_CIK
1761         case CHIP_BONAIRE:
1762         case CHIP_HAWAII:
1763         case CHIP_KAVERI:
1764         case CHIP_KABINI:
1765         case CHIP_MULLINS:
1766                 if (adev->flags & AMD_IS_APU)
1767                         adev->family = AMDGPU_FAMILY_KV;
1768                 else
1769                         adev->family = AMDGPU_FAMILY_CI;
1770
1771                 r = cik_set_ip_blocks(adev);
1772                 if (r)
1773                         return r;
1774                 break;
1775 #endif
1776         case CHIP_TOPAZ:
1777         case CHIP_TONGA:
1778         case CHIP_FIJI:
1779         case CHIP_POLARIS10:
1780         case CHIP_POLARIS11:
1781         case CHIP_POLARIS12:
1782         case CHIP_VEGAM:
1783         case CHIP_CARRIZO:
1784         case CHIP_STONEY:
1785                 if (adev->flags & AMD_IS_APU)
1786                         adev->family = AMDGPU_FAMILY_CZ;
1787                 else
1788                         adev->family = AMDGPU_FAMILY_VI;
1789
1790                 r = vi_set_ip_blocks(adev);
1791                 if (r)
1792                         return r;
1793                 break;
1794         case CHIP_VEGA10:
1795         case CHIP_VEGA12:
1796         case CHIP_VEGA20:
1797         case CHIP_RAVEN:
1798         case CHIP_ARCTURUS:
1799         case CHIP_RENOIR:
1800                 if (adev->flags & AMD_IS_APU)
1801                         adev->family = AMDGPU_FAMILY_RV;
1802                 else
1803                         adev->family = AMDGPU_FAMILY_AI;
1804
1805                 r = soc15_set_ip_blocks(adev);
1806                 if (r)
1807                         return r;
1808                 break;
1809         case  CHIP_NAVI10:
1810         case  CHIP_NAVI14:
1811         case  CHIP_NAVI12:
1812         case  CHIP_SIENNA_CICHLID:
1813         case  CHIP_NAVY_FLOUNDER:
1814                 adev->family = AMDGPU_FAMILY_NV;
1815
1816                 r = nv_set_ip_blocks(adev);
1817                 if (r)
1818                         return r;
1819                 break;
1820         default:
1821                 /* FIXME: not supported yet */
1822                 return -EINVAL;
1823         }
1824
1825         amdgpu_amdkfd_device_probe(adev);
1826
1827         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1828         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1829                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1830
1831         for (i = 0; i < adev->num_ip_blocks; i++) {
1832                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1833                         DRM_ERROR("disabled ip block: %d <%s>\n",
1834                                   i, adev->ip_blocks[i].version->funcs->name);
1835                         adev->ip_blocks[i].status.valid = false;
1836                 } else {
1837                         if (adev->ip_blocks[i].version->funcs->early_init) {
1838                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1839                                 if (r == -ENOENT) {
1840                                         adev->ip_blocks[i].status.valid = false;
1841                                 } else if (r) {
1842                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1843                                                   adev->ip_blocks[i].version->funcs->name, r);
1844                                         return r;
1845                                 } else {
1846                                         adev->ip_blocks[i].status.valid = true;
1847                                 }
1848                         } else {
1849                                 adev->ip_blocks[i].status.valid = true;
1850                         }
1851                 }
1852                 /* get the vbios after the asic_funcs are set up */
1853                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1854                         r = amdgpu_device_parse_gpu_info_fw(adev);
1855                         if (r)
1856                                 return r;
1857
1858                         /* Read BIOS */
1859                         if (!amdgpu_get_bios(adev))
1860                                 return -EINVAL;
1861
1862                         r = amdgpu_atombios_init(adev);
1863                         if (r) {
1864                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1865                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1866                                 return r;
1867                         }
1868                 }
1869         }
1870
1871         adev->cg_flags &= amdgpu_cg_mask;
1872         adev->pg_flags &= amdgpu_pg_mask;
1873
1874         return 0;
1875 }
1876
1877 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1878 {
1879         int i, r;
1880
1881         for (i = 0; i < adev->num_ip_blocks; i++) {
1882                 if (!adev->ip_blocks[i].status.sw)
1883                         continue;
1884                 if (adev->ip_blocks[i].status.hw)
1885                         continue;
1886                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1887                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1888                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1889                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1890                         if (r) {
1891                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1892                                           adev->ip_blocks[i].version->funcs->name, r);
1893                                 return r;
1894                         }
1895                         adev->ip_blocks[i].status.hw = true;
1896                 }
1897         }
1898
1899         return 0;
1900 }
1901
1902 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1903 {
1904         int i, r;
1905
1906         for (i = 0; i < adev->num_ip_blocks; i++) {
1907                 if (!adev->ip_blocks[i].status.sw)
1908                         continue;
1909                 if (adev->ip_blocks[i].status.hw)
1910                         continue;
1911                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1912                 if (r) {
1913                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1914                                   adev->ip_blocks[i].version->funcs->name, r);
1915                         return r;
1916                 }
1917                 adev->ip_blocks[i].status.hw = true;
1918         }
1919
1920         return 0;
1921 }
1922
1923 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1924 {
1925         int r = 0;
1926         int i;
1927         uint32_t smu_version;
1928
1929         if (adev->asic_type >= CHIP_VEGA10) {
1930                 for (i = 0; i < adev->num_ip_blocks; i++) {
1931                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1932                                 continue;
1933
1934                         /* no need to do the fw loading again if already done*/
1935                         if (adev->ip_blocks[i].status.hw == true)
1936                                 break;
1937
1938                         if (adev->in_gpu_reset || adev->in_suspend) {
1939                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
1940                                 if (r) {
1941                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
1942                                                           adev->ip_blocks[i].version->funcs->name, r);
1943                                         return r;
1944                                 }
1945                         } else {
1946                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1947                                 if (r) {
1948                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1949                                                           adev->ip_blocks[i].version->funcs->name, r);
1950                                         return r;
1951                                 }
1952                         }
1953
1954                         adev->ip_blocks[i].status.hw = true;
1955                         break;
1956                 }
1957         }
1958
1959         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1960                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1961
1962         return r;
1963 }
1964
1965 /**
1966  * amdgpu_device_ip_init - run init for hardware IPs
1967  *
1968  * @adev: amdgpu_device pointer
1969  *
1970  * Main initialization pass for hardware IPs.  The list of all the hardware
1971  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1972  * are run.  sw_init initializes the software state associated with each IP
1973  * and hw_init initializes the hardware associated with each IP.
1974  * Returns 0 on success, negative error code on failure.
1975  */
1976 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1977 {
1978         int i, r;
1979
1980         r = amdgpu_ras_init(adev);
1981         if (r)
1982                 return r;
1983
1984         for (i = 0; i < adev->num_ip_blocks; i++) {
1985                 if (!adev->ip_blocks[i].status.valid)
1986                         continue;
1987                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
1988                 if (r) {
1989                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
1990                                   adev->ip_blocks[i].version->funcs->name, r);
1991                         goto init_failed;
1992                 }
1993                 adev->ip_blocks[i].status.sw = true;
1994
1995                 /* need to do gmc hw init early so we can allocate gpu mem */
1996                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
1997                         r = amdgpu_device_vram_scratch_init(adev);
1998                         if (r) {
1999                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2000                                 goto init_failed;
2001                         }
2002                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2003                         if (r) {
2004                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2005                                 goto init_failed;
2006                         }
2007                         r = amdgpu_device_wb_init(adev);
2008                         if (r) {
2009                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2010                                 goto init_failed;
2011                         }
2012                         adev->ip_blocks[i].status.hw = true;
2013
2014                         /* right after GMC hw init, we create CSA */
2015                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2016                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2017                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2018                                                                 AMDGPU_CSA_SIZE);
2019                                 if (r) {
2020                                         DRM_ERROR("allocate CSA failed %d\n", r);
2021                                         goto init_failed;
2022                                 }
2023                         }
2024                 }
2025         }
2026
2027         if (amdgpu_sriov_vf(adev))
2028                 amdgpu_virt_init_data_exchange(adev);
2029
2030         r = amdgpu_ib_pool_init(adev);
2031         if (r) {
2032                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2033                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2034                 goto init_failed;
2035         }
2036
2037         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete*/
2038         if (r)
2039                 goto init_failed;
2040
2041         r = amdgpu_device_ip_hw_init_phase1(adev);
2042         if (r)
2043                 goto init_failed;
2044
2045         r = amdgpu_device_fw_loading(adev);
2046         if (r)
2047                 goto init_failed;
2048
2049         r = amdgpu_device_ip_hw_init_phase2(adev);
2050         if (r)
2051                 goto init_failed;
2052
2053         /*
2054          * retired pages will be loaded from eeprom and reserved here,
2055          * it should be called after amdgpu_device_ip_hw_init_phase2  since
2056          * for some ASICs the RAS EEPROM code relies on SMU fully functioning
2057          * for I2C communication which only true at this point.
2058          * recovery_init may fail, but it can free all resources allocated by
2059          * itself and its failure should not stop amdgpu init process.
2060          *
2061          * Note: theoretically, this should be called before all vram allocations
2062          * to protect retired page from abusing
2063          */
2064         amdgpu_ras_recovery_init(adev);
2065
2066         if (adev->gmc.xgmi.num_physical_nodes > 1)
2067                 amdgpu_xgmi_add_device(adev);
2068         amdgpu_amdkfd_device_init(adev);
2069
2070         amdgpu_fru_get_product_info(adev);
2071
2072 init_failed:
2073         if (amdgpu_sriov_vf(adev))
2074                 amdgpu_virt_release_full_gpu(adev, true);
2075
2076         return r;
2077 }
2078
2079 /**
2080  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2081  *
2082  * @adev: amdgpu_device pointer
2083  *
2084  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2085  * this function before a GPU reset.  If the value is retained after a
2086  * GPU reset, VRAM has not been lost.  Some GPU resets may destry VRAM contents.
2087  */
2088 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2089 {
2090         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2091 }
2092
2093 /**
2094  * amdgpu_device_check_vram_lost - check if vram is valid
2095  *
2096  * @adev: amdgpu_device pointer
2097  *
2098  * Checks the reset magic value written to the gart pointer in VRAM.
2099  * The driver calls this after a GPU reset to see if the contents of
2100  * VRAM is lost or now.
2101  * returns true if vram is lost, false if not.
2102  */
2103 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2104 {
2105         if (memcmp(adev->gart.ptr, adev->reset_magic,
2106                         AMDGPU_RESET_MAGIC_NUM))
2107                 return true;
2108
2109         if (!adev->in_gpu_reset)
2110                 return false;
2111
2112         /*
2113          * For all ASICs with baco/mode1 reset, the VRAM is
2114          * always assumed to be lost.
2115          */
2116         switch (amdgpu_asic_reset_method(adev)) {
2117         case AMD_RESET_METHOD_BACO:
2118         case AMD_RESET_METHOD_MODE1:
2119                 return true;
2120         default:
2121                 return false;
2122         }
2123 }
2124
2125 /**
2126  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2127  *
2128  * @adev: amdgpu_device pointer
2129  * @state: clockgating state (gate or ungate)
2130  *
2131  * The list of all the hardware IPs that make up the asic is walked and the
2132  * set_clockgating_state callbacks are run.
2133  * Late initialization pass enabling clockgating for hardware IPs.
2134  * Fini or suspend, pass disabling clockgating for hardware IPs.
2135  * Returns 0 on success, negative error code on failure.
2136  */
2137
2138 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2139                                                 enum amd_clockgating_state state)
2140 {
2141         int i, j, r;
2142
2143         if (amdgpu_emu_mode == 1)
2144                 return 0;
2145
2146         for (j = 0; j < adev->num_ip_blocks; j++) {
2147                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2148                 if (!adev->ip_blocks[i].status.late_initialized)
2149                         continue;
2150                 /* skip CG for VCE/UVD, it's handled specially */
2151                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2152                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2153                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2154                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2155                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2156                         /* enable clockgating to save power */
2157                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2158                                                                                      state);
2159                         if (r) {
2160                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2161                                           adev->ip_blocks[i].version->funcs->name, r);
2162                                 return r;
2163                         }
2164                 }
2165         }
2166
2167         return 0;
2168 }
2169
2170 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2171 {
2172         int i, j, r;
2173
2174         if (amdgpu_emu_mode == 1)
2175                 return 0;
2176
2177         for (j = 0; j < adev->num_ip_blocks; j++) {
2178                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2179                 if (!adev->ip_blocks[i].status.late_initialized)
2180                         continue;
2181                 /* skip CG for VCE/UVD, it's handled specially */
2182                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2183                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2184                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2185                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2186                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2187                         /* enable powergating to save power */
2188                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2189                                                                                         state);
2190                         if (r) {
2191                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2192                                           adev->ip_blocks[i].version->funcs->name, r);
2193                                 return r;
2194                         }
2195                 }
2196         }
2197         return 0;
2198 }
2199
2200 static int amdgpu_device_enable_mgpu_fan_boost(void)
2201 {
2202         struct amdgpu_gpu_instance *gpu_ins;
2203         struct amdgpu_device *adev;
2204         int i, ret = 0;
2205
2206         mutex_lock(&mgpu_info.mutex);
2207
2208         /*
2209          * MGPU fan boost feature should be enabled
2210          * only when there are two or more dGPUs in
2211          * the system
2212          */
2213         if (mgpu_info.num_dgpu < 2)
2214                 goto out;
2215
2216         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2217                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2218                 adev = gpu_ins->adev;
2219                 if (!(adev->flags & AMD_IS_APU) &&
2220                     !gpu_ins->mgpu_fan_enabled &&
2221                     adev->powerplay.pp_funcs &&
2222                     adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2223                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2224                         if (ret)
2225                                 break;
2226
2227                         gpu_ins->mgpu_fan_enabled = 1;
2228                 }
2229         }
2230
2231 out:
2232         mutex_unlock(&mgpu_info.mutex);
2233
2234         return ret;
2235 }
2236
2237 /**
2238  * amdgpu_device_ip_late_init - run late init for hardware IPs
2239  *
2240  * @adev: amdgpu_device pointer
2241  *
2242  * Late initialization pass for hardware IPs.  The list of all the hardware
2243  * IPs that make up the asic is walked and the late_init callbacks are run.
2244  * late_init covers any special initialization that an IP requires
2245  * after all of the have been initialized or something that needs to happen
2246  * late in the init process.
2247  * Returns 0 on success, negative error code on failure.
2248  */
2249 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2250 {
2251         struct amdgpu_gpu_instance *gpu_instance;
2252         int i = 0, r;
2253
2254         for (i = 0; i < adev->num_ip_blocks; i++) {
2255                 if (!adev->ip_blocks[i].status.hw)
2256                         continue;
2257                 if (adev->ip_blocks[i].version->funcs->late_init) {
2258                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2259                         if (r) {
2260                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2261                                           adev->ip_blocks[i].version->funcs->name, r);
2262                                 return r;
2263                         }
2264                 }
2265                 adev->ip_blocks[i].status.late_initialized = true;
2266         }
2267
2268         amdgpu_ras_set_error_query_ready(adev, true);
2269
2270         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2271         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2272
2273         amdgpu_device_fill_reset_magic(adev);
2274
2275         r = amdgpu_device_enable_mgpu_fan_boost();
2276         if (r)
2277                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2278
2279
2280         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2281                 mutex_lock(&mgpu_info.mutex);
2282
2283                 /*
2284                  * Reset device p-state to low as this was booted with high.
2285                  *
2286                  * This should be performed only after all devices from the same
2287                  * hive get initialized.
2288                  *
2289                  * However, it's unknown how many device in the hive in advance.
2290                  * As this is counted one by one during devices initializations.
2291                  *
2292                  * So, we wait for all XGMI interlinked devices initialized.
2293                  * This may bring some delays as those devices may come from
2294                  * different hives. But that should be OK.
2295                  */
2296                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2297                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2298                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2299                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2300                                         continue;
2301
2302                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2303                                                 AMDGPU_XGMI_PSTATE_MIN);
2304                                 if (r) {
2305                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2306                                         break;
2307                                 }
2308                         }
2309                 }
2310
2311                 mutex_unlock(&mgpu_info.mutex);
2312         }
2313
2314         return 0;
2315 }
2316
2317 /**
2318  * amdgpu_device_ip_fini - run fini for hardware IPs
2319  *
2320  * @adev: amdgpu_device pointer
2321  *
2322  * Main teardown pass for hardware IPs.  The list of all the hardware
2323  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2324  * are run.  hw_fini tears down the hardware associated with each IP
2325  * and sw_fini tears down any software state associated with each IP.
2326  * Returns 0 on success, negative error code on failure.
2327  */
2328 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2329 {
2330         int i, r;
2331
2332         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2333                 amdgpu_virt_release_ras_err_handler_data(adev);
2334
2335         amdgpu_ras_pre_fini(adev);
2336
2337         if (adev->gmc.xgmi.num_physical_nodes > 1)
2338                 amdgpu_xgmi_remove_device(adev);
2339
2340         amdgpu_amdkfd_device_fini(adev);
2341
2342         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2343         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2344
2345         /* need to disable SMC first */
2346         for (i = 0; i < adev->num_ip_blocks; i++) {
2347                 if (!adev->ip_blocks[i].status.hw)
2348                         continue;
2349                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2350                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2351                         /* XXX handle errors */
2352                         if (r) {
2353                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2354                                           adev->ip_blocks[i].version->funcs->name, r);
2355                         }
2356                         adev->ip_blocks[i].status.hw = false;
2357                         break;
2358                 }
2359         }
2360
2361         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2362                 if (!adev->ip_blocks[i].status.hw)
2363                         continue;
2364
2365                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2366                 /* XXX handle errors */
2367                 if (r) {
2368                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2369                                   adev->ip_blocks[i].version->funcs->name, r);
2370                 }
2371
2372                 adev->ip_blocks[i].status.hw = false;
2373         }
2374
2375
2376         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2377                 if (!adev->ip_blocks[i].status.sw)
2378                         continue;
2379
2380                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2381                         amdgpu_ucode_free_bo(adev);
2382                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2383                         amdgpu_device_wb_fini(adev);
2384                         amdgpu_device_vram_scratch_fini(adev);
2385                         amdgpu_ib_pool_fini(adev);
2386                 }
2387
2388                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2389                 /* XXX handle errors */
2390                 if (r) {
2391                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2392                                   adev->ip_blocks[i].version->funcs->name, r);
2393                 }
2394                 adev->ip_blocks[i].status.sw = false;
2395                 adev->ip_blocks[i].status.valid = false;
2396         }
2397
2398         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2399                 if (!adev->ip_blocks[i].status.late_initialized)
2400                         continue;
2401                 if (adev->ip_blocks[i].version->funcs->late_fini)
2402                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2403                 adev->ip_blocks[i].status.late_initialized = false;
2404         }
2405
2406         amdgpu_ras_fini(adev);
2407
2408         if (amdgpu_sriov_vf(adev))
2409                 if (amdgpu_virt_release_full_gpu(adev, false))
2410                         DRM_ERROR("failed to release exclusive mode on fini\n");
2411
2412         return 0;
2413 }
2414
2415 /**
2416  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2417  *
2418  * @work: work_struct.
2419  */
2420 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2421 {
2422         struct amdgpu_device *adev =
2423                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2424         int r;
2425
2426         r = amdgpu_ib_ring_tests(adev);
2427         if (r)
2428                 DRM_ERROR("ib ring test failed (%d).\n", r);
2429 }
2430
2431 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2432 {
2433         struct amdgpu_device *adev =
2434                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2435
2436         mutex_lock(&adev->gfx.gfx_off_mutex);
2437         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2438                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2439                         adev->gfx.gfx_off_state = true;
2440         }
2441         mutex_unlock(&adev->gfx.gfx_off_mutex);
2442 }
2443
2444 /**
2445  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2446  *
2447  * @adev: amdgpu_device pointer
2448  *
2449  * Main suspend function for hardware IPs.  The list of all the hardware
2450  * IPs that make up the asic is walked, clockgating is disabled and the
2451  * suspend callbacks are run.  suspend puts the hardware and software state
2452  * in each IP into a state suitable for suspend.
2453  * Returns 0 on success, negative error code on failure.
2454  */
2455 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2456 {
2457         int i, r;
2458
2459         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2460         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2461
2462         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2463                 if (!adev->ip_blocks[i].status.valid)
2464                         continue;
2465
2466                 /* displays are handled separately */
2467                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_DCE)
2468                         continue;
2469
2470                 /* XXX handle errors */
2471                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2472                 /* XXX handle errors */
2473                 if (r) {
2474                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2475                                   adev->ip_blocks[i].version->funcs->name, r);
2476                         return r;
2477                 }
2478
2479                 adev->ip_blocks[i].status.hw = false;
2480         }
2481
2482         return 0;
2483 }
2484
2485 /**
2486  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2487  *
2488  * @adev: amdgpu_device pointer
2489  *
2490  * Main suspend function for hardware IPs.  The list of all the hardware
2491  * IPs that make up the asic is walked, clockgating is disabled and the
2492  * suspend callbacks are run.  suspend puts the hardware and software state
2493  * in each IP into a state suitable for suspend.
2494  * Returns 0 on success, negative error code on failure.
2495  */
2496 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2497 {
2498         int i, r;
2499
2500         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2501                 if (!adev->ip_blocks[i].status.valid)
2502                         continue;
2503                 /* displays are handled in phase1 */
2504                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2505                         continue;
2506                 /* PSP lost connection when err_event_athub occurs */
2507                 if (amdgpu_ras_intr_triggered() &&
2508                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2509                         adev->ip_blocks[i].status.hw = false;
2510                         continue;
2511                 }
2512                 /* XXX handle errors */
2513                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2514                 /* XXX handle errors */
2515                 if (r) {
2516                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2517                                   adev->ip_blocks[i].version->funcs->name, r);
2518                 }
2519                 adev->ip_blocks[i].status.hw = false;
2520                 /* handle putting the SMC in the appropriate state */
2521                 if(!amdgpu_sriov_vf(adev)){
2522                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2523                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2524                                 if (r) {
2525                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2526                                                         adev->mp1_state, r);
2527                                         return r;
2528                                 }
2529                         }
2530                 }
2531                 adev->ip_blocks[i].status.hw = false;
2532         }
2533
2534         return 0;
2535 }
2536
2537 /**
2538  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2539  *
2540  * @adev: amdgpu_device pointer
2541  *
2542  * Main suspend function for hardware IPs.  The list of all the hardware
2543  * IPs that make up the asic is walked, clockgating is disabled and the
2544  * suspend callbacks are run.  suspend puts the hardware and software state
2545  * in each IP into a state suitable for suspend.
2546  * Returns 0 on success, negative error code on failure.
2547  */
2548 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2549 {
2550         int r;
2551
2552         if (amdgpu_sriov_vf(adev))
2553                 amdgpu_virt_request_full_gpu(adev, false);
2554
2555         r = amdgpu_device_ip_suspend_phase1(adev);
2556         if (r)
2557                 return r;
2558         r = amdgpu_device_ip_suspend_phase2(adev);
2559
2560         if (amdgpu_sriov_vf(adev))
2561                 amdgpu_virt_release_full_gpu(adev, false);
2562
2563         return r;
2564 }
2565
2566 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2567 {
2568         int i, r;
2569
2570         static enum amd_ip_block_type ip_order[] = {
2571                 AMD_IP_BLOCK_TYPE_GMC,
2572                 AMD_IP_BLOCK_TYPE_COMMON,
2573                 AMD_IP_BLOCK_TYPE_PSP,
2574                 AMD_IP_BLOCK_TYPE_IH,
2575         };
2576
2577         for (i = 0; i < adev->num_ip_blocks; i++)
2578                 adev->ip_blocks[i].status.hw = false;
2579
2580         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2581                 int j;
2582                 struct amdgpu_ip_block *block;
2583
2584                 for (j = 0; j < adev->num_ip_blocks; j++) {
2585                         block = &adev->ip_blocks[j];
2586
2587                         if (block->version->type != ip_order[i] ||
2588                                 !block->status.valid)
2589                                 continue;
2590
2591                         r = block->version->funcs->hw_init(adev);
2592                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2593                         if (r)
2594                                 return r;
2595                         block->status.hw = true;
2596                 }
2597         }
2598
2599         return 0;
2600 }
2601
2602 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2603 {
2604         int i, r;
2605
2606         static enum amd_ip_block_type ip_order[] = {
2607                 AMD_IP_BLOCK_TYPE_SMC,
2608                 AMD_IP_BLOCK_TYPE_DCE,
2609                 AMD_IP_BLOCK_TYPE_GFX,
2610                 AMD_IP_BLOCK_TYPE_SDMA,
2611                 AMD_IP_BLOCK_TYPE_UVD,
2612                 AMD_IP_BLOCK_TYPE_VCE,
2613                 AMD_IP_BLOCK_TYPE_VCN
2614         };
2615
2616         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2617                 int j;
2618                 struct amdgpu_ip_block *block;
2619
2620                 for (j = 0; j < adev->num_ip_blocks; j++) {
2621                         block = &adev->ip_blocks[j];
2622
2623                         if (block->version->type != ip_order[i] ||
2624                                 !block->status.valid ||
2625                                 block->status.hw)
2626                                 continue;
2627
2628                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2629                                 r = block->version->funcs->resume(adev);
2630                         else
2631                                 r = block->version->funcs->hw_init(adev);
2632
2633                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r?"failed":"succeeded");
2634                         if (r)
2635                                 return r;
2636                         block->status.hw = true;
2637                 }
2638         }
2639
2640         return 0;
2641 }
2642
2643 /**
2644  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2645  *
2646  * @adev: amdgpu_device pointer
2647  *
2648  * First resume function for hardware IPs.  The list of all the hardware
2649  * IPs that make up the asic is walked and the resume callbacks are run for
2650  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2651  * after a suspend and updates the software state as necessary.  This
2652  * function is also used for restoring the GPU after a GPU reset.
2653  * Returns 0 on success, negative error code on failure.
2654  */
2655 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2656 {
2657         int i, r;
2658
2659         for (i = 0; i < adev->num_ip_blocks; i++) {
2660                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2661                         continue;
2662                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2663                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2664                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2665
2666                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2667                         if (r) {
2668                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2669                                           adev->ip_blocks[i].version->funcs->name, r);
2670                                 return r;
2671                         }
2672                         adev->ip_blocks[i].status.hw = true;
2673                 }
2674         }
2675
2676         return 0;
2677 }
2678
2679 /**
2680  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2681  *
2682  * @adev: amdgpu_device pointer
2683  *
2684  * First resume function for hardware IPs.  The list of all the hardware
2685  * IPs that make up the asic is walked and the resume callbacks are run for
2686  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2687  * functional state after a suspend and updates the software state as
2688  * necessary.  This function is also used for restoring the GPU after a GPU
2689  * reset.
2690  * Returns 0 on success, negative error code on failure.
2691  */
2692 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2693 {
2694         int i, r;
2695
2696         for (i = 0; i < adev->num_ip_blocks; i++) {
2697                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2698                         continue;
2699                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2700                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2701                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2702                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2703                         continue;
2704                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2705                 if (r) {
2706                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2707                                   adev->ip_blocks[i].version->funcs->name, r);
2708                         return r;
2709                 }
2710                 adev->ip_blocks[i].status.hw = true;
2711         }
2712
2713         return 0;
2714 }
2715
/**
 * amdgpu_device_ip_resume - run resume for hardware IPs
 *
 * @adev: amdgpu_device pointer
 *
 * Main resume function for hardware IPs.  Resume is split into two phase
 * functions because they are also used when recovering from a GPU reset,
 * where additional steps need to be taken between them.  In this case
 * (S3/S4) they are run back to back, with firmware loading in between.
 * The sequence stops at the first step that fails.
 * Returns 0 on success, negative error code on failure.
 */
static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
{
	int r = amdgpu_device_ip_resume_phase1(adev);

	if (!r)
		r = amdgpu_device_fw_loading(adev);
	if (!r)
		r = amdgpu_device_ip_resume_phase2(adev);

	return r;
}
2744
2745 /**
2746  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2747  *
2748  * @adev: amdgpu_device pointer
2749  *
2750  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2751  */
2752 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2753 {
2754         if (amdgpu_sriov_vf(adev)) {
2755                 if (adev->is_atom_fw) {
2756                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2757                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2758                 } else {
2759                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2760                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2761                 }
2762
2763                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2764                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2765         }
2766 }
2767
2768 /**
2769  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2770  *
2771  * @asic_type: AMD asic type
2772  *
2773  * Check if there is DC (new modesetting infrastructre) support for an asic.
2774  * returns true if DC has support, false if not.
2775  */
2776 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2777 {
2778         switch (asic_type) {
2779 #if defined(CONFIG_DRM_AMD_DC)
2780         case CHIP_BONAIRE:
2781         case CHIP_KAVERI:
2782         case CHIP_KABINI:
2783         case CHIP_MULLINS:
2784                 /*
2785                  * We have systems in the wild with these ASICs that require
2786                  * LVDS and VGA support which is not supported with DC.
2787                  *
2788                  * Fallback to the non-DC driver here by default so as not to
2789                  * cause regressions.
2790                  */
2791                 return amdgpu_dc > 0;
2792         case CHIP_HAWAII:
2793         case CHIP_CARRIZO:
2794         case CHIP_STONEY:
2795         case CHIP_POLARIS10:
2796         case CHIP_POLARIS11:
2797         case CHIP_POLARIS12:
2798         case CHIP_VEGAM:
2799         case CHIP_TONGA:
2800         case CHIP_FIJI:
2801         case CHIP_VEGA10:
2802         case CHIP_VEGA12:
2803         case CHIP_VEGA20:
2804 #if defined(CONFIG_DRM_AMD_DC_DCN)
2805         case CHIP_RAVEN:
2806         case CHIP_NAVI10:
2807         case CHIP_NAVI14:
2808         case CHIP_NAVI12:
2809         case CHIP_RENOIR:
2810 #endif
2811 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2812         case CHIP_SIENNA_CICHLID:
2813         case CHIP_NAVY_FLOUNDER:
2814 #endif
2815                 return amdgpu_dc != 0;
2816 #endif
2817         default:
2818                 if (amdgpu_dc > 0)
2819                         DRM_INFO("Display Core has been requested via kernel parameter "
2820                                          "but isn't supported by ASIC, ignoring\n");
2821                 return false;
2822         }
2823 }
2824
2825 /**
2826  * amdgpu_device_has_dc_support - check if dc is supported
2827  *
2828  * @adev: amdgpu_device_pointer
2829  *
2830  * Returns true for supported, false for not supported
2831  */
2832 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2833 {
2834         if (amdgpu_sriov_vf(adev))
2835                 return false;
2836
2837         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2838 }
2839
2840
2841 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2842 {
2843         struct amdgpu_device *adev =
2844                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2845         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2846
2847         /* It's a bug to not have a hive within this function */
2848         if (WARN_ON(!hive))
2849                 return;
2850
2851         /*
2852          * Use task barrier to synchronize all xgmi reset works across the
2853          * hive. task_barrier_enter and task_barrier_exit will block
2854          * until all the threads running the xgmi reset works reach
2855          * those points. task_barrier_full will do both blocks.
2856          */
2857         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2858
2859                 task_barrier_enter(&hive->tb);
2860                 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2861
2862                 if (adev->asic_reset_res)
2863                         goto fail;
2864
2865                 task_barrier_exit(&hive->tb);
2866                 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2867
2868                 if (adev->asic_reset_res)
2869                         goto fail;
2870
2871                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2872                         adev->mmhub.funcs->reset_ras_error_count(adev);
2873         } else {
2874
2875                 task_barrier_full(&hive->tb);
2876                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2877         }
2878
2879 fail:
2880         if (adev->asic_reset_res)
2881                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2882                          adev->asic_reset_res, adev->ddev->unique);
2883 }
2884
2885 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2886 {
2887         char *input = amdgpu_lockup_timeout;
2888         char *timeout_setting = NULL;
2889         int index = 0;
2890         long timeout;
2891         int ret = 0;
2892
2893         /*
2894          * By default timeout for non compute jobs is 10000.
2895          * And there is no timeout enforced on compute jobs.
2896          * In SR-IOV or passthrough mode, timeout for compute
2897          * jobs are 60000 by default.
2898          */
2899         adev->gfx_timeout = msecs_to_jiffies(10000);
2900         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2901         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2902                 adev->compute_timeout =  msecs_to_jiffies(60000);
2903         else
2904                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2905
2906         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2907                 while ((timeout_setting = strsep(&input, ",")) &&
2908                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2909                         ret = kstrtol(timeout_setting, 0, &timeout);
2910                         if (ret)
2911                                 return ret;
2912
2913                         if (timeout == 0) {
2914                                 index++;
2915                                 continue;
2916                         } else if (timeout < 0) {
2917                                 timeout = MAX_SCHEDULE_TIMEOUT;
2918                         } else {
2919                                 timeout = msecs_to_jiffies(timeout);
2920                         }
2921
2922                         switch (index++) {
2923                         case 0:
2924                                 adev->gfx_timeout = timeout;
2925                                 break;
2926                         case 1:
2927                                 adev->compute_timeout = timeout;
2928                                 break;
2929                         case 2:
2930                                 adev->sdma_timeout = timeout;
2931                                 break;
2932                         case 3:
2933                                 adev->video_timeout = timeout;
2934                                 break;
2935                         default:
2936                                 break;
2937                         }
2938                 }
2939                 /*
2940                  * There is only one value specified and
2941                  * it should apply to all non-compute jobs.
2942                  */
2943                 if (index == 1) {
2944                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2945                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2946                                 adev->compute_timeout = adev->gfx_timeout;
2947                 }
2948         }
2949
2950         return ret;
2951 }
2952
2953 static const struct attribute *amdgpu_dev_attributes[] = {
2954         &dev_attr_product_name.attr,
2955         &dev_attr_product_number.attr,
2956         &dev_attr_serial_number.attr,
2957         &dev_attr_pcie_replay_count.attr,
2958         NULL
2959 };
2960
2961 /**
2962  * amdgpu_device_init - initialize the driver
2963  *
2964  * @adev: amdgpu_device pointer
2965  * @ddev: drm dev pointer
2966  * @pdev: pci dev pointer
2967  * @flags: driver flags
2968  *
2969  * Initializes the driver info and hw (all asics).
2970  * Returns 0 for success or an error on failure.
2971  * Called at driver startup.
2972  */
2973 int amdgpu_device_init(struct amdgpu_device *adev,
2974                        struct drm_device *ddev,
2975                        struct pci_dev *pdev,
2976                        uint32_t flags)
2977 {
2978         int r, i;
2979         bool boco = false;
2980         u32 max_MBps;
2981
2982         adev->shutdown = false;
2983         adev->dev = &pdev->dev;
2984         adev->ddev = ddev;
2985         adev->pdev = pdev;
2986         adev->flags = flags;
2987
2988         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2989                 adev->asic_type = amdgpu_force_asic_type;
2990         else
2991                 adev->asic_type = flags & AMD_ASIC_MASK;
2992
2993         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
2994         if (amdgpu_emu_mode == 1)
2995                 adev->usec_timeout *= 10;
2996         adev->gmc.gart_size = 512 * 1024 * 1024;
2997         adev->accel_working = false;
2998         adev->num_rings = 0;
2999         adev->mman.buffer_funcs = NULL;
3000         adev->mman.buffer_funcs_ring = NULL;
3001         adev->vm_manager.vm_pte_funcs = NULL;
3002         adev->vm_manager.vm_pte_num_scheds = 0;
3003         adev->gmc.gmc_funcs = NULL;
3004         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3005         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3006
3007         adev->smc_rreg = &amdgpu_invalid_rreg;
3008         adev->smc_wreg = &amdgpu_invalid_wreg;
3009         adev->pcie_rreg = &amdgpu_invalid_rreg;
3010         adev->pcie_wreg = &amdgpu_invalid_wreg;
3011         adev->pciep_rreg = &amdgpu_invalid_rreg;
3012         adev->pciep_wreg = &amdgpu_invalid_wreg;
3013         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3014         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3015         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3016         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3017         adev->didt_rreg = &amdgpu_invalid_rreg;
3018         adev->didt_wreg = &amdgpu_invalid_wreg;
3019         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3020         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3021         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3022         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3023
3024         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3025                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3026                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3027
3028         /* mutex initialization are all done here so we
3029          * can recall function without having locking issues */
3030         atomic_set(&adev->irq.ih.lock, 0);
3031         mutex_init(&adev->firmware.mutex);
3032         mutex_init(&adev->pm.mutex);
3033         mutex_init(&adev->gfx.gpu_clock_mutex);
3034         mutex_init(&adev->srbm_mutex);
3035         mutex_init(&adev->gfx.pipe_reserve_mutex);
3036         mutex_init(&adev->gfx.gfx_off_mutex);
3037         mutex_init(&adev->grbm_idx_mutex);
3038         mutex_init(&adev->mn_lock);
3039         mutex_init(&adev->virt.vf_errors.lock);
3040         hash_init(adev->mn_hash);
3041         mutex_init(&adev->lock_reset);
3042         mutex_init(&adev->psp.mutex);
3043         mutex_init(&adev->notifier_lock);
3044
3045         r = amdgpu_device_check_arguments(adev);
3046         if (r)
3047                 return r;
3048
3049         spin_lock_init(&adev->mmio_idx_lock);
3050         spin_lock_init(&adev->smc_idx_lock);
3051         spin_lock_init(&adev->pcie_idx_lock);
3052         spin_lock_init(&adev->uvd_ctx_idx_lock);
3053         spin_lock_init(&adev->didt_idx_lock);
3054         spin_lock_init(&adev->gc_cac_idx_lock);
3055         spin_lock_init(&adev->se_cac_idx_lock);
3056         spin_lock_init(&adev->audio_endpt_idx_lock);
3057         spin_lock_init(&adev->mm_stats.lock);
3058
3059         INIT_LIST_HEAD(&adev->shadow_list);
3060         mutex_init(&adev->shadow_list_lock);
3061
3062         INIT_DELAYED_WORK(&adev->delayed_init_work,
3063                           amdgpu_device_delayed_init_work_handler);
3064         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3065                           amdgpu_device_delay_enable_gfx_off);
3066
3067         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3068
3069         adev->gfx.gfx_off_req_count = 1;
3070         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3071
3072         atomic_set(&adev->throttling_logging_enabled, 1);
3073         /*
3074          * If throttling continues, logging will be performed every minute
3075          * to avoid log flooding. "-1" is subtracted since the thermal
3076          * throttling interrupt comes every second. Thus, the total logging
3077          * interval is 59 seconds(ratelimited printk interval) + 1(waiting
3078          * for throttling interrupt) = 60 seconds.
3079          */
3080         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3081         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3082
3083         /* Registers mapping */
3084         /* TODO: block userspace mapping of io register */
3085         if (adev->asic_type >= CHIP_BONAIRE) {
3086                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3087                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3088         } else {
3089                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3090                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3091         }
3092
3093         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3094         if (adev->rmmio == NULL) {
3095                 return -ENOMEM;
3096         }
3097         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3098         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3099
3100         /* io port mapping */
3101         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3102                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3103                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3104                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3105                         break;
3106                 }
3107         }
3108         if (adev->rio_mem == NULL)
3109                 DRM_INFO("PCI I/O BAR is not found.\n");
3110
3111         /* enable PCIE atomic ops */
3112         r = pci_enable_atomic_ops_to_root(adev->pdev,
3113                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3114                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3115         if (r) {
3116                 adev->have_atomics_support = false;
3117                 DRM_INFO("PCIE atomic ops is not supported\n");
3118         } else {
3119                 adev->have_atomics_support = true;
3120         }
3121
3122         amdgpu_device_get_pcie_info(adev);
3123
3124         if (amdgpu_mcbp)
3125                 DRM_INFO("MCBP is enabled\n");
3126
3127         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3128                 adev->enable_mes = true;
3129
3130         /* detect hw virtualization here */
3131         amdgpu_detect_virtualization(adev);
3132
3133         r = amdgpu_device_get_job_timeout_settings(adev);
3134         if (r) {
3135                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3136                 return r;
3137         }
3138
3139         /* early init functions */
3140         r = amdgpu_device_ip_early_init(adev);
3141         if (r)
3142                 return r;
3143
3144         /* doorbell bar mapping and doorbell index init*/
3145         amdgpu_device_doorbell_init(adev);
3146
3147         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3148         /* this will fail for cards that aren't VGA class devices, just
3149          * ignore it */
3150         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3151
3152         if (amdgpu_device_supports_boco(ddev))
3153                 boco = true;
3154         if (amdgpu_has_atpx() &&
3155             (amdgpu_is_atpx_hybrid() ||
3156              amdgpu_has_atpx_dgpu_power_cntl()) &&
3157             !pci_is_thunderbolt_attached(adev->pdev))
3158                 vga_switcheroo_register_client(adev->pdev,
3159                                                &amdgpu_switcheroo_ops, boco);
3160         if (boco)
3161                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3162
3163         if (amdgpu_emu_mode == 1) {
3164                 /* post the asic on emulation mode */
3165                 emu_soc_asic_init(adev);
3166                 goto fence_driver_init;
3167         }
3168
3169         /* detect if we are with an SRIOV vbios */
3170         amdgpu_device_detect_sriov_bios(adev);
3171
3172         /* check if we need to reset the asic
3173          *  E.g., driver was not cleanly unloaded previously, etc.
3174          */
3175         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3176                 r = amdgpu_asic_reset(adev);
3177                 if (r) {
3178                         dev_err(adev->dev, "asic reset on init failed\n");
3179                         goto failed;
3180                 }
3181         }
3182
3183         /* Post card if necessary */
3184         if (amdgpu_device_need_post(adev)) {
3185                 if (!adev->bios) {
3186                         dev_err(adev->dev, "no vBIOS found\n");
3187                         r = -EINVAL;
3188                         goto failed;
3189                 }
3190                 DRM_INFO("GPU posting now...\n");
3191                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3192                 if (r) {
3193                         dev_err(adev->dev, "gpu post error!\n");
3194                         goto failed;
3195                 }
3196         }
3197
3198         if (adev->is_atom_fw) {
3199                 /* Initialize clocks */
3200                 r = amdgpu_atomfirmware_get_clock_info(adev);
3201                 if (r) {
3202                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3203                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3204                         goto failed;
3205                 }
3206         } else {
3207                 /* Initialize clocks */
3208                 r = amdgpu_atombios_get_clock_info(adev);
3209                 if (r) {
3210                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3211                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3212                         goto failed;
3213                 }
3214                 /* init i2c buses */
3215                 if (!amdgpu_device_has_dc_support(adev))
3216                         amdgpu_atombios_i2c_init(adev);
3217         }
3218
3219 fence_driver_init:
3220         /* Fence driver */
3221         r = amdgpu_fence_driver_init(adev);
3222         if (r) {
3223                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3224                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3225                 goto failed;
3226         }
3227
3228         /* init the mode config */
3229         drm_mode_config_init(adev->ddev);
3230
3231         r = amdgpu_device_ip_init(adev);
3232         if (r) {
3233                 /* failed in exclusive mode due to timeout */
3234                 if (amdgpu_sriov_vf(adev) &&
3235                     !amdgpu_sriov_runtime(adev) &&
3236                     amdgpu_virt_mmio_blocked(adev) &&
3237                     !amdgpu_virt_wait_reset(adev)) {
3238                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3239                         /* Don't send request since VF is inactive. */
3240                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3241                         adev->virt.ops = NULL;
3242                         r = -EAGAIN;
3243                         goto failed;
3244                 }
3245                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3246                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3247                 goto failed;
3248         }
3249
3250         dev_info(adev->dev,
3251                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3252                         adev->gfx.config.max_shader_engines,
3253                         adev->gfx.config.max_sh_per_se,
3254                         adev->gfx.config.max_cu_per_sh,
3255                         adev->gfx.cu_info.number);
3256
3257         adev->accel_working = true;
3258
3259         amdgpu_vm_check_compute_bug(adev);
3260
3261         /* Initialize the buffer migration limit. */
3262         if (amdgpu_moverate >= 0)
3263                 max_MBps = amdgpu_moverate;
3264         else
3265                 max_MBps = 8; /* Allow 8 MB/s. */
3266         /* Get a log2 for easy divisions. */
3267         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3268
3269         amdgpu_fbdev_init(adev);
3270
3271         r = amdgpu_pm_sysfs_init(adev);
3272         if (r) {
3273                 adev->pm_sysfs_en = false;
3274                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3275         } else
3276                 adev->pm_sysfs_en = true;
3277
3278         r = amdgpu_ucode_sysfs_init(adev);
3279         if (r) {
3280                 adev->ucode_sysfs_en = false;
3281                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3282         } else
3283                 adev->ucode_sysfs_en = true;
3284
3285         if ((amdgpu_testing & 1)) {
3286                 if (adev->accel_working)
3287                         amdgpu_test_moves(adev);
3288                 else
3289                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3290         }
3291         if (amdgpu_benchmarking) {
3292                 if (adev->accel_working)
3293                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3294                 else
3295                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3296         }
3297
3298         /*
3299          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3300          * Otherwise the mgpu fan boost feature will be skipped due to the
3301          * gpu instance is counted less.
3302          */
3303         amdgpu_register_gpu_instance(adev);
3304
3305         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3306          * explicit gating rather than handling it automatically.
3307          */
3308         r = amdgpu_device_ip_late_init(adev);
3309         if (r) {
3310                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3311                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3312                 goto failed;
3313         }
3314
3315         /* must succeed. */
3316         amdgpu_ras_resume(adev);
3317
3318         queue_delayed_work(system_wq, &adev->delayed_init_work,
3319                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3320
3321         if (amdgpu_sriov_vf(adev))
3322                 flush_delayed_work(&adev->delayed_init_work);
3323
3324         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3325         if (r) {
3326                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3327                 return r;
3328         }
3329
3330         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3331                 r = amdgpu_pmu_init(adev);
3332         if (r)
3333                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3334
3335         return 0;
3336
3337 failed:
3338         amdgpu_vf_error_trans_all(adev);
3339         if (boco)
3340                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3341
3342         return r;
3343 }
3344
3345 /**
3346  * amdgpu_device_fini - tear down the driver
3347  *
3348  * @adev: amdgpu_device pointer
3349  *
3350  * Tear down the driver info (all asics).
3351  * Called at driver shutdown.
3352  */
3353 void amdgpu_device_fini(struct amdgpu_device *adev)
3354 {
3355         int r;
3356
3357         DRM_INFO("amdgpu: finishing device.\n");
3358         flush_delayed_work(&adev->delayed_init_work);
3359         adev->shutdown = true;
3360
3361         /* make sure IB test finished before entering exclusive mode
3362          * to avoid preemption on IB test
3363          * */
3364         if (amdgpu_sriov_vf(adev))
3365                 amdgpu_virt_request_full_gpu(adev, false);
3366
3367         /* disable all interrupts */
3368         amdgpu_irq_disable_all(adev);
3369         if (adev->mode_info.mode_config_initialized){
3370                 if (!amdgpu_device_has_dc_support(adev))
3371                         drm_helper_force_disable_all(adev->ddev);
3372                 else
3373                         drm_atomic_helper_shutdown(adev->ddev);
3374         }
3375         amdgpu_fence_driver_fini(adev);
3376         if (adev->pm_sysfs_en)
3377                 amdgpu_pm_sysfs_fini(adev);
3378         amdgpu_fbdev_fini(adev);
3379         r = amdgpu_device_ip_fini(adev);
3380         release_firmware(adev->firmware.gpu_info_fw);
3381         adev->firmware.gpu_info_fw = NULL;
3382         adev->accel_working = false;
3383         /* free i2c buses */
3384         if (!amdgpu_device_has_dc_support(adev))
3385                 amdgpu_i2c_fini(adev);
3386
3387         if (amdgpu_emu_mode != 1)
3388                 amdgpu_atombios_fini(adev);
3389
3390         kfree(adev->bios);
3391         adev->bios = NULL;
3392         if (amdgpu_has_atpx() &&
3393             (amdgpu_is_atpx_hybrid() ||
3394              amdgpu_has_atpx_dgpu_power_cntl()) &&
3395             !pci_is_thunderbolt_attached(adev->pdev))
3396                 vga_switcheroo_unregister_client(adev->pdev);
3397         if (amdgpu_device_supports_boco(adev->ddev))
3398                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3399         vga_client_register(adev->pdev, NULL, NULL, NULL);
3400         if (adev->rio_mem)
3401                 pci_iounmap(adev->pdev, adev->rio_mem);
3402         adev->rio_mem = NULL;
3403         iounmap(adev->rmmio);
3404         adev->rmmio = NULL;
3405         amdgpu_device_doorbell_fini(adev);
3406
3407         if (adev->ucode_sysfs_en)
3408                 amdgpu_ucode_sysfs_fini(adev);
3409
3410         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3411         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3412                 amdgpu_pmu_fini(adev);
3413         if (adev->discovery_bin)
3414                 amdgpu_discovery_fini(adev);
3415 }
3416
3417
3418 /*
3419  * Suspend & resume.
3420  */
3421 /**
3422  * amdgpu_device_suspend - initiate device suspend
3423  *
3424  * @dev: drm dev pointer
3425  * @fbcon : notify the fbdev of suspend
3426  *
3427  * Puts the hw in the suspend state (all asics).
3428  * Returns 0 for success or an error on failure.
3429  * Called at driver suspend.
3430  */
3431 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3432 {
3433         struct amdgpu_device *adev;
3434         struct drm_crtc *crtc;
3435         struct drm_connector *connector;
3436         struct drm_connector_list_iter iter;
3437         int r;
3438
3439         if (dev == NULL || dev->dev_private == NULL) {
3440                 return -ENODEV;
3441         }
3442
3443         adev = dev->dev_private;
3444
3445         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3446                 return 0;
3447
3448         adev->in_suspend = true;
3449         drm_kms_helper_poll_disable(dev);
3450
3451         if (fbcon)
3452                 amdgpu_fbdev_set_suspend(adev, 1);
3453
3454         cancel_delayed_work_sync(&adev->delayed_init_work);
3455
3456         if (!amdgpu_device_has_dc_support(adev)) {
3457                 /* turn off display hw */
3458                 drm_modeset_lock_all(dev);
3459                 drm_connector_list_iter_begin(dev, &iter);
3460                 drm_for_each_connector_iter(connector, &iter)
3461                         drm_helper_connector_dpms(connector,
3462                                                   DRM_MODE_DPMS_OFF);
3463                 drm_connector_list_iter_end(&iter);
3464                 drm_modeset_unlock_all(dev);
3465                         /* unpin the front buffers and cursors */
3466                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3467                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3468                         struct drm_framebuffer *fb = crtc->primary->fb;
3469                         struct amdgpu_bo *robj;
3470
3471                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3472                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3473                                 r = amdgpu_bo_reserve(aobj, true);
3474                                 if (r == 0) {
3475                                         amdgpu_bo_unpin(aobj);
3476                                         amdgpu_bo_unreserve(aobj);
3477                                 }
3478                         }
3479
3480                         if (fb == NULL || fb->obj[0] == NULL) {
3481                                 continue;
3482                         }
3483                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3484                         /* don't unpin kernel fb objects */
3485                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3486                                 r = amdgpu_bo_reserve(robj, true);
3487                                 if (r == 0) {
3488                                         amdgpu_bo_unpin(robj);
3489                                         amdgpu_bo_unreserve(robj);
3490                                 }
3491                         }
3492                 }
3493         }
3494
3495         amdgpu_ras_suspend(adev);
3496
3497         r = amdgpu_device_ip_suspend_phase1(adev);
3498
3499         amdgpu_amdkfd_suspend(adev, !fbcon);
3500
3501         /* evict vram memory */
3502         amdgpu_bo_evict_vram(adev);
3503
3504         amdgpu_fence_driver_suspend(adev);
3505
3506         r = amdgpu_device_ip_suspend_phase2(adev);
3507
3508         /* evict remaining vram memory
3509          * This second call to evict vram is to evict the gart page table
3510          * using the CPU.
3511          */
3512         amdgpu_bo_evict_vram(adev);
3513
3514         return 0;
3515 }
3516
3517 /**
3518  * amdgpu_device_resume - initiate device resume
3519  *
3520  * @dev: drm dev pointer
3521  * @fbcon : notify the fbdev of resume
3522  *
3523  * Bring the hw back to operating state (all asics).
3524  * Returns 0 for success or an error on failure.
3525  * Called at driver resume.
3526  */
3527 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3528 {
3529         struct drm_connector *connector;
3530         struct drm_connector_list_iter iter;
3531         struct amdgpu_device *adev = dev->dev_private;
3532         struct drm_crtc *crtc;
3533         int r = 0;
3534
3535         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3536                 return 0;
3537
3538         /* post card */
3539         if (amdgpu_device_need_post(adev)) {
3540                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3541                 if (r)
3542                         DRM_ERROR("amdgpu asic init failed\n");
3543         }
3544
3545         r = amdgpu_device_ip_resume(adev);
3546         if (r) {
3547                 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3548                 return r;
3549         }
3550         amdgpu_fence_driver_resume(adev);
3551
3552
3553         r = amdgpu_device_ip_late_init(adev);
3554         if (r)
3555                 return r;
3556
3557         queue_delayed_work(system_wq, &adev->delayed_init_work,
3558                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3559
3560         if (!amdgpu_device_has_dc_support(adev)) {
3561                 /* pin cursors */
3562                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3563                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3564
3565                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3566                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3567                                 r = amdgpu_bo_reserve(aobj, true);
3568                                 if (r == 0) {
3569                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3570                                         if (r != 0)
3571                                                 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3572                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3573                                         amdgpu_bo_unreserve(aobj);
3574                                 }
3575                         }
3576                 }
3577         }
3578         r = amdgpu_amdkfd_resume(adev, !fbcon);
3579         if (r)
3580                 return r;
3581
3582         /* Make sure IB tests flushed */
3583         flush_delayed_work(&adev->delayed_init_work);
3584
3585         /* blat the mode back in */
3586         if (fbcon) {
3587                 if (!amdgpu_device_has_dc_support(adev)) {
3588                         /* pre DCE11 */
3589                         drm_helper_resume_force_mode(dev);
3590
3591                         /* turn on display hw */
3592                         drm_modeset_lock_all(dev);
3593
3594                         drm_connector_list_iter_begin(dev, &iter);
3595                         drm_for_each_connector_iter(connector, &iter)
3596                                 drm_helper_connector_dpms(connector,
3597                                                           DRM_MODE_DPMS_ON);
3598                         drm_connector_list_iter_end(&iter);
3599
3600                         drm_modeset_unlock_all(dev);
3601                 }
3602                 amdgpu_fbdev_set_suspend(adev, 0);
3603         }
3604
3605         drm_kms_helper_poll_enable(dev);
3606
3607         amdgpu_ras_resume(adev);
3608
3609         /*
3610          * Most of the connector probing functions try to acquire runtime pm
3611          * refs to ensure that the GPU is powered on when connector polling is
3612          * performed. Since we're calling this from a runtime PM callback,
3613          * trying to acquire rpm refs will cause us to deadlock.
3614          *
3615          * Since we're guaranteed to be holding the rpm lock, it's safe to
3616          * temporarily disable the rpm helpers so this doesn't deadlock us.
3617          */
3618 #ifdef CONFIG_PM
3619         dev->dev->power.disable_depth++;
3620 #endif
3621         if (!amdgpu_device_has_dc_support(adev))
3622                 drm_helper_hpd_irq_event(dev);
3623         else
3624                 drm_kms_helper_hotplug_event(dev);
3625 #ifdef CONFIG_PM
3626         dev->dev->power.disable_depth--;
3627 #endif
3628         adev->in_suspend = false;
3629
3630         return 0;
3631 }
3632
3633 /**
3634  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3635  *
3636  * @adev: amdgpu_device pointer
3637  *
3638  * The list of all the hardware IPs that make up the asic is walked and
3639  * the check_soft_reset callbacks are run.  check_soft_reset determines
3640  * if the asic is still hung or not.
3641  * Returns true if any of the IPs are still in a hung state, false if not.
3642  */
3643 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3644 {
3645         int i;
3646         bool asic_hang = false;
3647
3648         if (amdgpu_sriov_vf(adev))
3649                 return true;
3650
3651         if (amdgpu_asic_need_full_reset(adev))
3652                 return true;
3653
3654         for (i = 0; i < adev->num_ip_blocks; i++) {
3655                 if (!adev->ip_blocks[i].status.valid)
3656                         continue;
3657                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3658                         adev->ip_blocks[i].status.hang =
3659                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3660                 if (adev->ip_blocks[i].status.hang) {
3661                         DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3662                         asic_hang = true;
3663                 }
3664         }
3665         return asic_hang;
3666 }
3667
3668 /**
3669  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3670  *
3671  * @adev: amdgpu_device pointer
3672  *
3673  * The list of all the hardware IPs that make up the asic is walked and the
3674  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3675  * handles any IP specific hardware or software state changes that are
3676  * necessary for a soft reset to succeed.
3677  * Returns 0 on success, negative error code on failure.
3678  */
3679 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3680 {
3681         int i, r = 0;
3682
3683         for (i = 0; i < adev->num_ip_blocks; i++) {
3684                 if (!adev->ip_blocks[i].status.valid)
3685                         continue;
3686                 if (adev->ip_blocks[i].status.hang &&
3687                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3688                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3689                         if (r)
3690                                 return r;
3691                 }
3692         }
3693
3694         return 0;
3695 }
3696
3697 /**
3698  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3699  *
3700  * @adev: amdgpu_device pointer
3701  *
3702  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3703  * reset is necessary to recover.
3704  * Returns true if a full asic reset is required, false if not.
3705  */
3706 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3707 {
3708         int i;
3709
3710         if (amdgpu_asic_need_full_reset(adev))
3711                 return true;
3712
3713         for (i = 0; i < adev->num_ip_blocks; i++) {
3714                 if (!adev->ip_blocks[i].status.valid)
3715                         continue;
3716                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3717                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3718                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3719                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3720                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3721                         if (adev->ip_blocks[i].status.hang) {
3722                                 DRM_INFO("Some block need full reset!\n");
3723                                 return true;
3724                         }
3725                 }
3726         }
3727         return false;
3728 }
3729
3730 /**
3731  * amdgpu_device_ip_soft_reset - do a soft reset
3732  *
3733  * @adev: amdgpu_device pointer
3734  *
3735  * The list of all the hardware IPs that make up the asic is walked and the
3736  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3737  * IP specific hardware or software state changes that are necessary to soft
3738  * reset the IP.
3739  * Returns 0 on success, negative error code on failure.
3740  */
3741 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3742 {
3743         int i, r = 0;
3744
3745         for (i = 0; i < adev->num_ip_blocks; i++) {
3746                 if (!adev->ip_blocks[i].status.valid)
3747                         continue;
3748                 if (adev->ip_blocks[i].status.hang &&
3749                     adev->ip_blocks[i].version->funcs->soft_reset) {
3750                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3751                         if (r)
3752                                 return r;
3753                 }
3754         }
3755
3756         return 0;
3757 }
3758
3759 /**
3760  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3761  *
3762  * @adev: amdgpu_device pointer
3763  *
3764  * The list of all the hardware IPs that make up the asic is walked and the
3765  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3766  * handles any IP specific hardware or software state changes that are
3767  * necessary after the IP has been soft reset.
3768  * Returns 0 on success, negative error code on failure.
3769  */
3770 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3771 {
3772         int i, r = 0;
3773
3774         for (i = 0; i < adev->num_ip_blocks; i++) {
3775                 if (!adev->ip_blocks[i].status.valid)
3776                         continue;
3777                 if (adev->ip_blocks[i].status.hang &&
3778                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3779                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3780                 if (r)
3781                         return r;
3782         }
3783
3784         return 0;
3785 }
3786
3787 /**
3788  * amdgpu_device_recover_vram - Recover some VRAM contents
3789  *
3790  * @adev: amdgpu_device pointer
3791  *
3792  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3793  * restore things like GPUVM page tables after a GPU reset where
3794  * the contents of VRAM might be lost.
3795  *
3796  * Returns:
3797  * 0 on success, negative error code on failure.
3798  */
3799 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3800 {
3801         struct dma_fence *fence = NULL, *next = NULL;
3802         struct amdgpu_bo *shadow;
3803         long r = 1, tmo;
3804
3805         if (amdgpu_sriov_runtime(adev))
3806                 tmo = msecs_to_jiffies(8000);
3807         else
3808                 tmo = msecs_to_jiffies(100);
3809
3810         DRM_INFO("recover vram bo from shadow start\n");
3811         mutex_lock(&adev->shadow_list_lock);
3812         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3813
3814                 /* No need to recover an evicted BO */
3815                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3816                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3817                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3818                         continue;
3819
3820                 r = amdgpu_bo_restore_shadow(shadow, &next);
3821                 if (r)
3822                         break;
3823
3824                 if (fence) {
3825                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3826                         dma_fence_put(fence);
3827                         fence = next;
3828                         if (tmo == 0) {
3829                                 r = -ETIMEDOUT;
3830                                 break;
3831                         } else if (tmo < 0) {
3832                                 r = tmo;
3833                                 break;
3834                         }
3835                 } else {
3836                         fence = next;
3837                 }
3838         }
3839         mutex_unlock(&adev->shadow_list_lock);
3840
3841         if (fence)
3842                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3843         dma_fence_put(fence);
3844
3845         if (r < 0 || tmo <= 0) {
3846                 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3847                 return -EIO;
3848         }
3849
3850         DRM_INFO("recover vram bo from shadow done\n");
3851         return 0;
3852 }
3853
3854
3855 /**
3856  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3857  *
3858  * @adev: amdgpu device pointer
3859  * @from_hypervisor: request from hypervisor
3860  *
3861  * do VF FLR and reinitialize Asic
3862  * return 0 means succeeded otherwise failed
3863  */
3864 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3865                                      bool from_hypervisor)
3866 {
3867         int r;
3868
3869         if (from_hypervisor)
3870                 r = amdgpu_virt_request_full_gpu(adev, true);
3871         else
3872                 r = amdgpu_virt_reset_gpu(adev);
3873         if (r)
3874                 return r;
3875
3876         amdgpu_amdkfd_pre_reset(adev);
3877
3878         /* Resume IP prior to SMC */
3879         r = amdgpu_device_ip_reinit_early_sriov(adev);
3880         if (r)
3881                 goto error;
3882
3883         amdgpu_virt_init_data_exchange(adev);
3884         /* we need recover gart prior to run SMC/CP/SDMA resume */
3885         amdgpu_gtt_mgr_recover(ttm_manager_type(&adev->mman.bdev, TTM_PL_TT));
3886
3887         r = amdgpu_device_fw_loading(adev);
3888         if (r)
3889                 return r;
3890
3891         /* now we are okay to resume SMC/CP/SDMA */
3892         r = amdgpu_device_ip_reinit_late_sriov(adev);
3893         if (r)
3894                 goto error;
3895
3896         amdgpu_irq_gpu_reset_resume_helper(adev);
3897         r = amdgpu_ib_ring_tests(adev);
3898         amdgpu_amdkfd_post_reset(adev);
3899
3900 error:
3901         amdgpu_virt_release_full_gpu(adev, true);
3902         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3903                 amdgpu_inc_vram_lost(adev);
3904                 r = amdgpu_device_recover_vram(adev);
3905         }
3906
3907         return r;
3908 }
3909
3910 /**
3911  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3912  *
3913  * @adev: amdgpu device pointer
3914  *
3915  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3916  * a hung GPU.
3917  */
3918 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3919 {
3920         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3921                 DRM_INFO("Timeout, but no hardware hang detected.\n");
3922                 return false;
3923         }
3924
3925         if (amdgpu_gpu_recovery == 0)
3926                 goto disabled;
3927
3928         if (amdgpu_sriov_vf(adev))
3929                 return true;
3930
3931         if (amdgpu_gpu_recovery == -1) {
3932                 switch (adev->asic_type) {
3933                 case CHIP_BONAIRE:
3934                 case CHIP_HAWAII:
3935                 case CHIP_TOPAZ:
3936                 case CHIP_TONGA:
3937                 case CHIP_FIJI:
3938                 case CHIP_POLARIS10:
3939                 case CHIP_POLARIS11:
3940                 case CHIP_POLARIS12:
3941                 case CHIP_VEGAM:
3942                 case CHIP_VEGA20:
3943                 case CHIP_VEGA10:
3944                 case CHIP_VEGA12:
3945                 case CHIP_RAVEN:
3946                 case CHIP_ARCTURUS:
3947                 case CHIP_RENOIR:
3948                 case CHIP_NAVI10:
3949                 case CHIP_NAVI14:
3950                 case CHIP_NAVI12:
3951                 case CHIP_SIENNA_CICHLID:
3952                         break;
3953                 default:
3954                         goto disabled;
3955                 }
3956         }
3957
3958         return true;
3959
3960 disabled:
3961                 DRM_INFO("GPU recovery disabled.\n");
3962                 return false;
3963 }
3964
3965
3966 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3967                                         struct amdgpu_job *job,
3968                                         bool *need_full_reset_arg)
3969 {
3970         int i, r = 0;
3971         bool need_full_reset  = *need_full_reset_arg;
3972
3973         amdgpu_debugfs_wait_dump(adev);
3974
3975         /* block all schedulers and reset given job's ring */
3976         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3977                 struct amdgpu_ring *ring = adev->rings[i];
3978
3979                 if (!ring || !ring->sched.thread)
3980                         continue;
3981
3982                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3983                 amdgpu_fence_driver_force_completion(ring);
3984         }
3985
3986         if(job)
3987                 drm_sched_increase_karma(&job->base);
3988
3989         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3990         if (!amdgpu_sriov_vf(adev)) {
3991
3992                 if (!need_full_reset)
3993                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3994
3995                 if (!need_full_reset) {
3996                         amdgpu_device_ip_pre_soft_reset(adev);
3997                         r = amdgpu_device_ip_soft_reset(adev);
3998                         amdgpu_device_ip_post_soft_reset(adev);
3999                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4000                                 DRM_INFO("soft reset failed, will fallback to full reset!\n");
4001                                 need_full_reset = true;
4002                         }
4003                 }
4004
4005                 if (need_full_reset)
4006                         r = amdgpu_device_ip_suspend(adev);
4007
4008                 *need_full_reset_arg = need_full_reset;
4009         }
4010
4011         return r;
4012 }
4013
4014 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4015                                struct list_head *device_list_handle,
4016                                bool *need_full_reset_arg)
4017 {
4018         struct amdgpu_device *tmp_adev = NULL;
4019         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4020         int r = 0;
4021
4022         /*
4023          * ASIC reset has to be done on all HGMI hive nodes ASAP
4024          * to allow proper links negotiation in FW (within 1 sec)
4025          */
4026         if (need_full_reset) {
4027                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4028                         /* For XGMI run all resets in parallel to speed up the process */
4029                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4030                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4031                                         r = -EALREADY;
4032                         } else
4033                                 r = amdgpu_asic_reset(tmp_adev);
4034
4035                         if (r) {
4036                                 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
4037                                          r, tmp_adev->ddev->unique);
4038                                 break;
4039                         }
4040                 }
4041
4042                 /* For XGMI wait for all resets to complete before proceed */
4043                 if (!r) {
4044                         list_for_each_entry(tmp_adev, device_list_handle,
4045                                             gmc.xgmi.head) {
4046                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4047                                         flush_work(&tmp_adev->xgmi_reset_work);
4048                                         r = tmp_adev->asic_reset_res;
4049                                         if (r)
4050                                                 break;
4051                                 }
4052                         }
4053                 }
4054         }
4055
4056         if (!r && amdgpu_ras_intr_triggered()) {
4057                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4058                         if (tmp_adev->mmhub.funcs &&
4059                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4060                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4061                 }
4062
4063                 amdgpu_ras_intr_cleared();
4064         }
4065
4066         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4067                 if (need_full_reset) {
4068                         /* post card */
4069                         if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
4070                                 DRM_WARN("asic atom init failed!");
4071
4072                         if (!r) {
4073                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4074                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4075                                 if (r)
4076                                         goto out;
4077
4078                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4079                                 if (vram_lost) {
4080                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4081                                         amdgpu_inc_vram_lost(tmp_adev);
4082                                 }
4083
4084                                 r = amdgpu_gtt_mgr_recover(ttm_manager_type(&tmp_adev->mman.bdev, TTM_PL_TT));
4085                                 if (r)
4086                                         goto out;
4087
4088                                 r = amdgpu_device_fw_loading(tmp_adev);
4089                                 if (r)
4090                                         return r;
4091
4092                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4093                                 if (r)
4094                                         goto out;
4095
4096                                 if (vram_lost)
4097                                         amdgpu_device_fill_reset_magic(tmp_adev);
4098
4099                                 /*
4100                                  * Add this ASIC as tracked as reset was already
4101                                  * complete successfully.
4102                                  */
4103                                 amdgpu_register_gpu_instance(tmp_adev);
4104
4105                                 r = amdgpu_device_ip_late_init(tmp_adev);
4106                                 if (r)
4107                                         goto out;
4108
4109                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4110
4111                                 /* must succeed. */
4112                                 amdgpu_ras_resume(tmp_adev);
4113
4114                                 /* Update PSP FW topology after reset */
4115                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4116                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4117                         }
4118                 }
4119
4120
4121 out:
4122                 if (!r) {
4123                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4124                         r = amdgpu_ib_ring_tests(tmp_adev);
4125                         if (r) {
4126                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4127                                 r = amdgpu_device_ip_suspend(tmp_adev);
4128                                 need_full_reset = true;
4129                                 r = -EAGAIN;
4130                                 goto end;
4131                         }
4132                 }
4133
4134                 if (!r)
4135                         r = amdgpu_device_recover_vram(tmp_adev);
4136                 else
4137                         tmp_adev->asic_reset_res = r;
4138         }
4139
4140 end:
4141         *need_full_reset_arg = need_full_reset;
4142         return r;
4143 }
4144
4145 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
4146 {
4147         if (trylock) {
4148                 if (!mutex_trylock(&adev->lock_reset))
4149                         return false;
4150         } else
4151                 mutex_lock(&adev->lock_reset);
4152
4153         atomic_inc(&adev->gpu_reset_counter);
4154         adev->in_gpu_reset = true;
4155         switch (amdgpu_asic_reset_method(adev)) {
4156         case AMD_RESET_METHOD_MODE1:
4157                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4158                 break;
4159         case AMD_RESET_METHOD_MODE2:
4160                 adev->mp1_state = PP_MP1_STATE_RESET;
4161                 break;
4162         default:
4163                 adev->mp1_state = PP_MP1_STATE_NONE;
4164                 break;
4165         }
4166
4167         return true;
4168 }
4169
4170 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4171 {
4172         amdgpu_vf_error_trans_all(adev);
4173         adev->mp1_state = PP_MP1_STATE_NONE;
4174         adev->in_gpu_reset = false;
4175         mutex_unlock(&adev->lock_reset);
4176 }
4177
4178 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4179 {
4180         struct pci_dev *p = NULL;
4181
4182         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4183                         adev->pdev->bus->number, 1);
4184         if (p) {
4185                 pm_runtime_enable(&(p->dev));
4186                 pm_runtime_resume(&(p->dev));
4187         }
4188 }
4189
4190 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4191 {
4192         enum amd_reset_method reset_method;
4193         struct pci_dev *p = NULL;
4194         u64 expires;
4195
4196         /*
4197          * For now, only BACO and mode1 reset are confirmed
4198          * to suffer the audio issue without proper suspended.
4199          */
4200         reset_method = amdgpu_asic_reset_method(adev);
4201         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4202              (reset_method != AMD_RESET_METHOD_MODE1))
4203                 return -EINVAL;
4204
4205         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4206                         adev->pdev->bus->number, 1);
4207         if (!p)
4208                 return -ENODEV;
4209
4210         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4211         if (!expires)
4212                 /*
4213                  * If we cannot get the audio device autosuspend delay,
4214                  * a fixed 4S interval will be used. Considering 3S is
4215                  * the audio controller default autosuspend delay setting.
4216                  * 4S used here is guaranteed to cover that.
4217                  */
4218                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4219
4220         while (!pm_runtime_status_suspended(&(p->dev))) {
4221                 if (!pm_runtime_suspend(&(p->dev)))
4222                         break;
4223
4224                 if (expires < ktime_get_mono_fast_ns()) {
4225                         dev_warn(adev->dev, "failed to suspend display audio\n");
4226                         /* TODO: abort the succeeding gpu reset? */
4227                         return -ETIMEDOUT;
4228                 }
4229         }
4230
4231         pm_runtime_disable(&(p->dev));
4232
4233         return 0;
4234 }
4235
4236 /**
4237  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4238  *
4239  * @adev: amdgpu device pointer
4240  * @job: which job trigger hang
4241  *
4242  * Attempt to reset the GPU if it has hung (all asics).
4243  * Attempt to do soft-reset or full-reset and reinitialize Asic
4244  * Returns 0 for success or an error on failure.
4245  */
4246
4247 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4248                               struct amdgpu_job *job)
4249 {
4250         struct list_head device_list, *device_list_handle =  NULL;
4251         bool need_full_reset = false;
4252         bool job_signaled = false;
4253         struct amdgpu_hive_info *hive = NULL;
4254         struct amdgpu_device *tmp_adev = NULL;
4255         int i, r = 0;
4256         bool need_emergency_restart = false;
4257         bool audio_suspended = false;
4258
4259         /**
4260          * Special case: RAS triggered and full reset isn't supported
4261          */
4262         need_emergency_restart = amdgpu_ras_need_emergency_restart(adev);
4263
4264         /*
4265          * Flush RAM to disk so that after reboot
4266          * the user can read log and see why the system rebooted.
4267          */
4268         if (need_emergency_restart && amdgpu_ras_get_context(adev)->reboot) {
4269                 DRM_WARN("Emergency reboot.");
4270
4271                 ksys_sync_helper();
4272                 emergency_restart();
4273         }
4274
4275         dev_info(adev->dev, "GPU %s begin!\n",
4276                 need_emergency_restart ? "jobs stop":"reset");
4277
4278         /*
4279          * Here we trylock to avoid chain of resets executing from
4280          * either trigger by jobs on different adevs in XGMI hive or jobs on
4281          * different schedulers for same device while this TO handler is running.
4282          * We always reset all schedulers for device and all devices for XGMI
4283          * hive so that should take care of them too.
4284          */
4285         hive = amdgpu_get_xgmi_hive(adev, true);
4286         if (hive && !mutex_trylock(&hive->reset_lock)) {
4287                 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4288                           job ? job->base.id : -1, hive->hive_id);
4289                 mutex_unlock(&hive->hive_lock);
4290                 return 0;
4291         }
4292
4293         /*
4294          * Build list of devices to reset.
4295          * In case we are in XGMI hive mode, resort the device list
4296          * to put adev in the 1st position.
4297          */
4298         INIT_LIST_HEAD(&device_list);
4299         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4300                 if (!hive)
4301                         return -ENODEV;
4302                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4303                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4304                 device_list_handle = &hive->device_list;
4305         } else {
4306                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4307                 device_list_handle = &device_list;
4308         }
4309
4310         /* block all schedulers and reset given job's ring */
4311         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4312                 if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
4313                         DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4314                                   job ? job->base.id : -1);
4315                         mutex_unlock(&hive->hive_lock);
4316                         return 0;
4317                 }
4318
4319                 /*
4320                  * Try to put the audio codec into suspend state
4321                  * before gpu reset started.
4322                  *
4323                  * Due to the power domain of the graphics device
4324                  * is shared with AZ power domain. Without this,
4325                  * we may change the audio hardware from behind
4326                  * the audio driver's back. That will trigger
4327                  * some audio codec errors.
4328                  */
4329                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4330                         audio_suspended = true;
4331
4332                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4333
4334                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4335
4336                 if (!amdgpu_sriov_vf(tmp_adev))
4337                         amdgpu_amdkfd_pre_reset(tmp_adev);
4338
4339                 /*
4340                  * Mark these ASICs to be reseted as untracked first
4341                  * And add them back after reset completed
4342                  */
4343                 amdgpu_unregister_gpu_instance(tmp_adev);
4344
4345                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4346
4347                 /* disable ras on ALL IPs */
4348                 if (!need_emergency_restart &&
4349                       amdgpu_device_ip_need_full_reset(tmp_adev))
4350                         amdgpu_ras_suspend(tmp_adev);
4351
4352                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4353                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4354
4355                         if (!ring || !ring->sched.thread)
4356                                 continue;
4357
4358                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4359
4360                         if (need_emergency_restart)
4361                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4362                 }
4363         }
4364
4365         if (need_emergency_restart)
4366                 goto skip_sched_resume;
4367
4368         /*
4369          * Must check guilty signal here since after this point all old
4370          * HW fences are force signaled.
4371          *
4372          * job->base holds a reference to parent fence
4373          */
4374         if (job && job->base.s_fence->parent &&
4375             dma_fence_is_signaled(job->base.s_fence->parent)) {
4376                 job_signaled = true;
4377                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4378                 goto skip_hw_reset;
4379         }
4380
4381 retry:  /* Rest of adevs pre asic reset from XGMI hive. */
4382         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4383                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4384                                                  NULL,
4385                                                  &need_full_reset);
4386                 /*TODO Should we stop ?*/
4387                 if (r) {
4388                         DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4389                                   r, tmp_adev->ddev->unique);
4390                         tmp_adev->asic_reset_res = r;
4391                 }
4392         }
4393
4394         /* Actual ASIC resets if needed.*/
4395         /* TODO Implement XGMI hive reset logic for SRIOV */
4396         if (amdgpu_sriov_vf(adev)) {
4397                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4398                 if (r)
4399                         adev->asic_reset_res = r;
4400         } else {
4401                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4402                 if (r && r == -EAGAIN)
4403                         goto retry;
4404         }
4405
4406 skip_hw_reset:
4407
4408         /* Post ASIC reset for all devs .*/
4409         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4410
4411                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4412                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4413
4414                         if (!ring || !ring->sched.thread)
4415                                 continue;
4416
4417                         /* No point to resubmit jobs if we didn't HW reset*/
4418                         if (!tmp_adev->asic_reset_res && !job_signaled)
4419                                 drm_sched_resubmit_jobs(&ring->sched);
4420
4421                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4422                 }
4423
4424                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4425                         drm_helper_resume_force_mode(tmp_adev->ddev);
4426                 }
4427
4428                 tmp_adev->asic_reset_res = 0;
4429
4430                 if (r) {
4431                         /* bad news, how to tell it to userspace ? */
4432                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4433                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4434                 } else {
4435                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4436                 }
4437         }
4438
4439 skip_sched_resume:
4440         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4441                 /*unlock kfd: SRIOV would do it separately */
4442                 if (!need_emergency_restart && !amdgpu_sriov_vf(tmp_adev))
4443                         amdgpu_amdkfd_post_reset(tmp_adev);
4444                 if (audio_suspended)
4445                         amdgpu_device_resume_display_audio(tmp_adev);
4446                 amdgpu_device_unlock_adev(tmp_adev);
4447         }
4448
4449         if (hive) {
4450                 mutex_unlock(&hive->reset_lock);
4451                 mutex_unlock(&hive->hive_lock);
4452         }
4453
4454         if (r)
4455                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4456         return r;
4457 }
4458
4459 /**
4460  * amdgpu_device_get_pcie_info - fence pcie info about the PCIE slot
4461  *
4462  * @adev: amdgpu_device pointer
4463  *
4464  * Fetchs and stores in the driver the PCIE capabilities (gen speed
4465  * and lanes) of the slot the device is in. Handles APUs and
4466  * virtualized environments where PCIE config space may not be available.
4467  */
4468 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4469 {
4470         struct pci_dev *pdev;
4471         enum pci_bus_speed speed_cap, platform_speed_cap;
4472         enum pcie_link_width platform_link_width;
4473
4474         if (amdgpu_pcie_gen_cap)
4475                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4476
4477         if (amdgpu_pcie_lane_cap)
4478                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4479
4480         /* covers APUs as well */
4481         if (pci_is_root_bus(adev->pdev->bus)) {
4482                 if (adev->pm.pcie_gen_mask == 0)
4483                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4484                 if (adev->pm.pcie_mlw_mask == 0)
4485                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4486                 return;
4487         }
4488
4489         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4490                 return;
4491
4492         pcie_bandwidth_available(adev->pdev, NULL,
4493                                  &platform_speed_cap, &platform_link_width);
4494
4495         if (adev->pm.pcie_gen_mask == 0) {
4496                 /* asic caps */
4497                 pdev = adev->pdev;
4498                 speed_cap = pcie_get_speed_cap(pdev);
4499                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4500                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4501                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4502                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4503                 } else {
4504                         if (speed_cap == PCIE_SPEED_16_0GT)
4505                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4506                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4507                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4508                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4509                         else if (speed_cap == PCIE_SPEED_8_0GT)
4510                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4511                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4512                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4513                         else if (speed_cap == PCIE_SPEED_5_0GT)
4514                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4515                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4516                         else
4517                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4518                 }
4519                 /* platform caps */
4520                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4521                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4522                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4523                 } else {
4524                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4525                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4526                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4527                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4528                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4529                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4530                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4531                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4532                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4533                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4534                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4535                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4536                         else
4537                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4538
4539                 }
4540         }
4541         if (adev->pm.pcie_mlw_mask == 0) {
4542                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4543                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4544                 } else {
4545                         switch (platform_link_width) {
4546                         case PCIE_LNK_X32:
4547                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4548                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4549                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4550                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4551                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4552                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4553                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4554                                 break;
4555                         case PCIE_LNK_X16:
4556                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4557                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4558                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4559                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4560                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4561                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4562                                 break;
4563                         case PCIE_LNK_X12:
4564                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4565                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4566                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4567                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4568                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4569                                 break;
4570                         case PCIE_LNK_X8:
4571                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4572                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4573                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4574                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4575                                 break;
4576                         case PCIE_LNK_X4:
4577                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4578                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4579                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4580                                 break;
4581                         case PCIE_LNK_X2:
4582                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4583                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4584                                 break;
4585                         case PCIE_LNK_X1:
4586                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4587                                 break;
4588                         default:
4589                                 break;
4590                         }
4591                 }
4592         }
4593 }
4594
4595 int amdgpu_device_baco_enter(struct drm_device *dev)
4596 {
4597         struct amdgpu_device *adev = dev->dev_private;
4598         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4599
4600         if (!amdgpu_device_supports_baco(adev->ddev))
4601                 return -ENOTSUPP;
4602
4603         if (ras && ras->supported)
4604                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4605
4606         return amdgpu_dpm_baco_enter(adev);
4607 }
4608
4609 int amdgpu_device_baco_exit(struct drm_device *dev)
4610 {
4611         struct amdgpu_device *adev = dev->dev_private;
4612         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4613         int ret = 0;
4614
4615         if (!amdgpu_device_supports_baco(adev->ddev))
4616                 return -ENOTSUPP;
4617
4618         ret = amdgpu_dpm_baco_exit(adev);
4619         if (ret)
4620                 return ret;
4621
4622         if (ras && ras->supported)
4623                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4624
4625         return 0;
4626 }