1 /*
2  * Copyright 2008 Advanced Micro Devices, Inc.
3  * Copyright 2008 Red Hat Inc.
4  * Copyright 2009 Jerome Glisse.
5  *
6  * Permission is hereby granted, free of charge, to any person obtaining a
7  * copy of this software and associated documentation files (the "Software"),
8  * to deal in the Software without restriction, including without limitation
9  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
10  * and/or sell copies of the Software, and to permit persons to whom the
11  * Software is furnished to do so, subject to the following conditions:
12  *
13  * The above copyright notice and this permission notice shall be included in
14  * all copies or substantial portions of the Software.
15  *
16  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
17  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
18  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
19  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
20  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
21  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
22  * OTHER DEALINGS IN THE SOFTWARE.
23  *
24  * Authors: Dave Airlie
25  *          Alex Deucher
26  *          Jerome Glisse
27  */
28 #include <linux/power_supply.h>
29 #include <linux/kthread.h>
30 #include <linux/module.h>
31 #include <linux/console.h>
32 #include <linux/slab.h>
33
34 #include <drm/drm_atomic_helper.h>
35 #include <drm/drm_probe_helper.h>
36 #include <drm/amdgpu_drm.h>
37 #include <linux/vgaarb.h>
38 #include <linux/vga_switcheroo.h>
39 #include <linux/efi.h>
40 #include "amdgpu.h"
41 #include "amdgpu_trace.h"
42 #include "amdgpu_i2c.h"
43 #include "atom.h"
44 #include "amdgpu_atombios.h"
45 #include "amdgpu_atomfirmware.h"
46 #include "amd_pcie.h"
47 #ifdef CONFIG_DRM_AMDGPU_SI
48 #include "si.h"
49 #endif
50 #ifdef CONFIG_DRM_AMDGPU_CIK
51 #include "cik.h"
52 #endif
53 #include "vi.h"
54 #include "soc15.h"
55 #include "nv.h"
56 #include "bif/bif_4_1_d.h"
57 #include <linux/pci.h>
58 #include <linux/firmware.h>
59 #include "amdgpu_vf_error.h"
60
61 #include "amdgpu_amdkfd.h"
62 #include "amdgpu_pm.h"
63
64 #include "amdgpu_xgmi.h"
65 #include "amdgpu_ras.h"
66 #include "amdgpu_pmu.h"
67 #include "amdgpu_fru_eeprom.h"
68
69 #include <linux/suspend.h>
70 #include <drm/task_barrier.h>
71 #include <linux/pm_runtime.h>
72
73 MODULE_FIRMWARE("amdgpu/vega10_gpu_info.bin");
74 MODULE_FIRMWARE("amdgpu/vega12_gpu_info.bin");
75 MODULE_FIRMWARE("amdgpu/raven_gpu_info.bin");
76 MODULE_FIRMWARE("amdgpu/picasso_gpu_info.bin");
77 MODULE_FIRMWARE("amdgpu/raven2_gpu_info.bin");
78 MODULE_FIRMWARE("amdgpu/arcturus_gpu_info.bin");
79 MODULE_FIRMWARE("amdgpu/renoir_gpu_info.bin");
80 MODULE_FIRMWARE("amdgpu/navi10_gpu_info.bin");
81 MODULE_FIRMWARE("amdgpu/navi14_gpu_info.bin");
82 MODULE_FIRMWARE("amdgpu/navi12_gpu_info.bin");
83 MODULE_FIRMWARE("amdgpu/sienna_cichlid_gpu_info.bin");
84
85 #define AMDGPU_RESUME_MS                2000
86
87 const char *amdgpu_asic_name[] = {
88         "TAHITI",
89         "PITCAIRN",
90         "VERDE",
91         "OLAND",
92         "HAINAN",
93         "BONAIRE",
94         "KAVERI",
95         "KABINI",
96         "HAWAII",
97         "MULLINS",
98         "TOPAZ",
99         "TONGA",
100         "FIJI",
101         "CARRIZO",
102         "STONEY",
103         "POLARIS10",
104         "POLARIS11",
105         "POLARIS12",
106         "VEGAM",
107         "VEGA10",
108         "VEGA12",
109         "VEGA20",
110         "RAVEN",
111         "ARCTURUS",
112         "RENOIR",
113         "NAVI10",
114         "NAVI14",
115         "NAVI12",
116         "SIENNA_CICHLID",
117         "LAST",
118 };
119
120 /**
121  * DOC: pcie_replay_count
122  *
123  * The amdgpu driver provides a sysfs API for reporting the total number
124  * of PCIe replays (NAKs).
125  * The file pcie_replay_count is used for this and returns the total
126  * number of replays as a sum of the NAKs generated and NAKs received.
127  */
128
129 static ssize_t amdgpu_device_get_pcie_replay_count(struct device *dev,
130                 struct device_attribute *attr, char *buf)
131 {
132         struct drm_device *ddev = dev_get_drvdata(dev);
133         struct amdgpu_device *adev = ddev->dev_private;
134         uint64_t cnt = amdgpu_asic_get_pcie_replay_count(adev);
135
136         return snprintf(buf, PAGE_SIZE, "%llu\n", cnt);
137 }
138
139 static DEVICE_ATTR(pcie_replay_count, S_IRUGO,
140                 amdgpu_device_get_pcie_replay_count, NULL);
141
142 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev);
143
144 /**
145  * DOC: product_name
146  *
147  * The amdgpu driver provides a sysfs API for reporting the product name
148  * for the device.
149  * The file product_name is used for this and returns the product name
150  * as returned from the FRU.
151  * NOTE: This is only available for certain server cards
152  */
153
154 static ssize_t amdgpu_device_get_product_name(struct device *dev,
155                 struct device_attribute *attr, char *buf)
156 {
157         struct drm_device *ddev = dev_get_drvdata(dev);
158         struct amdgpu_device *adev = ddev->dev_private;
159
160         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_name);
161 }
162
163 static DEVICE_ATTR(product_name, S_IRUGO,
164                 amdgpu_device_get_product_name, NULL);
165
166 /**
167  * DOC: product_number
168  *
169  * The amdgpu driver provides a sysfs API for reporting the part number
170  * for the device.
171  * The file product_number is used for this and returns the part number
172  * as returned from the FRU.
173  * NOTE: This is only available for certain server cards
174  */
175
176 static ssize_t amdgpu_device_get_product_number(struct device *dev,
177                 struct device_attribute *attr, char *buf)
178 {
179         struct drm_device *ddev = dev_get_drvdata(dev);
180         struct amdgpu_device *adev = ddev->dev_private;
181
182         return snprintf(buf, PAGE_SIZE, "%s\n", adev->product_number);
183 }
184
185 static DEVICE_ATTR(product_number, S_IRUGO,
186                 amdgpu_device_get_product_number, NULL);
187
188 /**
189  * DOC: serial_number
190  *
191  * The amdgpu driver provides a sysfs API for reporting the serial number
192  * for the device.
193  * The file serial_number is used for this and returns the serial number
194  * as returned from the FRU.
195  * NOTE: This is only available for certain server cards
196  */
197
198 static ssize_t amdgpu_device_get_serial_number(struct device *dev,
199                 struct device_attribute *attr, char *buf)
200 {
201         struct drm_device *ddev = dev_get_drvdata(dev);
202         struct amdgpu_device *adev = ddev->dev_private;
203
204         return snprintf(buf, PAGE_SIZE, "%s\n", adev->serial);
205 }
206
207 static DEVICE_ATTR(serial_number, S_IRUGO,
208                 amdgpu_device_get_serial_number, NULL);
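
/*
 * Example (illustrative sketch, not part of the driver): the attributes above
 * are plain sysfs files created on the PCI device, so userspace can read them
 * with ordinary file I/O. The card0 path below is an assumption; the actual
 * DRM card index depends on the system.
 *
 *	#include <stdio.h>
 *
 *	int main(void)
 *	{
 *		char buf[64];
 *		FILE *f = fopen("/sys/class/drm/card0/device/pcie_replay_count", "r");
 *
 *		if (f && fgets(buf, sizeof(buf), f))
 *			printf("PCIe replay count: %s", buf);
 *		if (f)
 *			fclose(f);
 *		return 0;
 *	}
 */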
209
210 /**
211  * amdgpu_device_supports_boco - Is the device a dGPU with HG/PX power control
212  *
213  * @dev: drm_device pointer
214  *
215  * Returns true if the device is a dGPU with HG/PX power control,
216  * otherwise return false.
217  */
218 bool amdgpu_device_supports_boco(struct drm_device *dev)
219 {
220         struct amdgpu_device *adev = dev->dev_private;
221
222         if (adev->flags & AMD_IS_PX)
223                 return true;
224         return false;
225 }
226
227 /**
228  * amdgpu_device_supports_baco - Does the device support BACO
229  *
230  * @dev: drm_device pointer
231  *
232  * Returns true if the device supports BACO,
233  * otherwise return false.
234  */
235 bool amdgpu_device_supports_baco(struct drm_device *dev)
236 {
237         struct amdgpu_device *adev = dev->dev_private;
238
239         return amdgpu_asic_supports_baco(adev);
240 }
241
242 /**
243  * VRAM access helper functions.
244  *
245  * amdgpu_device_vram_access - read/write a buffer in vram
246  *
247  * @adev: amdgpu_device pointer
248  * @pos: offset of the buffer in vram
249  * @buf: virtual address of the buffer in system memory
250  * @size: read/write size; the buffer at @buf must be at least @size bytes
251  * @write: true - write to vram, otherwise - read from vram
252  */
253 void amdgpu_device_vram_access(struct amdgpu_device *adev, loff_t pos,
254                                uint32_t *buf, size_t size, bool write)
255 {
256         unsigned long flags;
257         uint32_t hi = ~0;
258         uint64_t last;
259
260
261 #ifdef CONFIG_64BIT
262         last = min(pos + size, adev->gmc.visible_vram_size);
263         if (last > pos) {
264                 void __iomem *addr = adev->mman.aper_base_kaddr + pos;
265                 size_t count = last - pos;
266
267                 if (write) {
268                         memcpy_toio(addr, buf, count);
269                         mb();
270                         amdgpu_asic_flush_hdp(adev, NULL);
271                 } else {
272                         amdgpu_asic_invalidate_hdp(adev, NULL);
273                         mb();
274                         memcpy_fromio(buf, addr, count);
275                 }
276
277                 if (count == size)
278                         return;
279
280                 pos += count;
281                 buf += count / 4;
282                 size -= count;
283         }
284 #endif
285
286         spin_lock_irqsave(&adev->mmio_idx_lock, flags);
287         for (last = pos + size; pos < last; pos += 4) {
288                 uint32_t tmp = pos >> 31;
289
290                 WREG32_NO_KIQ(mmMM_INDEX, ((uint32_t)pos) | 0x80000000);
291                 if (tmp != hi) {
292                         WREG32_NO_KIQ(mmMM_INDEX_HI, tmp);
293                         hi = tmp;
294                 }
295                 if (write)
296                         WREG32_NO_KIQ(mmMM_DATA, *buf++);
297                 else
298                         *buf++ = RREG32_NO_KIQ(mmMM_DATA);
299         }
300         spin_unlock_irqrestore(&adev->mmio_idx_lock, flags);
301 }
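
/*
 * Example (illustrative sketch): using the helper above to read the first
 * 16 bytes of VRAM into a local buffer and write them back. The offset and
 * size are arbitrary placeholders; @buf must be at least @size bytes long.
 *
 *	uint32_t data[4];
 *
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), false);
 *	// ... inspect or modify data ...
 *	amdgpu_device_vram_access(adev, 0, data, sizeof(data), true);
 */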
302
303 /*
304  * device register access helper functions.
305  */
306 /**
307  * amdgpu_device_rreg - read a register
308  *
309  * @adev: amdgpu_device pointer
310  * @reg: dword aligned register offset
311  * @acc_flags: access flags which require special behavior
312  *
313  * Returns the 32 bit value from the offset specified.
314  */
315 uint32_t amdgpu_device_rreg(struct amdgpu_device *adev, uint32_t reg,
316                             uint32_t acc_flags)
317 {
318         uint32_t ret;
319
320         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
321                 return amdgpu_kiq_rreg(adev, reg);
322
323         if ((reg * 4) < adev->rmmio_size)
324                 ret = readl(((void __iomem *)adev->rmmio) + (reg * 4));
325         else
326                 ret = adev->pcie_rreg(adev, (reg * 4));
327         trace_amdgpu_device_rreg(adev->pdev->device, reg, ret);
328         return ret;
329 }
330
331 /*
332  * MMIO register byte read helper function
333  * @offset: byte offset from MMIO start
334  *
335  */
336
337 /**
338  * amdgpu_mm_rreg8 - read a memory mapped IO register
339  *
340  * @adev: amdgpu_device pointer
341  * @offset: byte aligned register offset
342  *
343  * Returns the 8 bit value from the offset specified.
344  */
345 uint8_t amdgpu_mm_rreg8(struct amdgpu_device *adev, uint32_t offset) {
346         if (offset < adev->rmmio_size)
347                 return (readb(adev->rmmio + offset));
348         BUG();
349 }
350
351 /*
352  * MMIO register byte write helper function
353  * @offset: byte offset from MMIO start
354  * @value: the value to be written to the register
355  *
356  */
357 /**
358  * amdgpu_mm_wreg8 - write to a memory mapped IO register
359  *
360  * @adev: amdgpu_device pointer
361  * @offset: byte aligned register offset
362  * @value: 8 bit value to write
363  *
364  * Writes the value specified to the offset specified.
365  */
366 void amdgpu_mm_wreg8(struct amdgpu_device *adev, uint32_t offset, uint8_t value) {
367         if (offset < adev->rmmio_size)
368                 writeb(value, adev->rmmio + offset);
369         else
370                 BUG();
371 }
372
373 static inline void amdgpu_device_wreg_no_kiq(struct amdgpu_device *adev, uint32_t reg,
374                                              uint32_t v, uint32_t acc_flags)
375 {
376         trace_amdgpu_device_wreg(adev->pdev->device, reg, v);
377
378         if ((reg * 4) < adev->rmmio_size)
379                 writel(v, ((void __iomem *)adev->rmmio) + (reg * 4));
380         else
381                 adev->pcie_wreg(adev, (reg * 4), v);
382 }
383
384 /**
385  * amdgpu_device_wreg - write to a register
386  *
387  * @adev: amdgpu_device pointer
388  * @reg: dword aligned register offset
389  * @v: 32 bit value to write to the register
390  * @acc_flags: access flags which require special behavior
391  *
392  * Writes the value specified to the offset specified.
393  */
394 void amdgpu_device_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
395                         uint32_t acc_flags)
396 {
397         if (!(acc_flags & AMDGPU_REGS_NO_KIQ) && amdgpu_sriov_runtime(adev))
398                 return amdgpu_kiq_wreg(adev, reg, v);
399
400         amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
401 }
402
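/*
 * Example (illustrative sketch): a read-modify-write sequence built on the
 * helpers above. reg_offset and SOME_BIT are placeholders, not real amdgpu
 * registers or fields.
 *
 *	uint32_t val;
 *
 *	val = amdgpu_device_rreg(adev, reg_offset, 0);
 *	val |= SOME_BIT;
 *	amdgpu_device_wreg(adev, reg_offset, val, 0);
 */
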
403 /*
404  * amdgpu_mm_wreg_mmio_rlc - write register either with mmio or with RLC path if in range
405  *
406  * This function is invoked only for debugfs register access.
407  */
408 void amdgpu_mm_wreg_mmio_rlc(struct amdgpu_device *adev, uint32_t reg, uint32_t v,
409                     uint32_t acc_flags)
410 {
411         if (amdgpu_sriov_fullaccess(adev) &&
412                 adev->gfx.rlc.funcs &&
413                 adev->gfx.rlc.funcs->is_rlcg_access_range) {
414
415                 if (adev->gfx.rlc.funcs->is_rlcg_access_range(adev, reg))
416                         return adev->gfx.rlc.funcs->rlcg_wreg(adev, reg, v);
417         }
418
419         amdgpu_device_wreg_no_kiq(adev, reg, v, acc_flags);
420 }
421
422 /**
423  * amdgpu_io_rreg - read an IO register
424  *
425  * @adev: amdgpu_device pointer
426  * @reg: dword aligned register offset
427  *
428  * Returns the 32 bit value from the offset specified.
429  */
430 u32 amdgpu_io_rreg(struct amdgpu_device *adev, u32 reg)
431 {
432         if ((reg * 4) < adev->rio_mem_size)
433                 return ioread32(adev->rio_mem + (reg * 4));
434         else {
435                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
436                 return ioread32(adev->rio_mem + (mmMM_DATA * 4));
437         }
438 }
439
440 /**
441  * amdgpu_io_wreg - write to an IO register
442  *
443  * @adev: amdgpu_device pointer
444  * @reg: dword aligned register offset
445  * @v: 32 bit value to write to the register
446  *
447  * Writes the value specified to the offset specified.
448  */
449 void amdgpu_io_wreg(struct amdgpu_device *adev, u32 reg, u32 v)
450 {
451         if ((reg * 4) < adev->rio_mem_size)
452                 iowrite32(v, adev->rio_mem + (reg * 4));
453         else {
454                 iowrite32((reg * 4), adev->rio_mem + (mmMM_INDEX * 4));
455                 iowrite32(v, adev->rio_mem + (mmMM_DATA * 4));
456         }
457 }
458
459 /**
460  * amdgpu_mm_rdoorbell - read a doorbell dword
461  *
462  * @adev: amdgpu_device pointer
463  * @index: doorbell index
464  *
465  * Returns the value in the doorbell aperture at the
466  * requested doorbell index (CIK).
467  */
468 u32 amdgpu_mm_rdoorbell(struct amdgpu_device *adev, u32 index)
469 {
470         if (index < adev->doorbell.num_doorbells) {
471                 return readl(adev->doorbell.ptr + index);
472         } else {
473                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
474                 return 0;
475         }
476 }
477
478 /**
479  * amdgpu_mm_wdoorbell - write a doorbell dword
480  *
481  * @adev: amdgpu_device pointer
482  * @index: doorbell index
483  * @v: value to write
484  *
485  * Writes @v to the doorbell aperture at the
486  * requested doorbell index (CIK).
487  */
488 void amdgpu_mm_wdoorbell(struct amdgpu_device *adev, u32 index, u32 v)
489 {
490         if (index < adev->doorbell.num_doorbells) {
491                 writel(v, adev->doorbell.ptr + index);
492         } else {
493                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
494         }
495 }
496
497 /**
498  * amdgpu_mm_rdoorbell64 - read a doorbell Qword
499  *
500  * @adev: amdgpu_device pointer
501  * @index: doorbell index
502  *
503  * Returns the value in the doorbell aperture at the
504  * requested doorbell index (VEGA10+).
505  */
506 u64 amdgpu_mm_rdoorbell64(struct amdgpu_device *adev, u32 index)
507 {
508         if (index < adev->doorbell.num_doorbells) {
509                 return atomic64_read((atomic64_t *)(adev->doorbell.ptr + index));
510         } else {
511                 DRM_ERROR("reading beyond doorbell aperture: 0x%08x!\n", index);
512                 return 0;
513         }
514 }
515
516 /**
517  * amdgpu_mm_wdoorbell64 - write a doorbell Qword
518  *
519  * @adev: amdgpu_device pointer
520  * @index: doorbell index
521  * @v: value to write
522  *
523  * Writes @v to the doorbell aperture at the
524  * requested doorbell index (VEGA10+).
525  */
526 void amdgpu_mm_wdoorbell64(struct amdgpu_device *adev, u32 index, u64 v)
527 {
528         if (index < adev->doorbell.num_doorbells) {
529                 atomic64_set((atomic64_t *)(adev->doorbell.ptr + index), v);
530         } else {
531                 DRM_ERROR("writing beyond doorbell aperture: 0x%08x!\n", index);
532         }
533 }
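
/*
 * Example (illustrative sketch): how a ring implementation might publish its
 * write pointer through the doorbell helpers above. The ring pointer and its
 * fields are assumed from a hypothetical caller.
 *
 *	if (ring->use_doorbell)
 *		amdgpu_mm_wdoorbell64(adev, ring->doorbell_index, ring->wptr);
 */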
534
535 /**
536  * amdgpu_invalid_rreg - dummy reg read function
537  *
538  * @adev: amdgpu device pointer
539  * @reg: offset of register
540  *
541  * Dummy register read function.  Used for register blocks
542  * that certain asics don't have (all asics).
543  * Returns the value in the register.
544  */
545 static uint32_t amdgpu_invalid_rreg(struct amdgpu_device *adev, uint32_t reg)
546 {
547         DRM_ERROR("Invalid callback to read register 0x%04X\n", reg);
548         BUG();
549         return 0;
550 }
551
552 /**
553  * amdgpu_invalid_wreg - dummy reg write function
554  *
555  * @adev: amdgpu device pointer
556  * @reg: offset of register
557  * @v: value to write to the register
558  *
559  * Dummy register write function.  Used for register blocks
560  * that certain asics don't have (all asics).
561  */
562 static void amdgpu_invalid_wreg(struct amdgpu_device *adev, uint32_t reg, uint32_t v)
563 {
564         DRM_ERROR("Invalid callback to write register 0x%04X with 0x%08X\n",
565                   reg, v);
566         BUG();
567 }
568
569 /**
570  * amdgpu_invalid_rreg64 - dummy 64 bit reg read function
571  *
572  * @adev: amdgpu device pointer
573  * @reg: offset of register
574  *
575  * Dummy register read function.  Used for register blocks
576  * that certain asics don't have (all asics).
577  * Returns the value in the register.
578  */
579 static uint64_t amdgpu_invalid_rreg64(struct amdgpu_device *adev, uint32_t reg)
580 {
581         DRM_ERROR("Invalid callback to read 64 bit register 0x%04X\n", reg);
582         BUG();
583         return 0;
584 }
585
586 /**
587  * amdgpu_invalid_wreg64 - dummy reg write function
588  *
589  * @adev: amdgpu device pointer
590  * @reg: offset of register
591  * @v: value to write to the register
592  *
593  * Dummy register write function.  Used for register blocks
594  * that certain asics don't have (all asics).
595  */
596 static void amdgpu_invalid_wreg64(struct amdgpu_device *adev, uint32_t reg, uint64_t v)
597 {
598         DRM_ERROR("Invalid callback to write 64 bit register 0x%04X with 0x%08llX\n",
599                   reg, v);
600         BUG();
601 }
602
603 /**
604  * amdgpu_block_invalid_rreg - dummy reg read function
605  *
606  * @adev: amdgpu device pointer
607  * @block: offset of instance
608  * @reg: offset of register
609  *
610  * Dummy register read function.  Used for register blocks
611  * that certain asics don't have (all asics).
612  * Returns the value in the register.
613  */
614 static uint32_t amdgpu_block_invalid_rreg(struct amdgpu_device *adev,
615                                           uint32_t block, uint32_t reg)
616 {
617         DRM_ERROR("Invalid callback to read register 0x%04X in block 0x%04X\n",
618                   reg, block);
619         BUG();
620         return 0;
621 }
622
623 /**
624  * amdgpu_block_invalid_wreg - dummy reg write function
625  *
626  * @adev: amdgpu device pointer
627  * @block: offset of instance
628  * @reg: offset of register
629  * @v: value to write to the register
630  *
631  * Dummy register write function.  Used for register blocks
632  * that certain asics don't have (all asics).
633  */
634 static void amdgpu_block_invalid_wreg(struct amdgpu_device *adev,
635                                       uint32_t block,
636                                       uint32_t reg, uint32_t v)
637 {
638         DRM_ERROR("Invalid block callback to write register 0x%04X in block 0x%04X with 0x%08X\n",
639                   reg, block, v);
640         BUG();
641 }
642
643 /**
644  * amdgpu_device_vram_scratch_init - allocate the VRAM scratch page
645  *
646  * @adev: amdgpu device pointer
647  *
648  * Allocates a scratch page of VRAM for use by various things in the
649  * driver.
650  */
651 static int amdgpu_device_vram_scratch_init(struct amdgpu_device *adev)
652 {
653         return amdgpu_bo_create_kernel(adev, AMDGPU_GPU_PAGE_SIZE,
654                                        PAGE_SIZE, AMDGPU_GEM_DOMAIN_VRAM,
655                                        &adev->vram_scratch.robj,
656                                        &adev->vram_scratch.gpu_addr,
657                                        (void **)&adev->vram_scratch.ptr);
658 }
659
660 /**
661  * amdgpu_device_vram_scratch_fini - Free the VRAM scratch page
662  *
663  * @adev: amdgpu device pointer
664  *
665  * Frees the VRAM scratch page.
666  */
667 static void amdgpu_device_vram_scratch_fini(struct amdgpu_device *adev)
668 {
669         amdgpu_bo_free_kernel(&adev->vram_scratch.robj, NULL, NULL);
670 }
671
672 /**
673  * amdgpu_device_program_register_sequence - program an array of registers.
674  *
675  * @adev: amdgpu_device pointer
676  * @registers: pointer to the register array
677  * @array_size: size of the register array
678  *
679  * Programs an array of registers with AND and OR masks.
680  * This is a helper for setting golden registers.
681  */
682 void amdgpu_device_program_register_sequence(struct amdgpu_device *adev,
683                                              const u32 *registers,
684                                              const u32 array_size)
685 {
686         u32 tmp, reg, and_mask, or_mask;
687         int i;
688
689         if (array_size % 3)
690                 return;
691
692         for (i = 0; i < array_size; i += 3) {
693                 reg = registers[i + 0];
694                 and_mask = registers[i + 1];
695                 or_mask = registers[i + 2];
696
697                 if (and_mask == 0xffffffff) {
698                         tmp = or_mask;
699                 } else {
700                         tmp = RREG32(reg);
701                         tmp &= ~and_mask;
702                         if (adev->family >= AMDGPU_FAMILY_AI)
703                                 tmp |= (or_mask & and_mask);
704                         else
705                                 tmp |= or_mask;
706                 }
707                 WREG32(reg, tmp);
708         }
709 }
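
/*
 * Example (illustrative sketch): a golden register list for the helper above
 * is a flat array of {offset, and_mask, or_mask} triples. The offsets and
 * masks here are placeholders, not real golden settings.
 *
 *	static const u32 example_golden_settings[] = {
 *		0x1234, 0xffffffff, 0x00000001,
 *		0x5678, 0x0000ff00, 0x00003400,
 *	};
 *
 *	amdgpu_device_program_register_sequence(adev, example_golden_settings,
 *						ARRAY_SIZE(example_golden_settings));
 */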
710
711 /**
712  * amdgpu_device_pci_config_reset - reset the GPU
713  *
714  * @adev: amdgpu_device pointer
715  *
716  * Resets the GPU using the pci config reset sequence.
717  * Only applicable to asics prior to vega10.
718  */
719 void amdgpu_device_pci_config_reset(struct amdgpu_device *adev)
720 {
721         pci_write_config_dword(adev->pdev, 0x7c, AMDGPU_ASIC_RESET_DATA);
722 }
723
724 /*
725  * GPU doorbell aperture helpers function.
726  */
727 /**
728  * amdgpu_device_doorbell_init - Init doorbell driver information.
729  *
730  * @adev: amdgpu_device pointer
731  *
732  * Init doorbell driver information (CIK)
733  * Returns 0 on success, error on failure.
734  */
735 static int amdgpu_device_doorbell_init(struct amdgpu_device *adev)
736 {
737
738         /* No doorbell on SI hardware generation */
739         if (adev->asic_type < CHIP_BONAIRE) {
740                 adev->doorbell.base = 0;
741                 adev->doorbell.size = 0;
742                 adev->doorbell.num_doorbells = 0;
743                 adev->doorbell.ptr = NULL;
744                 return 0;
745         }
746
747         if (pci_resource_flags(adev->pdev, 2) & IORESOURCE_UNSET)
748                 return -EINVAL;
749
750         amdgpu_asic_init_doorbell_index(adev);
751
752         /* doorbell bar mapping */
753         adev->doorbell.base = pci_resource_start(adev->pdev, 2);
754         adev->doorbell.size = pci_resource_len(adev->pdev, 2);
755
756         adev->doorbell.num_doorbells = min_t(u32, adev->doorbell.size / sizeof(u32),
757                                              adev->doorbell_index.max_assignment+1);
758         if (adev->doorbell.num_doorbells == 0)
759                 return -EINVAL;
760
761         /* For Vega, reserve and map two pages on doorbell BAR since SDMA
762          * paging queue doorbells use the second page. The
763          * AMDGPU_DOORBELL64_MAX_ASSIGNMENT definition assumes all the
764          * doorbells are in the first page. So with paging queue enabled,
765          * the max num_doorbells should be increased by one page (0x400 dwords).
766          */
767         if (adev->asic_type >= CHIP_VEGA10)
768                 adev->doorbell.num_doorbells += 0x400;
769
770         adev->doorbell.ptr = ioremap(adev->doorbell.base,
771                                      adev->doorbell.num_doorbells *
772                                      sizeof(u32));
773         if (adev->doorbell.ptr == NULL)
774                 return -ENOMEM;
775
776         return 0;
777 }
778
779 /**
780  * amdgpu_device_doorbell_fini - Tear down doorbell driver information.
781  *
782  * @adev: amdgpu_device pointer
783  *
784  * Tear down doorbell driver information (CIK)
785  */
786 static void amdgpu_device_doorbell_fini(struct amdgpu_device *adev)
787 {
788         iounmap(adev->doorbell.ptr);
789         adev->doorbell.ptr = NULL;
790 }
791
792
793
794 /*
795  * amdgpu_device_wb_*()
796  * Writeback is the method by which the GPU updates special pages in memory
797  * with the status of certain GPU events (fences, ring pointers,etc.).
798  */
799
800 /**
801  * amdgpu_device_wb_fini - Disable Writeback and free memory
802  *
803  * @adev: amdgpu_device pointer
804  *
805  * Disables Writeback and frees the Writeback memory (all asics).
806  * Used at driver shutdown.
807  */
808 static void amdgpu_device_wb_fini(struct amdgpu_device *adev)
809 {
810         if (adev->wb.wb_obj) {
811                 amdgpu_bo_free_kernel(&adev->wb.wb_obj,
812                                       &adev->wb.gpu_addr,
813                                       (void **)&adev->wb.wb);
814                 adev->wb.wb_obj = NULL;
815         }
816 }
817
818 /**
819  * amdgpu_device_wb_init- Init Writeback driver info and allocate memory
820  *
821  * @adev: amdgpu_device pointer
822  *
823  * Initializes writeback and allocates writeback memory (all asics).
824  * Used at driver startup.
825  * Returns 0 on success or a negative error code on failure.
826  */
827 static int amdgpu_device_wb_init(struct amdgpu_device *adev)
828 {
829         int r;
830
831         if (adev->wb.wb_obj == NULL) {
832                 /* AMDGPU_MAX_WB * sizeof(uint32_t) * 8 = AMDGPU_MAX_WB 256bit slots */
833                 r = amdgpu_bo_create_kernel(adev, AMDGPU_MAX_WB * sizeof(uint32_t) * 8,
834                                             PAGE_SIZE, AMDGPU_GEM_DOMAIN_GTT,
835                                             &adev->wb.wb_obj, &adev->wb.gpu_addr,
836                                             (void **)&adev->wb.wb);
837                 if (r) {
838                         dev_warn(adev->dev, "(%d) create WB bo failed\n", r);
839                         return r;
840                 }
841
842                 adev->wb.num_wb = AMDGPU_MAX_WB;
843                 memset(&adev->wb.used, 0, sizeof(adev->wb.used));
844
845                 /* clear wb memory */
846                 memset((char *)adev->wb.wb, 0, AMDGPU_MAX_WB * sizeof(uint32_t) * 8);
847         }
848
849         return 0;
850 }
851
852 /**
853  * amdgpu_device_wb_get - Allocate a wb entry
854  *
855  * @adev: amdgpu_device pointer
856  * @wb: wb index
857  *
858  * Allocate a wb slot for use by the driver (all asics).
859  * Returns 0 on success or -EINVAL on failure.
860  */
861 int amdgpu_device_wb_get(struct amdgpu_device *adev, u32 *wb)
862 {
863         unsigned long offset = find_first_zero_bit(adev->wb.used, adev->wb.num_wb);
864
865         if (offset < adev->wb.num_wb) {
866                 __set_bit(offset, adev->wb.used);
867                 *wb = offset << 3; /* convert to dw offset */
868                 return 0;
869         } else {
870                 return -EINVAL;
871         }
872 }
873
874 /**
875  * amdgpu_device_wb_free - Free a wb entry
876  *
877  * @adev: amdgpu_device pointer
878  * @wb: wb index
879  *
880  * Free a wb slot allocated for use by the driver (all asics)
881  */
882 void amdgpu_device_wb_free(struct amdgpu_device *adev, u32 wb)
883 {
884         wb >>= 3;
885         if (wb < adev->wb.num_wb)
886                 __clear_bit(wb, adev->wb.used);
887 }
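
/*
 * Example (illustrative sketch): typical lifetime of a writeback slot. The
 * GPU address math mirrors how ring code normally derives it; error handling
 * is trimmed for brevity.
 *
 *	u32 wb;
 *	u64 wb_gpu_addr;
 *
 *	if (amdgpu_device_wb_get(adev, &wb))
 *		return -EINVAL;
 *	wb_gpu_addr = adev->wb.gpu_addr + (wb * 4);
 *	adev->wb.wb[wb] = 0;
 *	// ... hardware writes status to wb_gpu_addr, CPU polls adev->wb.wb[wb] ...
 *	amdgpu_device_wb_free(adev, wb);
 */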
888
889 /**
890  * amdgpu_device_resize_fb_bar - try to resize FB BAR
891  *
892  * @adev: amdgpu_device pointer
893  *
894  * Try to resize FB BAR to make all VRAM CPU accessible. We try very hard not
895  * to fail, but if any of the BARs is not accessible after the resize we abort
896  * driver loading by returning -ENODEV.
897  */
898 int amdgpu_device_resize_fb_bar(struct amdgpu_device *adev)
899 {
900         u64 space_needed = roundup_pow_of_two(adev->gmc.real_vram_size);
901         u32 rbar_size = order_base_2(((space_needed >> 20) | 1)) - 1;
902         struct pci_bus *root;
903         struct resource *res;
904         unsigned i;
905         u16 cmd;
906         int r;
907
908         /* Bypass for VF */
909         if (amdgpu_sriov_vf(adev))
910                 return 0;
911
912         /* skip if the bios has already enabled large BAR */
913         if (adev->gmc.real_vram_size &&
914             (pci_resource_len(adev->pdev, 0) >= adev->gmc.real_vram_size))
915                 return 0;
916
917         /* Check if the root BUS has 64bit memory resources */
918         root = adev->pdev->bus;
919         while (root->parent)
920                 root = root->parent;
921
922         pci_bus_for_each_resource(root, res, i) {
923                 if (res && res->flags & (IORESOURCE_MEM | IORESOURCE_MEM_64) &&
924                     res->start > 0x100000000ull)
925                         break;
926         }
927
928         /* Trying to resize is pointless without a root hub window above 4GB */
929         if (!res)
930                 return 0;
931
932         /* Disable memory decoding while we change the BAR addresses and size */
933         pci_read_config_word(adev->pdev, PCI_COMMAND, &cmd);
934         pci_write_config_word(adev->pdev, PCI_COMMAND,
935                               cmd & ~PCI_COMMAND_MEMORY);
936
937         /* Free the VRAM and doorbell BAR, we most likely need to move both. */
938         amdgpu_device_doorbell_fini(adev);
939         if (adev->asic_type >= CHIP_BONAIRE)
940                 pci_release_resource(adev->pdev, 2);
941
942         pci_release_resource(adev->pdev, 0);
943
944         r = pci_resize_resource(adev->pdev, 0, rbar_size);
945         if (r == -ENOSPC)
946                 DRM_INFO("Not enough PCI address space for a large BAR.");
947         else if (r && r != -ENOTSUPP)
948                 DRM_ERROR("Problem resizing BAR0 (%d).", r);
949
950         pci_assign_unassigned_bus_resources(adev->pdev->bus);
951
952         /* When the doorbell or fb BAR isn't available we have no chance of
953          * using the device.
954          */
955         r = amdgpu_device_doorbell_init(adev);
956         if (r || (pci_resource_flags(adev->pdev, 0) & IORESOURCE_UNSET))
957                 return -ENODEV;
958
959         pci_write_config_word(adev->pdev, PCI_COMMAND, cmd);
960
961         return 0;
962 }
963
964 /*
965  * GPU helpers function.
966  */
967 /**
968  * amdgpu_device_need_post - check if the hw need post or not
969  *
970  * @adev: amdgpu_device pointer
971  *
972  * Check if the asic has been initialized (all asics) at driver startup
973  * or if post is needed because a hw reset was performed.
974  * Returns true if post is needed, false if not.
975  */
976 bool amdgpu_device_need_post(struct amdgpu_device *adev)
977 {
978         uint32_t reg;
979
980         if (amdgpu_sriov_vf(adev))
981                 return false;
982
983         if (amdgpu_passthrough(adev)) {
984                 /* For FIJI: in the whole-GPU pass-through virtualization case, after VM
985                  * reboot some old SMC firmware still needs the driver to do vPost, otherwise
986                  * the GPU hangs. SMC firmware versions above 22.15 do not have this flaw, so
987                  * force vPost for SMC versions below 22.15.
988                  */
989                 if (adev->asic_type == CHIP_FIJI) {
990                         int err;
991                         uint32_t fw_ver;
992                         err = request_firmware(&adev->pm.fw, "amdgpu/fiji_smc.bin", adev->dev);
993                         /* force vPost if error occurred */
994                         if (err)
995                                 return true;
996
997                         fw_ver = *((uint32_t *)adev->pm.fw->data + 69);
998                         if (fw_ver < 0x00160e00)
999                                 return true;
1000                 }
1001         }
1002
1003         if (adev->has_hw_reset) {
1004                 adev->has_hw_reset = false;
1005                 return true;
1006         }
1007
1008         /* bios scratch used on CIK+ */
1009         if (adev->asic_type >= CHIP_BONAIRE)
1010                 return amdgpu_atombios_scratch_need_asic_init(adev);
1011
1012         /* check MEM_SIZE for older asics */
1013         reg = amdgpu_asic_get_config_memsize(adev);
1014
1015         if ((reg != 0) && (reg != 0xffffffff))
1016                 return false;
1017
1018         return true;
1019 }
1020
1021 /* if we get transitioned to only one device, take VGA back */
1022 /**
1023  * amdgpu_device_vga_set_decode - enable/disable vga decode
1024  *
1025  * @cookie: amdgpu_device pointer
1026  * @state: enable/disable vga decode
1027  *
1028  * Enable/disable vga decode (all asics).
1029  * Returns VGA resource flags.
1030  */
1031 static unsigned int amdgpu_device_vga_set_decode(void *cookie, bool state)
1032 {
1033         struct amdgpu_device *adev = cookie;
1034         amdgpu_asic_set_vga_state(adev, state);
1035         if (state)
1036                 return VGA_RSRC_LEGACY_IO | VGA_RSRC_LEGACY_MEM |
1037                        VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1038         else
1039                 return VGA_RSRC_NORMAL_IO | VGA_RSRC_NORMAL_MEM;
1040 }
1041
1042 /**
1043  * amdgpu_device_check_block_size - validate the vm block size
1044  *
1045  * @adev: amdgpu_device pointer
1046  *
1047  * Validates the vm block size specified via module parameter.
1048  * The vm block size defines number of bits in page table versus page directory,
1049  * The vm block size defines the number of bits in the page table versus the page
1050  * directory. A page is 4KB, so we have a 12-bit offset, a minimum of 9 bits in the
1051  */
1052 static void amdgpu_device_check_block_size(struct amdgpu_device *adev)
1053 {
1054         /* defines number of bits in page table versus page directory,
1055          * a page is 4KB so we have 12 bits offset, minimum 9 bits in the
1056          * page table and the remaining bits are in the page directory */
1057         if (amdgpu_vm_block_size == -1)
1058                 return;
1059
1060         if (amdgpu_vm_block_size < 9) {
1061                 dev_warn(adev->dev, "VM page table size (%d) too small\n",
1062                          amdgpu_vm_block_size);
1063                 amdgpu_vm_block_size = -1;
1064         }
1065 }
1066
1067 /**
1068  * amdgpu_device_check_vm_size - validate the vm size
1069  *
1070  * @adev: amdgpu_device pointer
1071  *
1072  * Validates the vm size in GB specified via module parameter.
1073  * The VM size is the size of the GPU virtual memory space in GB.
1074  */
1075 static void amdgpu_device_check_vm_size(struct amdgpu_device *adev)
1076 {
1077         /* no need to check the default value */
1078         if (amdgpu_vm_size == -1)
1079                 return;
1080
1081         if (amdgpu_vm_size < 1) {
1082                 dev_warn(adev->dev, "VM size (%d) too small, min is 1GB\n",
1083                          amdgpu_vm_size);
1084                 amdgpu_vm_size = -1;
1085         }
1086 }
1087
1088 static void amdgpu_device_check_smu_prv_buffer_size(struct amdgpu_device *adev)
1089 {
1090         struct sysinfo si;
1091         bool is_os_64 = (sizeof(void *) == 8);
1092         uint64_t total_memory;
1093         uint64_t dram_size_seven_GB = 0x1B8000000;
1094         uint64_t dram_size_three_GB = 0xB8000000;
1095
1096         if (amdgpu_smu_memory_pool_size == 0)
1097                 return;
1098
1099         if (!is_os_64) {
1100                 DRM_WARN("Not 64-bit OS, feature not supported\n");
1101                 goto def_value;
1102         }
1103         si_meminfo(&si);
1104         total_memory = (uint64_t)si.totalram * si.mem_unit;
1105
1106         if ((amdgpu_smu_memory_pool_size == 1) ||
1107                 (amdgpu_smu_memory_pool_size == 2)) {
1108                 if (total_memory < dram_size_three_GB)
1109                         goto def_value1;
1110         } else if ((amdgpu_smu_memory_pool_size == 4) ||
1111                 (amdgpu_smu_memory_pool_size == 8)) {
1112                 if (total_memory < dram_size_seven_GB)
1113                         goto def_value1;
1114         } else {
1115                 DRM_WARN("Smu memory pool size not supported\n");
1116                 goto def_value;
1117         }
1118         adev->pm.smu_prv_buffer_size = amdgpu_smu_memory_pool_size << 28;
1119
1120         return;
1121
1122 def_value1:
1123         DRM_WARN("Not enough system memory\n");
1124 def_value:
1125         adev->pm.smu_prv_buffer_size = 0;
1126 }
1127
1128 /**
1129  * amdgpu_device_check_arguments - validate module params
1130  *
1131  * @adev: amdgpu_device pointer
1132  *
1133  * Validates certain module parameters and updates
1134  * the associated values used by the driver (all asics).
1135  */
1136 static int amdgpu_device_check_arguments(struct amdgpu_device *adev)
1137 {
1138         if (amdgpu_sched_jobs < 4) {
1139                 dev_warn(adev->dev, "sched jobs (%d) must be at least 4\n",
1140                          amdgpu_sched_jobs);
1141                 amdgpu_sched_jobs = 4;
1142         } else if (!is_power_of_2(amdgpu_sched_jobs)) {
1143                 dev_warn(adev->dev, "sched jobs (%d) must be a power of 2\n",
1144                          amdgpu_sched_jobs);
1145                 amdgpu_sched_jobs = roundup_pow_of_two(amdgpu_sched_jobs);
1146         }
1147
1148         if (amdgpu_gart_size != -1 && amdgpu_gart_size < 32) {
1149                 /* gart size must be greater or equal to 32M */
1150                 dev_warn(adev->dev, "gart size (%d) too small\n",
1151                          amdgpu_gart_size);
1152                 amdgpu_gart_size = -1;
1153         }
1154
1155         if (amdgpu_gtt_size != -1 && amdgpu_gtt_size < 32) {
1156                 /* gtt size must be greater or equal to 32M */
1157                 dev_warn(adev->dev, "gtt size (%d) too small\n",
1158                                  amdgpu_gtt_size);
1159                 amdgpu_gtt_size = -1;
1160         }
1161
1162         /* valid range is between 4 and 9 inclusive */
1163         if (amdgpu_vm_fragment_size != -1 &&
1164             (amdgpu_vm_fragment_size > 9 || amdgpu_vm_fragment_size < 4)) {
1165                 dev_warn(adev->dev, "valid range is between 4 and 9\n");
1166                 amdgpu_vm_fragment_size = -1;
1167         }
1168
1169         if (amdgpu_sched_hw_submission < 2) {
1170                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be at least 2\n",
1171                          amdgpu_sched_hw_submission);
1172                 amdgpu_sched_hw_submission = 2;
1173         } else if (!is_power_of_2(amdgpu_sched_hw_submission)) {
1174                 dev_warn(adev->dev, "sched hw submission jobs (%d) must be a power of 2\n",
1175                          amdgpu_sched_hw_submission);
1176                 amdgpu_sched_hw_submission = roundup_pow_of_two(amdgpu_sched_hw_submission);
1177         }
1178
1179         amdgpu_device_check_smu_prv_buffer_size(adev);
1180
1181         amdgpu_device_check_vm_size(adev);
1182
1183         amdgpu_device_check_block_size(adev);
1184
1185         adev->firmware.load_type = amdgpu_ucode_get_load_type(adev, amdgpu_fw_load_type);
1186
1187         amdgpu_gmc_tmz_set(adev);
1188
1189         return 0;
1190 }
1191
1192 /**
1193  * amdgpu_switcheroo_set_state - set switcheroo state
1194  *
1195  * @pdev: pci dev pointer
1196  * @state: vga_switcheroo state
1197  *
1198  * Callback for the switcheroo driver.  Suspends or resumes
1199  * the asics before or after it is powered up using ACPI methods.
1200  */
1201 static void amdgpu_switcheroo_set_state(struct pci_dev *pdev, enum vga_switcheroo_state state)
1202 {
1203         struct drm_device *dev = pci_get_drvdata(pdev);
1204         int r;
1205
1206         if (amdgpu_device_supports_boco(dev) && state == VGA_SWITCHEROO_OFF)
1207                 return;
1208
1209         if (state == VGA_SWITCHEROO_ON) {
1210                 pr_info("switched on\n");
1211                 /* don't suspend or resume card normally */
1212                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1213
1214                 pci_set_power_state(dev->pdev, PCI_D0);
1215                 pci_restore_state(dev->pdev);
1216                 r = pci_enable_device(dev->pdev);
1217                 if (r)
1218                         DRM_WARN("pci_enable_device failed (%d)\n", r);
1219                 amdgpu_device_resume(dev, true);
1220
1221                 dev->switch_power_state = DRM_SWITCH_POWER_ON;
1222                 drm_kms_helper_poll_enable(dev);
1223         } else {
1224                 pr_info("switched off\n");
1225                 drm_kms_helper_poll_disable(dev);
1226                 dev->switch_power_state = DRM_SWITCH_POWER_CHANGING;
1227                 amdgpu_device_suspend(dev, true);
1228                 pci_save_state(dev->pdev);
1229                 /* Shut down the device */
1230                 pci_disable_device(dev->pdev);
1231                 pci_set_power_state(dev->pdev, PCI_D3cold);
1232                 dev->switch_power_state = DRM_SWITCH_POWER_OFF;
1233         }
1234 }
1235
1236 /**
1237  * amdgpu_switcheroo_can_switch - see if switcheroo state can change
1238  *
1239  * @pdev: pci dev pointer
1240  *
1241  * Callback for the switcheroo driver.  Check if the switcheroo
1242  * state can be changed.
1243  * Returns true if the state can be changed, false if not.
1244  */
1245 static bool amdgpu_switcheroo_can_switch(struct pci_dev *pdev)
1246 {
1247         struct drm_device *dev = pci_get_drvdata(pdev);
1248
1249         /*
1250         * FIXME: open_count is protected by drm_global_mutex but that would lead to
1251         * locking inversion with the driver load path. And the access here is
1252         * completely racy anyway. So don't bother with locking for now.
1253         */
1254         return atomic_read(&dev->open_count) == 0;
1255 }
1256
1257 static const struct vga_switcheroo_client_ops amdgpu_switcheroo_ops = {
1258         .set_gpu_state = amdgpu_switcheroo_set_state,
1259         .reprobe = NULL,
1260         .can_switch = amdgpu_switcheroo_can_switch,
1261 };
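
/*
 * Example (illustrative sketch): the ops table above is handed to the VGA
 * switcheroo core during device init, roughly like this. The boco flag is a
 * placeholder for whether the device supports BOCO/PX power control.
 *
 *	vga_switcheroo_register_client(adev->pdev, &amdgpu_switcheroo_ops, boco);
 */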
1262
1263 /**
1264  * amdgpu_device_ip_set_clockgating_state - set the CG state
1265  *
1266  * @dev: amdgpu_device pointer
1267  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1268  * @state: clockgating state (gate or ungate)
1269  *
1270  * Sets the requested clockgating state for all instances of
1271  * the hardware IP specified.
1272  * Returns the error code from the last instance.
1273  */
1274 int amdgpu_device_ip_set_clockgating_state(void *dev,
1275                                            enum amd_ip_block_type block_type,
1276                                            enum amd_clockgating_state state)
1277 {
1278         struct amdgpu_device *adev = dev;
1279         int i, r = 0;
1280
1281         for (i = 0; i < adev->num_ip_blocks; i++) {
1282                 if (!adev->ip_blocks[i].status.valid)
1283                         continue;
1284                 if (adev->ip_blocks[i].version->type != block_type)
1285                         continue;
1286                 if (!adev->ip_blocks[i].version->funcs->set_clockgating_state)
1287                         continue;
1288                 r = adev->ip_blocks[i].version->funcs->set_clockgating_state(
1289                         (void *)adev, state);
1290                 if (r)
1291                         DRM_ERROR("set_clockgating_state of IP block <%s> failed %d\n",
1292                                   adev->ip_blocks[i].version->funcs->name, r);
1293         }
1294         return r;
1295 }
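
/*
 * Example (illustrative sketch): ungating clocks for all GFX IP instances via
 * the helper above.
 *
 *	amdgpu_device_ip_set_clockgating_state(adev, AMD_IP_BLOCK_TYPE_GFX,
 *					       AMD_CG_STATE_UNGATE);
 */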
1296
1297 /**
1298  * amdgpu_device_ip_set_powergating_state - set the PG state
1299  *
1300  * @dev: amdgpu_device pointer
1301  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1302  * @state: powergating state (gate or ungate)
1303  *
1304  * Sets the requested powergating state for all instances of
1305  * the hardware IP specified.
1306  * Returns the error code from the last instance.
1307  */
1308 int amdgpu_device_ip_set_powergating_state(void *dev,
1309                                            enum amd_ip_block_type block_type,
1310                                            enum amd_powergating_state state)
1311 {
1312         struct amdgpu_device *adev = dev;
1313         int i, r = 0;
1314
1315         for (i = 0; i < adev->num_ip_blocks; i++) {
1316                 if (!adev->ip_blocks[i].status.valid)
1317                         continue;
1318                 if (adev->ip_blocks[i].version->type != block_type)
1319                         continue;
1320                 if (!adev->ip_blocks[i].version->funcs->set_powergating_state)
1321                         continue;
1322                 r = adev->ip_blocks[i].version->funcs->set_powergating_state(
1323                         (void *)adev, state);
1324                 if (r)
1325                         DRM_ERROR("set_powergating_state of IP block <%s> failed %d\n",
1326                                   adev->ip_blocks[i].version->funcs->name, r);
1327         }
1328         return r;
1329 }
1330
1331 /**
1332  * amdgpu_device_ip_get_clockgating_state - get the CG state
1333  *
1334  * @adev: amdgpu_device pointer
1335  * @flags: clockgating feature flags
1336  *
1337  * Walks the list of IPs on the device and updates the clockgating
1338  * flags for each IP.
1339  * Updates @flags with the feature flags for each hardware IP where
1340  * clockgating is enabled.
1341  */
1342 void amdgpu_device_ip_get_clockgating_state(struct amdgpu_device *adev,
1343                                             u32 *flags)
1344 {
1345         int i;
1346
1347         for (i = 0; i < adev->num_ip_blocks; i++) {
1348                 if (!adev->ip_blocks[i].status.valid)
1349                         continue;
1350                 if (adev->ip_blocks[i].version->funcs->get_clockgating_state)
1351                         adev->ip_blocks[i].version->funcs->get_clockgating_state((void *)adev, flags);
1352         }
1353 }
1354
1355 /**
1356  * amdgpu_device_ip_wait_for_idle - wait for idle
1357  *
1358  * @adev: amdgpu_device pointer
1359  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1360  *
1361  * Waits for the requested hardware IP to be idle.
1362  * Returns 0 for success or a negative error code on failure.
1363  */
1364 int amdgpu_device_ip_wait_for_idle(struct amdgpu_device *adev,
1365                                    enum amd_ip_block_type block_type)
1366 {
1367         int i, r;
1368
1369         for (i = 0; i < adev->num_ip_blocks; i++) {
1370                 if (!adev->ip_blocks[i].status.valid)
1371                         continue;
1372                 if (adev->ip_blocks[i].version->type == block_type) {
1373                         r = adev->ip_blocks[i].version->funcs->wait_for_idle((void *)adev);
1374                         if (r)
1375                                 return r;
1376                         break;
1377                 }
1378         }
1379         return 0;
1380
1381 }
1382
1383 /**
1384  * amdgpu_device_ip_is_idle - is the hardware IP idle
1385  *
1386  * @adev: amdgpu_device pointer
1387  * @block_type: Type of hardware IP (SMU, GFX, UVD, etc.)
1388  *
1389  * Check if the hardware IP is idle or not.
1390  * Returns true if the IP is idle, false if not.
1391  */
1392 bool amdgpu_device_ip_is_idle(struct amdgpu_device *adev,
1393                               enum amd_ip_block_type block_type)
1394 {
1395         int i;
1396
1397         for (i = 0; i < adev->num_ip_blocks; i++) {
1398                 if (!adev->ip_blocks[i].status.valid)
1399                         continue;
1400                 if (adev->ip_blocks[i].version->type == block_type)
1401                         return adev->ip_blocks[i].version->funcs->is_idle((void *)adev);
1402         }
1403         return true;
1404
1405 }
1406
1407 /**
1408  * amdgpu_device_ip_get_ip_block - get a hw IP pointer
1409  *
1410  * @adev: amdgpu_device pointer
1411  * @type: Type of hardware IP (SMU, GFX, UVD, etc.)
1412  *
1413  * Returns a pointer to the hardware IP block structure
1414  * if it exists for the asic, otherwise NULL.
1415  */
1416 struct amdgpu_ip_block *
1417 amdgpu_device_ip_get_ip_block(struct amdgpu_device *adev,
1418                               enum amd_ip_block_type type)
1419 {
1420         int i;
1421
1422         for (i = 0; i < adev->num_ip_blocks; i++)
1423                 if (adev->ip_blocks[i].version->type == type)
1424                         return &adev->ip_blocks[i];
1425
1426         return NULL;
1427 }
1428
1429 /**
1430  * amdgpu_device_ip_block_version_cmp
1431  *
1432  * @adev: amdgpu_device pointer
1433  * @type: enum amd_ip_block_type
1434  * @major: major version
1435  * @minor: minor version
1436  *
1437  * Returns 0 if the version is equal or greater,
1438  * 1 if it is smaller or the ip_block doesn't exist.
1439  */
1440 int amdgpu_device_ip_block_version_cmp(struct amdgpu_device *adev,
1441                                        enum amd_ip_block_type type,
1442                                        u32 major, u32 minor)
1443 {
1444         struct amdgpu_ip_block *ip_block = amdgpu_device_ip_get_ip_block(adev, type);
1445
1446         if (ip_block && ((ip_block->version->major > major) ||
1447                         ((ip_block->version->major == major) &&
1448                         (ip_block->version->minor >= minor))))
1449                 return 0;
1450
1451         return 1;
1452 }
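
/*
 * Example (illustrative sketch): gating a code path on a minimum GFX IP
 * version with the helper above. The version numbers are placeholders.
 *
 *	if (!amdgpu_device_ip_block_version_cmp(adev, AMD_IP_BLOCK_TYPE_GFX, 9, 0)) {
 *		// GFX IP is at least 9.0
 *	}
 */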
1453
1454 /**
1455  * amdgpu_device_ip_block_add
1456  *
1457  * @adev: amdgpu_device pointer
1458  * @ip_block_version: pointer to the IP to add
1459  *
1460  * Adds the IP block driver information to the collection of IPs
1461  * on the asic.
1462  */
1463 int amdgpu_device_ip_block_add(struct amdgpu_device *adev,
1464                                const struct amdgpu_ip_block_version *ip_block_version)
1465 {
1466         if (!ip_block_version)
1467                 return -EINVAL;
1468
1469         DRM_INFO("add ip block number %d <%s>\n", adev->num_ip_blocks,
1470                   ip_block_version->funcs->name);
1471
1472         adev->ip_blocks[adev->num_ip_blocks++].version = ip_block_version;
1473
1474         return 0;
1475 }
1476
1477 /**
1478  * amdgpu_device_enable_virtual_display - enable virtual display feature
1479  *
1480  * @adev: amdgpu_device pointer
1481  *
1482  * Enables the virtual display feature if the user has enabled it via
1483  * the module parameter virtual_display.  This feature provides a virtual
1484  * display hardware on headless boards or in virtualized environments.
1485  * This function parses and validates the configuration string specified by
1486  * the user and configures the virtual display configuration (number of
1487  * virtual connectors, crtcs, etc.) specified.
1488  */
1489 static void amdgpu_device_enable_virtual_display(struct amdgpu_device *adev)
1490 {
1491         adev->enable_virtual_display = false;
1492
1493         if (amdgpu_virtual_display) {
1494                 struct drm_device *ddev = adev->ddev;
1495                 const char *pci_address_name = pci_name(ddev->pdev);
1496                 char *pciaddstr, *pciaddstr_tmp, *pciaddname_tmp, *pciaddname;
1497
1498                 pciaddstr = kstrdup(amdgpu_virtual_display, GFP_KERNEL);
1499                 pciaddstr_tmp = pciaddstr;
1500                 while ((pciaddname_tmp = strsep(&pciaddstr_tmp, ";"))) {
1501                         pciaddname = strsep(&pciaddname_tmp, ",");
1502                         if (!strcmp("all", pciaddname)
1503                             || !strcmp(pci_address_name, pciaddname)) {
1504                                 long num_crtc;
1505                                 int res = -1;
1506
1507                                 adev->enable_virtual_display = true;
1508
1509                                 if (pciaddname_tmp)
1510                                         res = kstrtol(pciaddname_tmp, 10,
1511                                                       &num_crtc);
1512
1513                                 if (!res) {
1514                                         if (num_crtc < 1)
1515                                                 num_crtc = 1;
1516                                         if (num_crtc > 6)
1517                                                 num_crtc = 6;
1518                                         adev->mode_info.num_crtc = num_crtc;
1519                                 } else {
1520                                         adev->mode_info.num_crtc = 1;
1521                                 }
1522                                 break;
1523                         }
1524                 }
1525
1526                 DRM_INFO("virtual display string:%s, %s:virtual_display:%d, num_crtc:%d\n",
1527                          amdgpu_virtual_display, pci_address_name,
1528                          adev->enable_virtual_display, adev->mode_info.num_crtc);
1529
1530                 kfree(pciaddstr);
1531         }
1532 }
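
/*
 * Example (illustrative sketch): module parameter strings accepted by the
 * parser above. The PCI addresses are placeholders for real devices.
 *
 *	amdgpu.virtual_display=0000:03:00.0,2
 *	amdgpu.virtual_display=all,1
 *	amdgpu.virtual_display=0000:03:00.0,2;0000:04:00.0,1
 */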
1533
1534 /**
1535  * amdgpu_device_parse_gpu_info_fw - parse gpu info firmware
1536  *
1537  * @adev: amdgpu_device pointer
1538  *
1539  * Parses the asic configuration parameters specified in the gpu info
1540  * firmware and makes them available to the driver for use in configuring
1541  * the asic.
1542  * Returns 0 on success, -EINVAL on failure.
1543  */
1544 static int amdgpu_device_parse_gpu_info_fw(struct amdgpu_device *adev)
1545 {
1546         const char *chip_name;
1547         char fw_name[40];
1548         int err;
1549         const struct gpu_info_firmware_header_v1_0 *hdr;
1550
1551         adev->firmware.gpu_info_fw = NULL;
1552
1553         if (adev->discovery_bin) {
1554                 amdgpu_discovery_get_gfx_info(adev);
1555
1556                 /*
1557                  * FIXME: The bounding box is still needed by Navi12, so
1558                  * temporarily read it from gpu_info firmware. Should be dropped
1559                  * when DAL no longer needs it.
1560                  */
1561                 if (adev->asic_type != CHIP_NAVI12)
1562                         return 0;
1563         }
1564
1565         switch (adev->asic_type) {
1566 #ifdef CONFIG_DRM_AMDGPU_SI
1567         case CHIP_VERDE:
1568         case CHIP_TAHITI:
1569         case CHIP_PITCAIRN:
1570         case CHIP_OLAND:
1571         case CHIP_HAINAN:
1572 #endif
1573 #ifdef CONFIG_DRM_AMDGPU_CIK
1574         case CHIP_BONAIRE:
1575         case CHIP_HAWAII:
1576         case CHIP_KAVERI:
1577         case CHIP_KABINI:
1578         case CHIP_MULLINS:
1579 #endif
1580         case CHIP_TOPAZ:
1581         case CHIP_TONGA:
1582         case CHIP_FIJI:
1583         case CHIP_POLARIS10:
1584         case CHIP_POLARIS11:
1585         case CHIP_POLARIS12:
1586         case CHIP_VEGAM:
1587         case CHIP_CARRIZO:
1588         case CHIP_STONEY:
1589         case CHIP_VEGA20:
1590         default:
1591                 return 0;
1592         case CHIP_VEGA10:
1593                 chip_name = "vega10";
1594                 break;
1595         case CHIP_VEGA12:
1596                 chip_name = "vega12";
1597                 break;
1598         case CHIP_RAVEN:
1599                 if (adev->apu_flags & AMD_APU_IS_RAVEN2)
1600                         chip_name = "raven2";
1601                 else if (adev->apu_flags & AMD_APU_IS_PICASSO)
1602                         chip_name = "picasso";
1603                 else
1604                         chip_name = "raven";
1605                 break;
1606         case CHIP_ARCTURUS:
1607                 chip_name = "arcturus";
1608                 break;
1609         case CHIP_RENOIR:
1610                 chip_name = "renoir";
1611                 break;
1612         case CHIP_NAVI10:
1613                 chip_name = "navi10";
1614                 break;
1615         case CHIP_NAVI14:
1616                 chip_name = "navi14";
1617                 break;
1618         case CHIP_NAVI12:
1619                 chip_name = "navi12";
1620                 break;
1621         case CHIP_SIENNA_CICHLID:
1622                 chip_name = "sienna_cichlid";
1623                 break;
1624         }
1625
1626         snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_gpu_info.bin", chip_name);
1627         err = request_firmware(&adev->firmware.gpu_info_fw, fw_name, adev->dev);
1628         if (err) {
1629                 dev_err(adev->dev,
1630                         "Failed to load gpu_info firmware \"%s\"\n",
1631                         fw_name);
1632                 goto out;
1633         }
1634         err = amdgpu_ucode_validate(adev->firmware.gpu_info_fw);
1635         if (err) {
1636                 dev_err(adev->dev,
1637                         "Failed to validate gpu_info firmware \"%s\"\n",
1638                         fw_name);
1639                 goto out;
1640         }
1641
1642         hdr = (const struct gpu_info_firmware_header_v1_0 *)adev->firmware.gpu_info_fw->data;
1643         amdgpu_ucode_print_gpu_info_hdr(&hdr->header);
1644
1645         switch (hdr->version_major) {
1646         case 1:
1647         {
1648                 const struct gpu_info_firmware_v1_0 *gpu_info_fw =
1649                         (const struct gpu_info_firmware_v1_0 *)(adev->firmware.gpu_info_fw->data +
1650                                                                 le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1651
1652                 /*
1653                  * Should be dropped when DAL no longer needs it.
1654                  */
1655                 if (adev->asic_type == CHIP_NAVI12)
1656                         goto parse_soc_bounding_box;
1657
1658                 adev->gfx.config.max_shader_engines = le32_to_cpu(gpu_info_fw->gc_num_se);
1659                 adev->gfx.config.max_cu_per_sh = le32_to_cpu(gpu_info_fw->gc_num_cu_per_sh);
1660                 adev->gfx.config.max_sh_per_se = le32_to_cpu(gpu_info_fw->gc_num_sh_per_se);
1661                 adev->gfx.config.max_backends_per_se = le32_to_cpu(gpu_info_fw->gc_num_rb_per_se);
1662                 adev->gfx.config.max_texture_channel_caches =
1663                         le32_to_cpu(gpu_info_fw->gc_num_tccs);
1664                 adev->gfx.config.max_gprs = le32_to_cpu(gpu_info_fw->gc_num_gprs);
1665                 adev->gfx.config.max_gs_threads = le32_to_cpu(gpu_info_fw->gc_num_max_gs_thds);
1666                 adev->gfx.config.gs_vgt_table_depth = le32_to_cpu(gpu_info_fw->gc_gs_table_depth);
1667                 adev->gfx.config.gs_prim_buffer_depth = le32_to_cpu(gpu_info_fw->gc_gsprim_buff_depth);
1668                 adev->gfx.config.double_offchip_lds_buf =
1669                         le32_to_cpu(gpu_info_fw->gc_double_offchip_lds_buffer);
1670                 adev->gfx.cu_info.wave_front_size = le32_to_cpu(gpu_info_fw->gc_wave_size);
1671                 adev->gfx.cu_info.max_waves_per_simd =
1672                         le32_to_cpu(gpu_info_fw->gc_max_waves_per_simd);
1673                 adev->gfx.cu_info.max_scratch_slots_per_cu =
1674                         le32_to_cpu(gpu_info_fw->gc_max_scratch_slots_per_cu);
1675                 adev->gfx.cu_info.lds_size = le32_to_cpu(gpu_info_fw->gc_lds_size);
1676                 if (hdr->version_minor >= 1) {
1677                         const struct gpu_info_firmware_v1_1 *gpu_info_fw =
1678                                 (const struct gpu_info_firmware_v1_1 *)(adev->firmware.gpu_info_fw->data +
1679                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1680                         adev->gfx.config.num_sc_per_sh =
1681                                 le32_to_cpu(gpu_info_fw->num_sc_per_sh);
1682                         adev->gfx.config.num_packer_per_sc =
1683                                 le32_to_cpu(gpu_info_fw->num_packer_per_sc);
1684                 }
1685
1686 parse_soc_bounding_box:
1687                 /*
1688                  * The SoC bounding box info is not integrated in the discovery
1689                  * table, so it always has to be parsed from the gpu_info firmware.
1690                  */
1691                 if (hdr->version_minor == 2) {
1692                         const struct gpu_info_firmware_v1_2 *gpu_info_fw =
1693                                 (const struct gpu_info_firmware_v1_2 *)(adev->firmware.gpu_info_fw->data +
1694                                                                         le32_to_cpu(hdr->header.ucode_array_offset_bytes));
1695                         adev->dm.soc_bounding_box = &gpu_info_fw->soc_bounding_box;
1696                 }
1697                 break;
1698         }
1699         default:
1700                 dev_err(adev->dev,
1701                         "Unsupported gpu_info table %d\n", hdr->header.ucode_version);
1702                 err = -EINVAL;
1703                 goto out;
1704         }
1705 out:
1706         return err;
1707 }
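/*
 * Illustrative note (not part of the original source): for a chip name of
 * "navi10" the function above requests amdgpu/navi10_gpu_info.bin through the
 * firmware loader (typically served from /lib/firmware/).  A minimal sketch of
 * how a v1_0 field is pulled out of the validated blob, mirroring the parsing
 * above ("data" is a stand-in for adev->firmware.gpu_info_fw->data):
 *
 *   const struct gpu_info_firmware_v1_0 *fw =
 *           (const void *)(data + le32_to_cpu(hdr->header.ucode_array_offset_bytes));
 *   u32 num_se = le32_to_cpu(fw->gc_num_se);   // fields are little-endian on disk
 *
 * The /lib/firmware location is the usual default and may differ per distro.
 */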
1708
1709 /**
1710  * amdgpu_device_ip_early_init - run early init for hardware IPs
1711  *
1712  * @adev: amdgpu_device pointer
1713  *
1714  * Early initialization pass for hardware IPs.  The hardware IPs that make
1715  * up each asic are discovered and each IP's early_init callback is run.  This
1716  * is the first stage in initializing the asic.
1717  * Returns 0 on success, negative error code on failure.
1718  */
1719 static int amdgpu_device_ip_early_init(struct amdgpu_device *adev)
1720 {
1721         int i, r;
1722
1723         amdgpu_device_enable_virtual_display(adev);
1724
1725         switch (adev->asic_type) {
1726 #ifdef CONFIG_DRM_AMDGPU_SI
1727         case CHIP_VERDE:
1728         case CHIP_TAHITI:
1729         case CHIP_PITCAIRN:
1730         case CHIP_OLAND:
1731         case CHIP_HAINAN:
1732                 adev->family = AMDGPU_FAMILY_SI;
1733                 r = si_set_ip_blocks(adev);
1734                 if (r)
1735                         return r;
1736                 break;
1737 #endif
1738 #ifdef CONFIG_DRM_AMDGPU_CIK
1739         case CHIP_BONAIRE:
1740         case CHIP_HAWAII:
1741         case CHIP_KAVERI:
1742         case CHIP_KABINI:
1743         case CHIP_MULLINS:
1744                 if (adev->flags & AMD_IS_APU)
1745                         adev->family = AMDGPU_FAMILY_KV;
1746                 else
1747                         adev->family = AMDGPU_FAMILY_CI;
1748
1749                 r = cik_set_ip_blocks(adev);
1750                 if (r)
1751                         return r;
1752                 break;
1753 #endif
1754         case CHIP_TOPAZ:
1755         case CHIP_TONGA:
1756         case CHIP_FIJI:
1757         case CHIP_POLARIS10:
1758         case CHIP_POLARIS11:
1759         case CHIP_POLARIS12:
1760         case CHIP_VEGAM:
1761         case CHIP_CARRIZO:
1762         case CHIP_STONEY:
1763                 if (adev->flags & AMD_IS_APU)
1764                         adev->family = AMDGPU_FAMILY_CZ;
1765                 else
1766                         adev->family = AMDGPU_FAMILY_VI;
1767
1768                 r = vi_set_ip_blocks(adev);
1769                 if (r)
1770                         return r;
1771                 break;
1772         case CHIP_VEGA10:
1773         case CHIP_VEGA12:
1774         case CHIP_VEGA20:
1775         case CHIP_RAVEN:
1776         case CHIP_ARCTURUS:
1777         case CHIP_RENOIR:
1778                 if (adev->flags & AMD_IS_APU)
1779                         adev->family = AMDGPU_FAMILY_RV;
1780                 else
1781                         adev->family = AMDGPU_FAMILY_AI;
1782
1783                 r = soc15_set_ip_blocks(adev);
1784                 if (r)
1785                         return r;
1786                 break;
1787         case  CHIP_NAVI10:
1788         case  CHIP_NAVI14:
1789         case  CHIP_NAVI12:
1790         case  CHIP_SIENNA_CICHLID:
1791                 adev->family = AMDGPU_FAMILY_NV;
1792
1793                 r = nv_set_ip_blocks(adev);
1794                 if (r)
1795                         return r;
1796                 break;
1797         default:
1798                 /* FIXME: not supported yet */
1799                 return -EINVAL;
1800         }
1801
1802         amdgpu_amdkfd_device_probe(adev);
1803
1804         if (amdgpu_sriov_vf(adev)) {
1805                 /* handle vbios stuff prior to full access mode for the new handshake */
1806                 if (adev->virt.req_init_data_ver == 1) {
1807                         if (!amdgpu_get_bios(adev)) {
1808                                 DRM_ERROR("failed to get vbios\n");
1809                                 return -EINVAL;
1810                         }
1811
1812                         r = amdgpu_atombios_init(adev);
1813                         if (r) {
1814                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1815                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1816                                 return r;
1817                         }
1818                 }
1819         }
1820
1821         /* we need to send REQ_GPU here for the legacy handshake, otherwise the
1822          * vbios will not be prepared by the host for this VF */
1823         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver < 1) {
1824                 r = amdgpu_virt_request_full_gpu(adev, true);
1825                 if (r)
1826                         return r;
1827         }
1828
1829         adev->pm.pp_feature = amdgpu_pp_feature_mask;
1830         if (amdgpu_sriov_vf(adev) || sched_policy == KFD_SCHED_POLICY_NO_HWS)
1831                 adev->pm.pp_feature &= ~PP_GFXOFF_MASK;
1832
1833         for (i = 0; i < adev->num_ip_blocks; i++) {
1834                 if ((amdgpu_ip_block_mask & (1 << i)) == 0) {
1835                         DRM_ERROR("disabled ip block: %d <%s>\n",
1836                                   i, adev->ip_blocks[i].version->funcs->name);
1837                         adev->ip_blocks[i].status.valid = false;
1838                 } else {
1839                         if (adev->ip_blocks[i].version->funcs->early_init) {
1840                                 r = adev->ip_blocks[i].version->funcs->early_init((void *)adev);
1841                                 if (r == -ENOENT) {
1842                                         adev->ip_blocks[i].status.valid = false;
1843                                 } else if (r) {
1844                                         DRM_ERROR("early_init of IP block <%s> failed %d\n",
1845                                                   adev->ip_blocks[i].version->funcs->name, r);
1846                                         return r;
1847                                 } else {
1848                                         adev->ip_blocks[i].status.valid = true;
1849                                 }
1850                         } else {
1851                                 adev->ip_blocks[i].status.valid = true;
1852                         }
1853                 }
1854                 /* get the vbios after the asic_funcs are set up */
1855                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON) {
1856                         r = amdgpu_device_parse_gpu_info_fw(adev);
1857                         if (r)
1858                                 return r;
1859
1860                         /* skip vbios handling for new handshake */
1861                         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver == 1)
1862                                 continue;
1863
1864                         /* Read BIOS */
1865                         if (!amdgpu_get_bios(adev))
1866                                 return -EINVAL;
1867
1868                         r = amdgpu_atombios_init(adev);
1869                         if (r) {
1870                                 dev_err(adev->dev, "amdgpu_atombios_init failed\n");
1871                                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_INIT_FAIL, 0, 0);
1872                                 return r;
1873                         }
1874                 }
1875         }
1876
1877         adev->cg_flags &= amdgpu_cg_mask;
1878         adev->pg_flags &= amdgpu_pg_mask;
1879
1880         return 0;
1881 }
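/*
 * Illustrative note: the amdgpu.ip_block_mask module parameter consulted in
 * the loop above is a bitmask where bit i corresponds to ip_blocks[i]; a
 * cleared bit marks that block invalid and skips its initialization.  For
 * example (the values below are purely illustrative):
 *
 *   modprobe amdgpu ip_block_mask=0xffffffff   // default, all blocks enabled
 *   modprobe amdgpu ip_block_mask=0xfffffffd   // disable ip_blocks[1]
 *
 * Which index maps to which IP depends on the per-ASIC set_ip_blocks()
 * function chosen earlier in this routine.
 */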
1882
1883 static int amdgpu_device_ip_hw_init_phase1(struct amdgpu_device *adev)
1884 {
1885         int i, r;
1886
1887         for (i = 0; i < adev->num_ip_blocks; i++) {
1888                 if (!adev->ip_blocks[i].status.sw)
1889                         continue;
1890                 if (adev->ip_blocks[i].status.hw)
1891                         continue;
1892                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
1893                     (amdgpu_sriov_vf(adev) && (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)) ||
1894                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
1895                         r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1896                         if (r) {
1897                                 DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1898                                           adev->ip_blocks[i].version->funcs->name, r);
1899                                 return r;
1900                         }
1901                         adev->ip_blocks[i].status.hw = true;
1902                 }
1903         }
1904
1905         return 0;
1906 }
1907
1908 static int amdgpu_device_ip_hw_init_phase2(struct amdgpu_device *adev)
1909 {
1910         int i, r;
1911
1912         for (i = 0; i < adev->num_ip_blocks; i++) {
1913                 if (!adev->ip_blocks[i].status.sw)
1914                         continue;
1915                 if (adev->ip_blocks[i].status.hw)
1916                         continue;
1917                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1918                 if (r) {
1919                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1920                                   adev->ip_blocks[i].version->funcs->name, r);
1921                         return r;
1922                 }
1923                 adev->ip_blocks[i].status.hw = true;
1924         }
1925
1926         return 0;
1927 }
1928
1929 static int amdgpu_device_fw_loading(struct amdgpu_device *adev)
1930 {
1931         int r = 0;
1932         int i;
1933         uint32_t smu_version;
1934
1935         if (adev->asic_type >= CHIP_VEGA10) {
1936                 for (i = 0; i < adev->num_ip_blocks; i++) {
1937                         if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_PSP)
1938                                 continue;
1939
1940                         /* no need to do the fw loading again if already done */
1941                         if (adev->ip_blocks[i].status.hw == true)
1942                                 break;
1943
1944                         if (adev->in_gpu_reset || adev->in_suspend) {
1945                                 r = adev->ip_blocks[i].version->funcs->resume(adev);
1946                                 if (r) {
1947                                         DRM_ERROR("resume of IP block <%s> failed %d\n",
1948                                                           adev->ip_blocks[i].version->funcs->name, r);
1949                                         return r;
1950                                 }
1951                         } else {
1952                                 r = adev->ip_blocks[i].version->funcs->hw_init(adev);
1953                                 if (r) {
1954                                         DRM_ERROR("hw_init of IP block <%s> failed %d\n",
1955                                                           adev->ip_blocks[i].version->funcs->name, r);
1956                                         return r;
1957                                 }
1958                         }
1959
1960                         adev->ip_blocks[i].status.hw = true;
1961                         break;
1962                 }
1963         }
1964
1965         if (!amdgpu_sriov_vf(adev) || adev->asic_type == CHIP_TONGA)
1966                 r = amdgpu_pm_load_smu_firmware(adev, &smu_version);
1967
1968         return r;
1969 }
1970
1971 /**
1972  * amdgpu_device_ip_init - run init for hardware IPs
1973  *
1974  * @adev: amdgpu_device pointer
1975  *
1976  * Main initialization pass for hardware IPs.  The list of all the hardware
1977  * IPs that make up the asic is walked and the sw_init and hw_init callbacks
1978  * are run.  sw_init initializes the software state associated with each IP
1979  * and hw_init initializes the hardware associated with each IP.
1980  * Returns 0 on success, negative error code on failure.
1981  */
1982 static int amdgpu_device_ip_init(struct amdgpu_device *adev)
1983 {
1984         int i, r;
1985
1986         r = amdgpu_ras_init(adev);
1987         if (r)
1988                 return r;
1989
1990         if (amdgpu_sriov_vf(adev) && adev->virt.req_init_data_ver > 0) {
1991                 r = amdgpu_virt_request_full_gpu(adev, true);
1992                 if (r)
1993                         return -EAGAIN;
1994         }
1995
1996         for (i = 0; i < adev->num_ip_blocks; i++) {
1997                 if (!adev->ip_blocks[i].status.valid)
1998                         continue;
1999                 r = adev->ip_blocks[i].version->funcs->sw_init((void *)adev);
2000                 if (r) {
2001                         DRM_ERROR("sw_init of IP block <%s> failed %d\n",
2002                                   adev->ip_blocks[i].version->funcs->name, r);
2003                         goto init_failed;
2004                 }
2005                 adev->ip_blocks[i].status.sw = true;
2006
2007                 /* need to do gmc hw init early so we can allocate gpu mem */
2008                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2009                         r = amdgpu_device_vram_scratch_init(adev);
2010                         if (r) {
2011                                 DRM_ERROR("amdgpu_vram_scratch_init failed %d\n", r);
2012                                 goto init_failed;
2013                         }
2014                         r = adev->ip_blocks[i].version->funcs->hw_init((void *)adev);
2015                         if (r) {
2016                                 DRM_ERROR("hw_init %d failed %d\n", i, r);
2017                                 goto init_failed;
2018                         }
2019                         r = amdgpu_device_wb_init(adev);
2020                         if (r) {
2021                                 DRM_ERROR("amdgpu_device_wb_init failed %d\n", r);
2022                                 goto init_failed;
2023                         }
2024                         adev->ip_blocks[i].status.hw = true;
2025
2026                         /* right after GMC hw init, we create CSA */
2027                         if (amdgpu_mcbp || amdgpu_sriov_vf(adev)) {
2028                                 r = amdgpu_allocate_static_csa(adev, &adev->virt.csa_obj,
2029                                                                 AMDGPU_GEM_DOMAIN_VRAM,
2030                                                                 AMDGPU_CSA_SIZE);
2031                                 if (r) {
2032                                         DRM_ERROR("allocate CSA failed %d\n", r);
2033                                         goto init_failed;
2034                                 }
2035                         }
2036                 }
2037         }
2038
2039         if (amdgpu_sriov_vf(adev))
2040                 amdgpu_virt_init_data_exchange(adev);
2041
2042         r = amdgpu_ib_pool_init(adev);
2043         if (r) {
2044                 dev_err(adev->dev, "IB initialization failed (%d).\n", r);
2045                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_IB_INIT_FAIL, 0, r);
2046                 goto init_failed;
2047         }
2048
2049         r = amdgpu_ucode_create_bo(adev); /* create ucode bo when sw_init complete */
2050         if (r)
2051                 goto init_failed;
2052
2053         r = amdgpu_device_ip_hw_init_phase1(adev);
2054         if (r)
2055                 goto init_failed;
2056
2057         r = amdgpu_device_fw_loading(adev);
2058         if (r)
2059                 goto init_failed;
2060
2061         r = amdgpu_device_ip_hw_init_phase2(adev);
2062         if (r)
2063                 goto init_failed;
2064
2065         /*
2066          * retired pages will be loaded from eeprom and reserved here,
2067          * it should be called after amdgpu_device_ip_hw_init_phase2 since
2068          * for some ASICs the RAS EEPROM code relies on the SMU being fully
2069          * functional for I2C communication, which is only true at this point.
2070          * recovery_init may fail, but it can free all resources allocated by
2071          * itself and its failure should not stop the amdgpu init process.
2072          *
2073          * Note: theoretically, this should be called before all vram allocations
2074          * to protect retired pages from being abused.
2075          */
2076         amdgpu_ras_recovery_init(adev);
2077
2078         if (adev->gmc.xgmi.num_physical_nodes > 1)
2079                 amdgpu_xgmi_add_device(adev);
2080         amdgpu_amdkfd_device_init(adev);
2081
2082         amdgpu_fru_get_product_info(adev);
2083
2084 init_failed:
2085         if (amdgpu_sriov_vf(adev))
2086                 amdgpu_virt_release_full_gpu(adev, true);
2087
2088         return r;
2089 }
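/*
 * Descriptive sketch of the init ordering implemented above (a summary, not
 * additional driver code):
 *
 *   sw_init for every valid IP block
 *     -> GMC hw_init early (plus vram scratch, writeback and CSA setup)
 *   hw_init phase 1 (COMMON, IH, and PSP when running as an SR-IOV VF)
 *   firmware loading (PSP/SMU)
 *   hw_init phase 2 (all remaining blocks)
 *   RAS recovery init, XGMI topology, KFD init
 */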
2090
2091 /**
2092  * amdgpu_device_fill_reset_magic - writes reset magic to gart pointer
2093  *
2094  * @adev: amdgpu_device pointer
2095  *
2096  * Writes a reset magic value to the gart pointer in VRAM.  The driver calls
2097  * this function before a GPU reset.  If the value is retained after a
2098  * GPU reset, VRAM has not been lost.  Some GPU resets may destroy VRAM contents.
2099  */
2100 static void amdgpu_device_fill_reset_magic(struct amdgpu_device *adev)
2101 {
2102         memcpy(adev->reset_magic, adev->gart.ptr, AMDGPU_RESET_MAGIC_NUM);
2103 }
2104
2105 /**
2106  * amdgpu_device_check_vram_lost - check if vram is valid
2107  *
2108  * @adev: amdgpu_device pointer
2109  *
2110  * Checks the reset magic value written to the gart pointer in VRAM.
2111  * The driver calls this after a GPU reset to see if the contents of
2112  * VRAM are lost or not.
2113  * Returns true if vram is lost, false if not.
2114  */
2115 static bool amdgpu_device_check_vram_lost(struct amdgpu_device *adev)
2116 {
2117         if (memcmp(adev->gart.ptr, adev->reset_magic,
2118                         AMDGPU_RESET_MAGIC_NUM))
2119                 return true;
2120
2121         if (!adev->in_gpu_reset)
2122                 return false;
2123
2124         /*
2125          * For all ASICs with baco/mode1 reset, the VRAM is
2126          * always assumed to be lost.
2127          */
2128         switch (amdgpu_asic_reset_method(adev)) {
2129         case AMD_RESET_METHOD_BACO:
2130         case AMD_RESET_METHOD_MODE1:
2131                 return true;
2132         default:
2133                 return false;
2134         }
2135 }
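/*
 * Illustrative call sequence (a sketch of how the two helpers above are meant
 * to be used around a reset): the magic is written while the GPU is known to
 * be healthy and compared after the reset to decide whether VRAM contents
 * must be recovered:
 *
 *   amdgpu_device_fill_reset_magic(adev);            // while healthy
 *   ...                                              // GPU reset happens here
 *   vram_lost = amdgpu_device_check_vram_lost(adev);
 *   if (vram_lost)
 *           ...                                      // re-validate/restore VRAM
 *
 * When the device is in a GPU reset, BACO and mode1 resets are treated as
 * having lost VRAM even if the magic still matches.
 */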
2136
2137 /**
2138  * amdgpu_device_set_cg_state - set clockgating for amdgpu device
2139  *
2140  * @adev: amdgpu_device pointer
2141  * @state: clockgating state (gate or ungate)
2142  *
2143  * The list of all the hardware IPs that make up the asic is walked and the
2144  * set_clockgating_state callbacks are run.  The late initialization pass
2145  * enables clockgating for the hardware IPs; the fini or suspend pass
2146  * disables it.
2147  * Returns 0 on success, negative error code on failure.
2148  */
2149
2150 static int amdgpu_device_set_cg_state(struct amdgpu_device *adev,
2151                                                 enum amd_clockgating_state state)
2152 {
2153         int i, j, r;
2154
2155         if (amdgpu_emu_mode == 1)
2156                 return 0;
2157
2158         for (j = 0; j < adev->num_ip_blocks; j++) {
2159                 i = state == AMD_CG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2160                 if (!adev->ip_blocks[i].status.late_initialized)
2161                         continue;
2162                 /* skip CG for VCE/UVD, it's handled specially */
2163                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2164                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2165                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2166                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2167                     adev->ip_blocks[i].version->funcs->set_clockgating_state) {
2168                         /* enable clockgating to save power */
2169                         r = adev->ip_blocks[i].version->funcs->set_clockgating_state((void *)adev,
2170                                                                                      state);
2171                         if (r) {
2172                                 DRM_ERROR("set_clockgating_state(gate) of IP block <%s> failed %d\n",
2173                                           adev->ip_blocks[i].version->funcs->name, r);
2174                                 return r;
2175                         }
2176                 }
2177         }
2178
2179         return 0;
2180 }
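/*
 * Illustrative usage (a sketch of how this helper is driven elsewhere in this
 * file): clockgating is enabled walking the IP list forward during late init
 * and disabled walking it in reverse during fini/suspend:
 *
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);    // late init
 *   amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);  // fini / suspend
 *
 * amdgpu_device_set_pg_state() below follows the same pattern for
 * powergating.
 */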
2181
2182 static int amdgpu_device_set_pg_state(struct amdgpu_device *adev, enum amd_powergating_state state)
2183 {
2184         int i, j, r;
2185
2186         if (amdgpu_emu_mode == 1)
2187                 return 0;
2188
2189         for (j = 0; j < adev->num_ip_blocks; j++) {
2190                 i = state == AMD_PG_STATE_GATE ? j : adev->num_ip_blocks - j - 1;
2191                 if (!adev->ip_blocks[i].status.late_initialized)
2192                         continue;
2193                 /* skip PG for VCE/UVD, it's handled specially */
2194                 if (adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_UVD &&
2195                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCE &&
2196                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_VCN &&
2197                     adev->ip_blocks[i].version->type != AMD_IP_BLOCK_TYPE_JPEG &&
2198                     adev->ip_blocks[i].version->funcs->set_powergating_state) {
2199                         /* enable powergating to save power */
2200                         r = adev->ip_blocks[i].version->funcs->set_powergating_state((void *)adev,
2201                                                                                         state);
2202                         if (r) {
2203                                 DRM_ERROR("set_powergating_state(gate) of IP block <%s> failed %d\n",
2204                                           adev->ip_blocks[i].version->funcs->name, r);
2205                                 return r;
2206                         }
2207                 }
2208         }
2209         return 0;
2210 }
2211
2212 static int amdgpu_device_enable_mgpu_fan_boost(void)
2213 {
2214         struct amdgpu_gpu_instance *gpu_ins;
2215         struct amdgpu_device *adev;
2216         int i, ret = 0;
2217
2218         mutex_lock(&mgpu_info.mutex);
2219
2220         /*
2221          * MGPU fan boost feature should be enabled
2222          * only when there are two or more dGPUs in
2223          * the system
2224          */
2225         if (mgpu_info.num_dgpu < 2)
2226                 goto out;
2227
2228         for (i = 0; i < mgpu_info.num_dgpu; i++) {
2229                 gpu_ins = &(mgpu_info.gpu_ins[i]);
2230                 adev = gpu_ins->adev;
2231                 if (!(adev->flags & AMD_IS_APU) &&
2232                     !gpu_ins->mgpu_fan_enabled &&
2233                     adev->powerplay.pp_funcs &&
2234                     adev->powerplay.pp_funcs->enable_mgpu_fan_boost) {
2235                         ret = amdgpu_dpm_enable_mgpu_fan_boost(adev);
2236                         if (ret)
2237                                 break;
2238
2239                         gpu_ins->mgpu_fan_enabled = 1;
2240                 }
2241         }
2242
2243 out:
2244         mutex_unlock(&mgpu_info.mutex);
2245
2246         return ret;
2247 }
2248
2249 /**
2250  * amdgpu_device_ip_late_init - run late init for hardware IPs
2251  *
2252  * @adev: amdgpu_device pointer
2253  *
2254  * Late initialization pass for hardware IPs.  The list of all the hardware
2255  * IPs that make up the asic is walked and the late_init callbacks are run.
2256  * late_init covers any special initialization that an IP requires
2257  * after all of the IPs have been initialized or something that needs to happen
2258  * late in the init process.
2259  * Returns 0 on success, negative error code on failure.
2260  */
2261 static int amdgpu_device_ip_late_init(struct amdgpu_device *adev)
2262 {
2263         struct amdgpu_gpu_instance *gpu_instance;
2264         int i = 0, r;
2265
2266         for (i = 0; i < adev->num_ip_blocks; i++) {
2267                 if (!adev->ip_blocks[i].status.hw)
2268                         continue;
2269                 if (adev->ip_blocks[i].version->funcs->late_init) {
2270                         r = adev->ip_blocks[i].version->funcs->late_init((void *)adev);
2271                         if (r) {
2272                                 DRM_ERROR("late_init of IP block <%s> failed %d\n",
2273                                           adev->ip_blocks[i].version->funcs->name, r);
2274                                 return r;
2275                         }
2276                 }
2277                 adev->ip_blocks[i].status.late_initialized = true;
2278         }
2279
2280         amdgpu_ras_set_error_query_ready(adev, true);
2281
2282         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_GATE);
2283         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_GATE);
2284
2285         amdgpu_device_fill_reset_magic(adev);
2286
2287         r = amdgpu_device_enable_mgpu_fan_boost();
2288         if (r)
2289                 DRM_ERROR("enable mgpu fan boost failed (%d).\n", r);
2290
2291
2292         if (adev->gmc.xgmi.num_physical_nodes > 1) {
2293                 mutex_lock(&mgpu_info.mutex);
2294
2295                 /*
2296                  * Reset the device p-state to low, as it was booted with high.
2297                  *
2298                  * This should be performed only after all devices from the same
2299                  * hive get initialized.
2300                  *
2301                  * However, the number of devices in a hive is not known in
2302                  * advance; it is counted one by one as the devices initialize.
2303                  *
2304                  * So we wait until all XGMI interlinked devices are initialized.
2305                  * This may add some delay as those devices may come from
2306                  * different hives, but that should be OK.
2307                  */
2308                 if (mgpu_info.num_dgpu == adev->gmc.xgmi.num_physical_nodes) {
2309                         for (i = 0; i < mgpu_info.num_gpu; i++) {
2310                                 gpu_instance = &(mgpu_info.gpu_ins[i]);
2311                                 if (gpu_instance->adev->flags & AMD_IS_APU)
2312                                         continue;
2313
2314                                 r = amdgpu_xgmi_set_pstate(gpu_instance->adev,
2315                                                 AMDGPU_XGMI_PSTATE_MIN);
2316                                 if (r) {
2317                                         DRM_ERROR("pstate setting failed (%d).\n", r);
2318                                         break;
2319                                 }
2320                         }
2321                 }
2322
2323                 mutex_unlock(&mgpu_info.mutex);
2324         }
2325
2326         return 0;
2327 }
2328
2329 /**
2330  * amdgpu_device_ip_fini - run fini for hardware IPs
2331  *
2332  * @adev: amdgpu_device pointer
2333  *
2334  * Main teardown pass for hardware IPs.  The list of all the hardware
2335  * IPs that make up the asic is walked and the hw_fini and sw_fini callbacks
2336  * are run.  hw_fini tears down the hardware associated with each IP
2337  * and sw_fini tears down any software state associated with each IP.
2338  * Returns 0 on success, negative error code on failure.
2339  */
2340 static int amdgpu_device_ip_fini(struct amdgpu_device *adev)
2341 {
2342         int i, r;
2343
2344         if (amdgpu_sriov_vf(adev) && adev->virt.ras_init_done)
2345                 amdgpu_virt_release_ras_err_handler_data(adev);
2346
2347         amdgpu_ras_pre_fini(adev);
2348
2349         if (adev->gmc.xgmi.num_physical_nodes > 1)
2350                 amdgpu_xgmi_remove_device(adev);
2351
2352         amdgpu_amdkfd_device_fini(adev);
2353
2354         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2355         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2356
2357         /* need to disable SMC first */
2358         for (i = 0; i < adev->num_ip_blocks; i++) {
2359                 if (!adev->ip_blocks[i].status.hw)
2360                         continue;
2361                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2362                         r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2363                         /* XXX handle errors */
2364                         if (r) {
2365                                 DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2366                                           adev->ip_blocks[i].version->funcs->name, r);
2367                         }
2368                         adev->ip_blocks[i].status.hw = false;
2369                         break;
2370                 }
2371         }
2372
2373         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2374                 if (!adev->ip_blocks[i].status.hw)
2375                         continue;
2376
2377                 r = adev->ip_blocks[i].version->funcs->hw_fini((void *)adev);
2378                 /* XXX handle errors */
2379                 if (r) {
2380                         DRM_DEBUG("hw_fini of IP block <%s> failed %d\n",
2381                                   adev->ip_blocks[i].version->funcs->name, r);
2382                 }
2383
2384                 adev->ip_blocks[i].status.hw = false;
2385         }
2386
2387
2388         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2389                 if (!adev->ip_blocks[i].status.sw)
2390                         continue;
2391
2392                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) {
2393                         amdgpu_ucode_free_bo(adev);
2394                         amdgpu_free_static_csa(&adev->virt.csa_obj);
2395                         amdgpu_device_wb_fini(adev);
2396                         amdgpu_device_vram_scratch_fini(adev);
2397                         amdgpu_ib_pool_fini(adev);
2398                 }
2399
2400                 r = adev->ip_blocks[i].version->funcs->sw_fini((void *)adev);
2401                 /* XXX handle errors */
2402                 if (r) {
2403                         DRM_DEBUG("sw_fini of IP block <%s> failed %d\n",
2404                                   adev->ip_blocks[i].version->funcs->name, r);
2405                 }
2406                 adev->ip_blocks[i].status.sw = false;
2407                 adev->ip_blocks[i].status.valid = false;
2408         }
2409
2410         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2411                 if (!adev->ip_blocks[i].status.late_initialized)
2412                         continue;
2413                 if (adev->ip_blocks[i].version->funcs->late_fini)
2414                         adev->ip_blocks[i].version->funcs->late_fini((void *)adev);
2415                 adev->ip_blocks[i].status.late_initialized = false;
2416         }
2417
2418         amdgpu_ras_fini(adev);
2419
2420         if (amdgpu_sriov_vf(adev))
2421                 if (amdgpu_virt_release_full_gpu(adev, false))
2422                         DRM_ERROR("failed to release exclusive mode on fini\n");
2423
2424         return 0;
2425 }
2426
2427 /**
2428  * amdgpu_device_delayed_init_work_handler - work handler for IB tests
2429  *
2430  * @work: work_struct.
2431  */
2432 static void amdgpu_device_delayed_init_work_handler(struct work_struct *work)
2433 {
2434         struct amdgpu_device *adev =
2435                 container_of(work, struct amdgpu_device, delayed_init_work.work);
2436         int r;
2437
2438         r = amdgpu_ib_ring_tests(adev);
2439         if (r)
2440                 DRM_ERROR("ib ring test failed (%d).\n", r);
2441 }
2442
2443 static void amdgpu_device_delay_enable_gfx_off(struct work_struct *work)
2444 {
2445         struct amdgpu_device *adev =
2446                 container_of(work, struct amdgpu_device, gfx.gfx_off_delay_work.work);
2447
2448         mutex_lock(&adev->gfx.gfx_off_mutex);
2449         if (!adev->gfx.gfx_off_state && !adev->gfx.gfx_off_req_count) {
2450                 if (!amdgpu_dpm_set_powergating_by_smu(adev, AMD_IP_BLOCK_TYPE_GFX, true))
2451                         adev->gfx.gfx_off_state = true;
2452         }
2453         mutex_unlock(&adev->gfx.gfx_off_mutex);
2454 }
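/*
 * Descriptive note (an assumption about the wider GFXOFF flow, see
 * amdgpu_gfx_off_ctrl() in amdgpu_gfx.c): callers that need the GFX block
 * powered raise gfx_off_req_count, which prevents this delayed work from
 * gating GFX; once the count drops back to zero the work is (re)scheduled
 * and, after the delay, asks the SMU to powergate GFX via the call above.
 */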
2455
2456 /**
2457  * amdgpu_device_ip_suspend_phase1 - run suspend for hardware IPs (phase 1)
2458  *
2459  * @adev: amdgpu_device pointer
2460  *
2461  * First suspend pass for hardware IPs.  Clockgating and powergating are
2462  * disabled and the suspend callbacks are run for the display (DCE) hardware
2463  * only; the remaining IPs are handled in phase 2.  suspend puts the hardware
2464  * and software state in each IP into a state suitable for suspend.
2465  * Returns 0 on success, negative error code on failure.
2466  */
2467 static int amdgpu_device_ip_suspend_phase1(struct amdgpu_device *adev)
2468 {
2469         int i, r;
2470
2471         amdgpu_device_set_pg_state(adev, AMD_PG_STATE_UNGATE);
2472         amdgpu_device_set_cg_state(adev, AMD_CG_STATE_UNGATE);
2473
2474         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2475                 if (!adev->ip_blocks[i].status.valid)
2476                         continue;
2477                 /* displays are handled separately */
2478                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) {
2479                         /* XXX handle errors */
2480                         r = adev->ip_blocks[i].version->funcs->suspend(adev);
2481                         /* XXX handle errors */
2482                         if (r) {
2483                                 DRM_ERROR("suspend of IP block <%s> failed %d\n",
2484                                           adev->ip_blocks[i].version->funcs->name, r);
2485                                 return r;
2486                         }
2487                         adev->ip_blocks[i].status.hw = false;
2488                 }
2489         }
2490
2491         return 0;
2492 }
2493
2494 /**
2495  * amdgpu_device_ip_suspend_phase2 - run suspend for hardware IPs (phase 2)
2496  *
2497  * @adev: amdgpu_device pointer
2498  *
2499  * Second suspend pass for hardware IPs.  The list of all the hardware IPs
2500  * is walked in reverse order and the suspend callbacks are run for every IP
2501  * except the displays, which were handled in phase 1.  suspend puts the
2502  * hardware and software state in each IP into a state suitable for suspend.
2503  * Returns 0 on success, negative error code on failure.
2504  */
2505 static int amdgpu_device_ip_suspend_phase2(struct amdgpu_device *adev)
2506 {
2507         int i, r;
2508
2509         for (i = adev->num_ip_blocks - 1; i >= 0; i--) {
2510                 if (!adev->ip_blocks[i].status.valid)
2511                         continue;
2512                 /* displays are handled in phase1 */
2513                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE)
2514                         continue;
2515                 /* PSP lost connection when err_event_athub occurs */
2516                 if (amdgpu_ras_intr_triggered() &&
2517                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
2518                         adev->ip_blocks[i].status.hw = false;
2519                         continue;
2520                 }
2521                 /* XXX handle errors */
2522                 r = adev->ip_blocks[i].version->funcs->suspend(adev);
2523                 /* XXX handle errors */
2524                 if (r) {
2525                         DRM_ERROR("suspend of IP block <%s> failed %d\n",
2526                                   adev->ip_blocks[i].version->funcs->name, r);
2527                 }
2528                 adev->ip_blocks[i].status.hw = false;
2529                 /* handle putting the SMC in the appropriate state */
2530                 if (!amdgpu_sriov_vf(adev)) {
2531                         if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) {
2532                                 r = amdgpu_dpm_set_mp1_state(adev, adev->mp1_state);
2533                                 if (r) {
2534                                         DRM_ERROR("SMC failed to set mp1 state %d, %d\n",
2535                                                         adev->mp1_state, r);
2536                                         return r;
2537                                 }
2538                         }
2539                 }
2540                 adev->ip_blocks[i].status.hw = false;
2541         }
2542
2543         return 0;
2544 }
2545
2546 /**
2547  * amdgpu_device_ip_suspend - run suspend for hardware IPs
2548  *
2549  * @adev: amdgpu_device pointer
2550  *
2551  * Main suspend function for hardware IPs.  The list of all the hardware
2552  * IPs that make up the asic is walked, clockgating is disabled and the
2553  * suspend callbacks are run.  suspend puts the hardware and software state
2554  * in each IP into a state suitable for suspend.
2555  * Returns 0 on success, negative error code on failure.
2556  */
2557 int amdgpu_device_ip_suspend(struct amdgpu_device *adev)
2558 {
2559         int r;
2560
2561         if (amdgpu_sriov_vf(adev))
2562                 amdgpu_virt_request_full_gpu(adev, false);
2563
2564         r = amdgpu_device_ip_suspend_phase1(adev);
2565         if (r)
2566                 return r;
2567         r = amdgpu_device_ip_suspend_phase2(adev);
2568
2569         if (amdgpu_sriov_vf(adev))
2570                 amdgpu_virt_release_full_gpu(adev, false);
2571
2572         return r;
2573 }
2574
2575 static int amdgpu_device_ip_reinit_early_sriov(struct amdgpu_device *adev)
2576 {
2577         int i, r;
2578
2579         static enum amd_ip_block_type ip_order[] = {
2580                 AMD_IP_BLOCK_TYPE_GMC,
2581                 AMD_IP_BLOCK_TYPE_COMMON,
2582                 AMD_IP_BLOCK_TYPE_PSP,
2583                 AMD_IP_BLOCK_TYPE_IH,
2584         };
2585
2586         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2587                 int j;
2588                 struct amdgpu_ip_block *block;
2589
2590                 for (j = 0; j < adev->num_ip_blocks; j++) {
2591                         block = &adev->ip_blocks[j];
2592
2593                         block->status.hw = false;
2594                         if (block->version->type != ip_order[i] ||
2595                                 !block->status.valid)
2596                                 continue;
2597
2598                         r = block->version->funcs->hw_init(adev);
2599                         DRM_INFO("RE-INIT-early: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2600                         if (r)
2601                                 return r;
2602                         block->status.hw = true;
2603                 }
2604         }
2605
2606         return 0;
2607 }
2608
2609 static int amdgpu_device_ip_reinit_late_sriov(struct amdgpu_device *adev)
2610 {
2611         int i, r;
2612
2613         static enum amd_ip_block_type ip_order[] = {
2614                 AMD_IP_BLOCK_TYPE_SMC,
2615                 AMD_IP_BLOCK_TYPE_DCE,
2616                 AMD_IP_BLOCK_TYPE_GFX,
2617                 AMD_IP_BLOCK_TYPE_SDMA,
2618                 AMD_IP_BLOCK_TYPE_UVD,
2619                 AMD_IP_BLOCK_TYPE_VCE,
2620                 AMD_IP_BLOCK_TYPE_VCN
2621         };
2622
2623         for (i = 0; i < ARRAY_SIZE(ip_order); i++) {
2624                 int j;
2625                 struct amdgpu_ip_block *block;
2626
2627                 for (j = 0; j < adev->num_ip_blocks; j++) {
2628                         block = &adev->ip_blocks[j];
2629
2630                         if (block->version->type != ip_order[i] ||
2631                                 !block->status.valid ||
2632                                 block->status.hw)
2633                                 continue;
2634
2635                         if (block->version->type == AMD_IP_BLOCK_TYPE_SMC)
2636                                 r = block->version->funcs->resume(adev);
2637                         else
2638                                 r = block->version->funcs->hw_init(adev);
2639
2640                         DRM_INFO("RE-INIT-late: %s %s\n", block->version->funcs->name, r ? "failed" : "succeeded");
2641                         if (r)
2642                                 return r;
2643                         block->status.hw = true;
2644                 }
2645         }
2646
2647         return 0;
2648 }
2649
2650 /**
2651  * amdgpu_device_ip_resume_phase1 - run resume for hardware IPs
2652  *
2653  * @adev: amdgpu_device pointer
2654  *
2655  * First resume function for hardware IPs.  The list of all the hardware
2656  * IPs that make up the asic is walked and the resume callbacks are run for
2657  * COMMON, GMC, and IH.  resume puts the hardware into a functional state
2658  * after a suspend and updates the software state as necessary.  This
2659  * function is also used for restoring the GPU after a GPU reset.
2660  * Returns 0 on success, negative error code on failure.
2661  */
2662 static int amdgpu_device_ip_resume_phase1(struct amdgpu_device *adev)
2663 {
2664         int i, r;
2665
2666         for (i = 0; i < adev->num_ip_blocks; i++) {
2667                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2668                         continue;
2669                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2670                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2671                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH) {
2672
2673                         r = adev->ip_blocks[i].version->funcs->resume(adev);
2674                         if (r) {
2675                                 DRM_ERROR("resume of IP block <%s> failed %d\n",
2676                                           adev->ip_blocks[i].version->funcs->name, r);
2677                                 return r;
2678                         }
2679                         adev->ip_blocks[i].status.hw = true;
2680                 }
2681         }
2682
2683         return 0;
2684 }
2685
2686 /**
2687  * amdgpu_device_ip_resume_phase2 - run resume for hardware IPs
2688  *
2689  * @adev: amdgpu_device pointer
2690  *
2691  * Second resume function for hardware IPs.  The list of all the hardware
2692  * IPs that make up the asic is walked and the resume callbacks are run for
2693  * all blocks except COMMON, GMC, and IH.  resume puts the hardware into a
2694  * functional state after a suspend and updates the software state as
2695  * necessary.  This function is also used for restoring the GPU after a GPU
2696  * reset.
2697  * Returns 0 on success, negative error code on failure.
2698  */
2699 static int amdgpu_device_ip_resume_phase2(struct amdgpu_device *adev)
2700 {
2701         int i, r;
2702
2703         for (i = 0; i < adev->num_ip_blocks; i++) {
2704                 if (!adev->ip_blocks[i].status.valid || adev->ip_blocks[i].status.hw)
2705                         continue;
2706                 if (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_COMMON ||
2707                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC ||
2708                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_IH ||
2709                     adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP)
2710                         continue;
2711                 r = adev->ip_blocks[i].version->funcs->resume(adev);
2712                 if (r) {
2713                         DRM_ERROR("resume of IP block <%s> failed %d\n",
2714                                   adev->ip_blocks[i].version->funcs->name, r);
2715                         return r;
2716                 }
2717                 adev->ip_blocks[i].status.hw = true;
2718         }
2719
2720         return 0;
2721 }
2722
2723 /**
2724  * amdgpu_device_ip_resume - run resume for hardware IPs
2725  *
2726  * @adev: amdgpu_device pointer
2727  *
2728  * Main resume function for hardware IPs.  The hardware IPs
2729  * are split into two resume functions because they are
2730  * also used in recovering from a GPU reset and some additional
2731  * steps need to be taken between them.  In this case (S3/S4) they are
2732  * run sequentially.
2733  * Returns 0 on success, negative error code on failure.
2734  */
2735 static int amdgpu_device_ip_resume(struct amdgpu_device *adev)
2736 {
2737         int r;
2738
2739         r = amdgpu_device_ip_resume_phase1(adev);
2740         if (r)
2741                 return r;
2742
2743         r = amdgpu_device_fw_loading(adev);
2744         if (r)
2745                 return r;
2746
2747         r = amdgpu_device_ip_resume_phase2(adev);
2748
2749         return r;
2750 }
2751
2752 /**
2753  * amdgpu_device_detect_sriov_bios - determine if the board supports SR-IOV
2754  *
2755  * @adev: amdgpu_device pointer
2756  *
2757  * Query the VBIOS data tables to determine if the board supports SR-IOV.
2758  */
2759 static void amdgpu_device_detect_sriov_bios(struct amdgpu_device *adev)
2760 {
2761         if (amdgpu_sriov_vf(adev)) {
2762                 if (adev->is_atom_fw) {
2763                         if (amdgpu_atomfirmware_gpu_supports_virtualization(adev))
2764                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2765                 } else {
2766                         if (amdgpu_atombios_has_gpu_virtualization_table(adev))
2767                                 adev->virt.caps |= AMDGPU_SRIOV_CAPS_SRIOV_VBIOS;
2768                 }
2769
2770                 if (!(adev->virt.caps & AMDGPU_SRIOV_CAPS_SRIOV_VBIOS))
2771                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_NO_VBIOS, 0, 0);
2772         }
2773 }
2774
2775 /**
2776  * amdgpu_device_asic_has_dc_support - determine if DC supports the asic
2777  *
2778  * @asic_type: AMD asic type
2779  *
2780  * Check if there is DC (new modesetting infrastructure) support for an asic.
2781  * Returns true if DC has support, false if not.
2782  */
2783 bool amdgpu_device_asic_has_dc_support(enum amd_asic_type asic_type)
2784 {
2785         switch (asic_type) {
2786 #if defined(CONFIG_DRM_AMD_DC)
2787         case CHIP_BONAIRE:
2788         case CHIP_KAVERI:
2789         case CHIP_KABINI:
2790         case CHIP_MULLINS:
2791                 /*
2792                  * We have systems in the wild with these ASICs that require
2793                  * LVDS and VGA support which is not supported with DC.
2794                  *
2795                  * Fallback to the non-DC driver here by default so as not to
2796                  * cause regressions.
2797                  */
2798                 return amdgpu_dc > 0;
2799         case CHIP_HAWAII:
2800         case CHIP_CARRIZO:
2801         case CHIP_STONEY:
2802         case CHIP_POLARIS10:
2803         case CHIP_POLARIS11:
2804         case CHIP_POLARIS12:
2805         case CHIP_VEGAM:
2806         case CHIP_TONGA:
2807         case CHIP_FIJI:
2808         case CHIP_VEGA10:
2809         case CHIP_VEGA12:
2810         case CHIP_VEGA20:
2811 #if defined(CONFIG_DRM_AMD_DC_DCN)
2812         case CHIP_RAVEN:
2813         case CHIP_NAVI10:
2814         case CHIP_NAVI14:
2815         case CHIP_NAVI12:
2816         case CHIP_RENOIR:
2817 #endif
2818 #if defined(CONFIG_DRM_AMD_DC_DCN3_0)
2819         case CHIP_SIENNA_CICHLID:
2820 #endif
2821                 return amdgpu_dc != 0;
2822 #endif
2823         default:
2824                 if (amdgpu_dc > 0)
2825                         DRM_INFO("Display Core has been requested via kernel parameter "
2826                                          "but isn't supported by ASIC, ignoring\n");
2827                 return false;
2828         }
2829 }
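/*
 * Illustrative note: the amdgpu_dc value tested above comes from the
 * amdgpu.dc module parameter (-1 = per-ASIC default, 0 = force the legacy
 * display path, 1 = force DC where the kernel config supports it), e.g.:
 *
 *   modprobe amdgpu dc=1    // request DC even on the older DCE ASICs above
 *   modprobe amdgpu dc=0    // stay on the non-DC display code
 *
 * The parameter description lives in amdgpu_drv.c; the semantics above are
 * inferred from this function.
 */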
2830
2831 /**
2832  * amdgpu_device_has_dc_support - check if dc is supported
2833  *
2834  * @adev: amdgpu_device pointer
2835  *
2836  * Returns true for supported, false for not supported
2837  */
2838 bool amdgpu_device_has_dc_support(struct amdgpu_device *adev)
2839 {
2840         if (amdgpu_sriov_vf(adev))
2841                 return false;
2842
2843         return amdgpu_device_asic_has_dc_support(adev->asic_type);
2844 }
2845
2846
2847 static void amdgpu_device_xgmi_reset_func(struct work_struct *__work)
2848 {
2849         struct amdgpu_device *adev =
2850                 container_of(__work, struct amdgpu_device, xgmi_reset_work);
2851         struct amdgpu_hive_info *hive = amdgpu_get_xgmi_hive(adev, 0);
2852
2853         /* It's a bug to not have a hive within this function */
2854         if (WARN_ON(!hive))
2855                 return;
2856
2857         /*
2858          * Use task barrier to synchronize all xgmi reset works across the
2859          * hive. task_barrier_enter and task_barrier_exit will block
2860          * until all the threads running the xgmi reset works reach
2861          * those points. task_barrier_full will do both blocks.
2862          */
2863         if (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO) {
2864
2865                 task_barrier_enter(&hive->tb);
2866                 adev->asic_reset_res = amdgpu_device_baco_enter(adev->ddev);
2867
2868                 if (adev->asic_reset_res)
2869                         goto fail;
2870
2871                 task_barrier_exit(&hive->tb);
2872                 adev->asic_reset_res = amdgpu_device_baco_exit(adev->ddev);
2873
2874                 if (adev->asic_reset_res)
2875                         goto fail;
2876
2877                 if (adev->mmhub.funcs && adev->mmhub.funcs->reset_ras_error_count)
2878                         adev->mmhub.funcs->reset_ras_error_count(adev);
2879         } else {
2880
2881                 task_barrier_full(&hive->tb);
2882                 adev->asic_reset_res =  amdgpu_asic_reset(adev);
2883         }
2884
2885 fail:
2886         if (adev->asic_reset_res)
2887                 DRM_WARN("ASIC reset failed with error, %d for drm dev, %s",
2888                          adev->asic_reset_res, adev->ddev->unique);
2889 }
2890
2891 static int amdgpu_device_get_job_timeout_settings(struct amdgpu_device *adev)
2892 {
2893         char *input = amdgpu_lockup_timeout;
2894         char *timeout_setting = NULL;
2895         int index = 0;
2896         long timeout;
2897         int ret = 0;
2898
2899         /*
2900          * By default the timeout for non-compute jobs is 10000 ms,
2901          * while no timeout is enforced on compute jobs.
2902          * In SR-IOV or passthrough mode, the timeout for compute
2903          * jobs is 60000 ms by default.
2904          */
2905         adev->gfx_timeout = msecs_to_jiffies(10000);
2906         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2907         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2908                 adev->compute_timeout =  msecs_to_jiffies(60000);
2909         else
2910                 adev->compute_timeout = MAX_SCHEDULE_TIMEOUT;
2911
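        /*
         * The lockup timeout module parameter is a comma-separated list of
         * up to four values in the order gfx, compute, sdma, video.
         * A value of 0 keeps the default for that engine and a negative
         * value disables the timeout (MAX_SCHEDULE_TIMEOUT).
         */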
2912         if (strnlen(input, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2913                 while ((timeout_setting = strsep(&input, ",")) &&
2914                                 strnlen(timeout_setting, AMDGPU_MAX_TIMEOUT_PARAM_LENGTH)) {
2915                         ret = kstrtol(timeout_setting, 0, &timeout);
2916                         if (ret)
2917                                 return ret;
2918
2919                         if (timeout == 0) {
2920                                 index++;
2921                                 continue;
2922                         } else if (timeout < 0) {
2923                                 timeout = MAX_SCHEDULE_TIMEOUT;
2924                         } else {
2925                                 timeout = msecs_to_jiffies(timeout);
2926                         }
2927
2928                         switch (index++) {
2929                         case 0:
2930                                 adev->gfx_timeout = timeout;
2931                                 break;
2932                         case 1:
2933                                 adev->compute_timeout = timeout;
2934                                 break;
2935                         case 2:
2936                                 adev->sdma_timeout = timeout;
2937                                 break;
2938                         case 3:
2939                                 adev->video_timeout = timeout;
2940                                 break;
2941                         default:
2942                                 break;
2943                         }
2944                 }
2945                 /*
2946                  * If only one value was specified, it applies
2947                  * to all non-compute jobs.
2948                  */
2949                 if (index == 1) {
2950                         adev->sdma_timeout = adev->video_timeout = adev->gfx_timeout;
2951                         if (amdgpu_sriov_vf(adev) || amdgpu_passthrough(adev))
2952                                 adev->compute_timeout = adev->gfx_timeout;
2953                 }
2954         }
2955
2956         return ret;
2957 }
2958
2959 static const struct attribute *amdgpu_dev_attributes[] = {
2960         &dev_attr_product_name.attr,
2961         &dev_attr_product_number.attr,
2962         &dev_attr_serial_number.attr,
2963         &dev_attr_pcie_replay_count.attr,
2964         NULL
2965 };
2966
2967 /**
2968  * amdgpu_device_init - initialize the driver
2969  *
2970  * @adev: amdgpu_device pointer
2971  * @ddev: drm dev pointer
2972  * @pdev: pci dev pointer
2973  * @flags: driver flags
2974  *
2975  * Initializes the driver info and hw (all asics).
2976  * Returns 0 for success or an error on failure.
2977  * Called at driver startup.
2978  */
2979 int amdgpu_device_init(struct amdgpu_device *adev,
2980                        struct drm_device *ddev,
2981                        struct pci_dev *pdev,
2982                        uint32_t flags)
2983 {
2984         int r, i;
2985         bool boco = false;
2986         u32 max_MBps;
2987
2988         adev->shutdown = false;
2989         adev->dev = &pdev->dev;
2990         adev->ddev = ddev;
2991         adev->pdev = pdev;
2992         adev->flags = flags;
2993
2994         if (amdgpu_force_asic_type >= 0 && amdgpu_force_asic_type < CHIP_LAST)
2995                 adev->asic_type = amdgpu_force_asic_type;
2996         else
2997                 adev->asic_type = flags & AMD_ASIC_MASK;
2998
2999         adev->usec_timeout = AMDGPU_MAX_USEC_TIMEOUT;
3000         if (amdgpu_emu_mode == 1)
3001                 adev->usec_timeout *= 10;
3002         adev->gmc.gart_size = 512 * 1024 * 1024;
3003         adev->accel_working = false;
3004         adev->num_rings = 0;
3005         adev->mman.buffer_funcs = NULL;
3006         adev->mman.buffer_funcs_ring = NULL;
3007         adev->vm_manager.vm_pte_funcs = NULL;
3008         adev->vm_manager.vm_pte_num_scheds = 0;
3009         adev->gmc.gmc_funcs = NULL;
3010         adev->fence_context = dma_fence_context_alloc(AMDGPU_MAX_RINGS);
3011         bitmap_zero(adev->gfx.pipe_reserve_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
3012
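        /*
         * Point all register access callbacks at invalid stubs for now;
         * the ASIC specific code installs the real implementations
         * during early init.
         */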
3013         adev->smc_rreg = &amdgpu_invalid_rreg;
3014         adev->smc_wreg = &amdgpu_invalid_wreg;
3015         adev->pcie_rreg = &amdgpu_invalid_rreg;
3016         adev->pcie_wreg = &amdgpu_invalid_wreg;
3017         adev->pciep_rreg = &amdgpu_invalid_rreg;
3018         adev->pciep_wreg = &amdgpu_invalid_wreg;
3019         adev->pcie_rreg64 = &amdgpu_invalid_rreg64;
3020         adev->pcie_wreg64 = &amdgpu_invalid_wreg64;
3021         adev->uvd_ctx_rreg = &amdgpu_invalid_rreg;
3022         adev->uvd_ctx_wreg = &amdgpu_invalid_wreg;
3023         adev->didt_rreg = &amdgpu_invalid_rreg;
3024         adev->didt_wreg = &amdgpu_invalid_wreg;
3025         adev->gc_cac_rreg = &amdgpu_invalid_rreg;
3026         adev->gc_cac_wreg = &amdgpu_invalid_wreg;
3027         adev->audio_endpt_rreg = &amdgpu_block_invalid_rreg;
3028         adev->audio_endpt_wreg = &amdgpu_block_invalid_wreg;
3029
3030         DRM_INFO("initializing kernel modesetting (%s 0x%04X:0x%04X 0x%04X:0x%04X 0x%02X).\n",
3031                  amdgpu_asic_name[adev->asic_type], pdev->vendor, pdev->device,
3032                  pdev->subsystem_vendor, pdev->subsystem_device, pdev->revision);
3033
3034         /* mutex initialization is all done here so we
3035          * can call these functions again later without locking issues */
3036         atomic_set(&adev->irq.ih.lock, 0);
3037         mutex_init(&adev->firmware.mutex);
3038         mutex_init(&adev->pm.mutex);
3039         mutex_init(&adev->gfx.gpu_clock_mutex);
3040         mutex_init(&adev->srbm_mutex);
3041         mutex_init(&adev->gfx.pipe_reserve_mutex);
3042         mutex_init(&adev->gfx.gfx_off_mutex);
3043         mutex_init(&adev->grbm_idx_mutex);
3044         mutex_init(&adev->mn_lock);
3045         mutex_init(&adev->virt.vf_errors.lock);
3046         hash_init(adev->mn_hash);
3047         mutex_init(&adev->lock_reset);
3048         mutex_init(&adev->psp.mutex);
3049         mutex_init(&adev->notifier_lock);
3050
3051         r = amdgpu_device_check_arguments(adev);
3052         if (r)
3053                 return r;
3054
3055         spin_lock_init(&adev->mmio_idx_lock);
3056         spin_lock_init(&adev->smc_idx_lock);
3057         spin_lock_init(&adev->pcie_idx_lock);
3058         spin_lock_init(&adev->uvd_ctx_idx_lock);
3059         spin_lock_init(&adev->didt_idx_lock);
3060         spin_lock_init(&adev->gc_cac_idx_lock);
3061         spin_lock_init(&adev->se_cac_idx_lock);
3062         spin_lock_init(&adev->audio_endpt_idx_lock);
3063         spin_lock_init(&adev->mm_stats.lock);
3064
3065         INIT_LIST_HEAD(&adev->shadow_list);
3066         mutex_init(&adev->shadow_list_lock);
3067
3068         INIT_DELAYED_WORK(&adev->delayed_init_work,
3069                           amdgpu_device_delayed_init_work_handler);
3070         INIT_DELAYED_WORK(&adev->gfx.gfx_off_delay_work,
3071                           amdgpu_device_delay_enable_gfx_off);
3072
3073         INIT_WORK(&adev->xgmi_reset_work, amdgpu_device_xgmi_reset_func);
3074
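        /*
         * Start with one GFXOFF disable request outstanding so that GFXOFF
         * stays disabled until the rest of init explicitly allows it.
         */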
3075         adev->gfx.gfx_off_req_count = 1;
3076         adev->pm.ac_power = power_supply_is_system_supplied() > 0;
3077
3078         atomic_set(&adev->throttling_logging_enabled, 1);
3079         /*
3080          * If throttling continues, logging will be performed every minute
3081          * to avoid log flooding. "-1" is subtracted since the thermal
3082          * throttling interrupt comes every second. Thus, the total logging
3083          * interval is 59 seconds (ratelimited printk interval) + 1 (waiting
3084          * for throttling interrupt) = 60 seconds.
3085          */
3086         ratelimit_state_init(&adev->throttling_logging_rs, (60 - 1) * HZ, 1);
3087         ratelimit_set_flags(&adev->throttling_logging_rs, RATELIMIT_MSG_ON_RELEASE);
3088
3089         /* Registers mapping */
3090         /* TODO: block userspace mapping of io register */
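        /* BONAIRE (CIK) and newer ASICs expose the register aperture in BAR 5,
         * older (SI) ASICs use BAR 2 */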
3091         if (adev->asic_type >= CHIP_BONAIRE) {
3092                 adev->rmmio_base = pci_resource_start(adev->pdev, 5);
3093                 adev->rmmio_size = pci_resource_len(adev->pdev, 5);
3094         } else {
3095                 adev->rmmio_base = pci_resource_start(adev->pdev, 2);
3096                 adev->rmmio_size = pci_resource_len(adev->pdev, 2);
3097         }
3098
3099         adev->rmmio = ioremap(adev->rmmio_base, adev->rmmio_size);
3100         if (adev->rmmio == NULL) {
3101                 return -ENOMEM;
3102         }
3103         DRM_INFO("register mmio base: 0x%08X\n", (uint32_t)adev->rmmio_base);
3104         DRM_INFO("register mmio size: %u\n", (unsigned)adev->rmmio_size);
3105
3106         /* io port mapping */
3107         for (i = 0; i < DEVICE_COUNT_RESOURCE; i++) {
3108                 if (pci_resource_flags(adev->pdev, i) & IORESOURCE_IO) {
3109                         adev->rio_mem_size = pci_resource_len(adev->pdev, i);
3110                         adev->rio_mem = pci_iomap(adev->pdev, i, adev->rio_mem_size);
3111                         break;
3112                 }
3113         }
3114         if (adev->rio_mem == NULL)
3115                 DRM_INFO("PCI I/O BAR is not found.\n");
3116
3117         /* enable PCIE atomic ops */
3118         r = pci_enable_atomic_ops_to_root(adev->pdev,
3119                                           PCI_EXP_DEVCAP2_ATOMIC_COMP32 |
3120                                           PCI_EXP_DEVCAP2_ATOMIC_COMP64);
3121         if (r) {
3122                 adev->have_atomics_support = false;
3123                 DRM_INFO("PCIE atomic ops is not supported\n");
3124         } else {
3125                 adev->have_atomics_support = true;
3126         }
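        /* the result is consumed later, e.g. by amdkfd, which needs
         * PCIe atomics for certain compute features */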
3127
3128         amdgpu_device_get_pcie_info(adev);
3129
3130         if (amdgpu_mcbp)
3131                 DRM_INFO("MCBP is enabled\n");
3132
3133         if (amdgpu_mes && adev->asic_type >= CHIP_NAVI10)
3134                 adev->enable_mes = true;
3135
3136         /* detect hw virtualization here */
3137         amdgpu_detect_virtualization(adev);
3138
3139         r = amdgpu_device_get_job_timeout_settings(adev);
3140         if (r) {
3141                 dev_err(adev->dev, "invalid lockup_timeout parameter syntax\n");
3142                 return r;
3143         }
3144
3145         /* early init functions */
3146         r = amdgpu_device_ip_early_init(adev);
3147         if (r)
3148                 return r;
3149
3150         /* doorbell bar mapping and doorbell index init */
3151         amdgpu_device_doorbell_init(adev);
3152
3153         /* if we have > 1 VGA cards, then disable the amdgpu VGA resources */
3154         /* this will fail for cards that aren't VGA class devices, just
3155          * ignore it */
3156         vga_client_register(adev->pdev, adev, NULL, amdgpu_device_vga_set_decode);
3157
3158         if (amdgpu_device_supports_boco(ddev))
3159                 boco = true;
3160         if (amdgpu_has_atpx() &&
3161             (amdgpu_is_atpx_hybrid() ||
3162              amdgpu_has_atpx_dgpu_power_cntl()) &&
3163             !pci_is_thunderbolt_attached(adev->pdev))
3164                 vga_switcheroo_register_client(adev->pdev,
3165                                                &amdgpu_switcheroo_ops, boco);
3166         if (boco)
3167                 vga_switcheroo_init_domain_pm_ops(adev->dev, &adev->vga_pm_domain);
3168
3169         if (amdgpu_emu_mode == 1) {
3170                 /* post the asic on emulation mode */
3171                 emu_soc_asic_init(adev);
3172                 goto fence_driver_init;
3173         }
3174
3175         /* detect if we have an SR-IOV vbios */
3176         amdgpu_device_detect_sriov_bios(adev);
3177
3178         /* check if we need to reset the asic
3179          *  E.g., driver was not cleanly unloaded previously, etc.
3180          */
3181         if (!amdgpu_sriov_vf(adev) && amdgpu_asic_need_reset_on_init(adev)) {
3182                 r = amdgpu_asic_reset(adev);
3183                 if (r) {
3184                         dev_err(adev->dev, "asic reset on init failed\n");
3185                         goto failed;
3186                 }
3187         }
3188
3189         /* Post card if necessary */
3190         if (amdgpu_device_need_post(adev)) {
3191                 if (!adev->bios) {
3192                         dev_err(adev->dev, "no vBIOS found\n");
3193                         r = -EINVAL;
3194                         goto failed;
3195                 }
3196                 DRM_INFO("GPU posting now...\n");
3197                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3198                 if (r) {
3199                         dev_err(adev->dev, "gpu post error!\n");
3200                         goto failed;
3201                 }
3202         }
3203
3204         if (adev->is_atom_fw) {
3205                 /* Initialize clocks */
3206                 r = amdgpu_atomfirmware_get_clock_info(adev);
3207                 if (r) {
3208                         dev_err(adev->dev, "amdgpu_atomfirmware_get_clock_info failed\n");
3209                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3210                         goto failed;
3211                 }
3212         } else {
3213                 /* Initialize clocks */
3214                 r = amdgpu_atombios_get_clock_info(adev);
3215                 if (r) {
3216                         dev_err(adev->dev, "amdgpu_atombios_get_clock_info failed\n");
3217                         amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_ATOMBIOS_GET_CLOCK_FAIL, 0, 0);
3218                         goto failed;
3219                 }
3220                 /* init i2c buses */
3221                 if (!amdgpu_device_has_dc_support(adev))
3222                         amdgpu_atombios_i2c_init(adev);
3223         }
3224
3225 fence_driver_init:
3226         /* Fence driver */
3227         r = amdgpu_fence_driver_init(adev);
3228         if (r) {
3229                 dev_err(adev->dev, "amdgpu_fence_driver_init failed\n");
3230                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_FENCE_INIT_FAIL, 0, 0);
3231                 goto failed;
3232         }
3233
3234         /* init the mode config */
3235         drm_mode_config_init(adev->ddev);
3236
3237         r = amdgpu_device_ip_init(adev);
3238         if (r) {
3239                 /* failed in exclusive mode due to timeout */
3240                 if (amdgpu_sriov_vf(adev) &&
3241                     !amdgpu_sriov_runtime(adev) &&
3242                     amdgpu_virt_mmio_blocked(adev) &&
3243                     !amdgpu_virt_wait_reset(adev)) {
3244                         dev_err(adev->dev, "VF exclusive mode timeout\n");
3245                         /* Don't send request since VF is inactive. */
3246                         adev->virt.caps &= ~AMDGPU_SRIOV_CAPS_RUNTIME;
3247                         adev->virt.ops = NULL;
3248                         r = -EAGAIN;
3249                         goto failed;
3250                 }
3251                 dev_err(adev->dev, "amdgpu_device_ip_init failed\n");
3252                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_INIT_FAIL, 0, 0);
3253                 goto failed;
3254         }
3255
3256         dev_info(adev->dev,
3257                 "SE %d, SH per SE %d, CU per SH %d, active_cu_number %d\n",
3258                         adev->gfx.config.max_shader_engines,
3259                         adev->gfx.config.max_sh_per_se,
3260                         adev->gfx.config.max_cu_per_sh,
3261                         adev->gfx.cu_info.number);
3262
3263         adev->accel_working = true;
3264
3265         amdgpu_vm_check_compute_bug(adev);
3266
3267         /* Initialize the buffer migration limit. */
3268         if (amdgpu_moverate >= 0)
3269                 max_MBps = amdgpu_moverate;
3270         else
3271                 max_MBps = 8; /* Allow 8 MB/s. */
3272         /* Get a log2 for easy divisions. */
3273         adev->mm_stats.log2_max_MBps = ilog2(max(1u, max_MBps));
3274
3275         amdgpu_fbdev_init(adev);
3276
3277         r = amdgpu_pm_sysfs_init(adev);
3278         if (r) {
3279                 adev->pm_sysfs_en = false;
3280                 DRM_ERROR("registering pm debugfs failed (%d).\n", r);
3281         } else
3282                 adev->pm_sysfs_en = true;
3283
3284         r = amdgpu_ucode_sysfs_init(adev);
3285         if (r) {
3286                 adev->ucode_sysfs_en = false;
3287                 DRM_ERROR("Creating firmware sysfs failed (%d).\n", r);
3288         } else
3289                 adev->ucode_sysfs_en = true;
3290
3291         if ((amdgpu_testing & 1)) {
3292                 if (adev->accel_working)
3293                         amdgpu_test_moves(adev);
3294                 else
3295                         DRM_INFO("amdgpu: acceleration disabled, skipping move tests\n");
3296         }
3297         if (amdgpu_benchmarking) {
3298                 if (adev->accel_working)
3299                         amdgpu_benchmark(adev, amdgpu_benchmarking);
3300                 else
3301                         DRM_INFO("amdgpu: acceleration disabled, skipping benchmarks\n");
3302         }
3303
3304         /*
3305          * Register gpu instance before amdgpu_device_enable_mgpu_fan_boost.
3306          * Otherwise the mgpu fan boost feature will be skipped because the
3307          * gpu instance count would be too low.
3308          */
3309         amdgpu_register_gpu_instance(adev);
3310
3311         /* enable clockgating, etc. after ib tests, etc. since some blocks require
3312          * explicit gating rather than handling it automatically.
3313          */
3314         r = amdgpu_device_ip_late_init(adev);
3315         if (r) {
3316                 dev_err(adev->dev, "amdgpu_device_ip_late_init failed\n");
3317                 amdgpu_vf_error_put(adev, AMDGIM_ERROR_VF_AMDGPU_LATE_INIT_FAIL, 0, r);
3318                 goto failed;
3319         }
3320
3321         /* must succeed. */
3322         amdgpu_ras_resume(adev);
3323
3324         queue_delayed_work(system_wq, &adev->delayed_init_work,
3325                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3326
3327         r = sysfs_create_files(&adev->dev->kobj, amdgpu_dev_attributes);
3328         if (r) {
3329                 dev_err(adev->dev, "Could not create amdgpu device attr\n");
3330                 return r;
3331         }
3332
3333         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3334                 r = amdgpu_pmu_init(adev);
3335         if (r)
3336                 dev_err(adev->dev, "amdgpu_pmu_init failed\n");
3337
3338         return 0;
3339
3340 failed:
3341         amdgpu_vf_error_trans_all(adev);
3342         if (boco)
3343                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3344
3345         return r;
3346 }
3347
3348 /**
3349  * amdgpu_device_fini - tear down the driver
3350  *
3351  * @adev: amdgpu_device pointer
3352  *
3353  * Tear down the driver info (all asics).
3354  * Called at driver shutdown.
3355  */
3356 void amdgpu_device_fini(struct amdgpu_device *adev)
3357 {
3358         int r;
3359
3360         DRM_INFO("amdgpu: finishing device.\n");
3361         flush_delayed_work(&adev->delayed_init_work);
3362         adev->shutdown = true;
3363
3364         /* make sure the IB tests have finished before entering exclusive mode
3365          * to avoid preemption during the IB tests
3366          */
3367         if (amdgpu_sriov_vf(adev))
3368                 amdgpu_virt_request_full_gpu(adev, false);
3369
3370         /* disable all interrupts */
3371         amdgpu_irq_disable_all(adev);
3372         if (adev->mode_info.mode_config_initialized) {
3373                 if (!amdgpu_device_has_dc_support(adev))
3374                         drm_helper_force_disable_all(adev->ddev);
3375                 else
3376                         drm_atomic_helper_shutdown(adev->ddev);
3377         }
3378         amdgpu_fence_driver_fini(adev);
3379         if (adev->pm_sysfs_en)
3380                 amdgpu_pm_sysfs_fini(adev);
3381         amdgpu_fbdev_fini(adev);
3382         r = amdgpu_device_ip_fini(adev);
3383         release_firmware(adev->firmware.gpu_info_fw);
3384         adev->firmware.gpu_info_fw = NULL;
3385         adev->accel_working = false;
3386         /* free i2c buses */
3387         if (!amdgpu_device_has_dc_support(adev))
3388                 amdgpu_i2c_fini(adev);
3389
3390         if (amdgpu_emu_mode != 1)
3391                 amdgpu_atombios_fini(adev);
3392
3393         kfree(adev->bios);
3394         adev->bios = NULL;
3395         if (amdgpu_has_atpx() &&
3396             (amdgpu_is_atpx_hybrid() ||
3397              amdgpu_has_atpx_dgpu_power_cntl()) &&
3398             !pci_is_thunderbolt_attached(adev->pdev))
3399                 vga_switcheroo_unregister_client(adev->pdev);
3400         if (amdgpu_device_supports_boco(adev->ddev))
3401                 vga_switcheroo_fini_domain_pm_ops(adev->dev);
3402         vga_client_register(adev->pdev, NULL, NULL, NULL);
3403         if (adev->rio_mem)
3404                 pci_iounmap(adev->pdev, adev->rio_mem);
3405         adev->rio_mem = NULL;
3406         iounmap(adev->rmmio);
3407         adev->rmmio = NULL;
3408         amdgpu_device_doorbell_fini(adev);
3409
3410         if (adev->ucode_sysfs_en)
3411                 amdgpu_ucode_sysfs_fini(adev);
3412
3413         sysfs_remove_files(&adev->dev->kobj, amdgpu_dev_attributes);
3414         if (IS_ENABLED(CONFIG_PERF_EVENTS))
3415                 amdgpu_pmu_fini(adev);
3416         if (adev->discovery_bin)
3417                 amdgpu_discovery_fini(adev);
3418 }
3419
3420
3421 /*
3422  * Suspend & resume.
3423  */
3424 /**
3425  * amdgpu_device_suspend - initiate device suspend
3426  *
3427  * @dev: drm dev pointer
3428  * @fbcon: notify the fbdev of suspend
3429  *
3430  * Puts the hw in the suspend state (all asics).
3431  * Returns 0 for success or an error on failure.
3432  * Called at driver suspend.
3433  */
3434 int amdgpu_device_suspend(struct drm_device *dev, bool fbcon)
3435 {
3436         struct amdgpu_device *adev;
3437         struct drm_crtc *crtc;
3438         struct drm_connector *connector;
3439         struct drm_connector_list_iter iter;
3440         int r;
3441
3442         if (dev == NULL || dev->dev_private == NULL) {
3443                 return -ENODEV;
3444         }
3445
3446         adev = dev->dev_private;
3447
3448         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3449                 return 0;
3450
3451         adev->in_suspend = true;
3452         drm_kms_helper_poll_disable(dev);
3453
3454         if (fbcon)
3455                 amdgpu_fbdev_set_suspend(adev, 1);
3456
3457         cancel_delayed_work_sync(&adev->delayed_init_work);
3458
3459         if (!amdgpu_device_has_dc_support(adev)) {
3460                 /* turn off display hw */
3461                 drm_modeset_lock_all(dev);
3462                 drm_connector_list_iter_begin(dev, &iter);
3463                 drm_for_each_connector_iter(connector, &iter)
3464                         drm_helper_connector_dpms(connector,
3465                                                   DRM_MODE_DPMS_OFF);
3466                 drm_connector_list_iter_end(&iter);
3467                 drm_modeset_unlock_all(dev);
3468                 /* unpin the front buffers and cursors */
3469                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3470                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3471                         struct drm_framebuffer *fb = crtc->primary->fb;
3472                         struct amdgpu_bo *robj;
3473
3474                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3475                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3476                                 r = amdgpu_bo_reserve(aobj, true);
3477                                 if (r == 0) {
3478                                         amdgpu_bo_unpin(aobj);
3479                                         amdgpu_bo_unreserve(aobj);
3480                                 }
3481                         }
3482
3483                         if (fb == NULL || fb->obj[0] == NULL) {
3484                                 continue;
3485                         }
3486                         robj = gem_to_amdgpu_bo(fb->obj[0]);
3487                         /* don't unpin kernel fb objects */
3488                         if (!amdgpu_fbdev_robj_is_fb(adev, robj)) {
3489                                 r = amdgpu_bo_reserve(robj, true);
3490                                 if (r == 0) {
3491                                         amdgpu_bo_unpin(robj);
3492                                         amdgpu_bo_unreserve(robj);
3493                                 }
3494                         }
3495                 }
3496         }
3497
3498         amdgpu_ras_suspend(adev);
3499
3500         r = amdgpu_device_ip_suspend_phase1(adev);
3501
3502         amdgpu_amdkfd_suspend(adev, !fbcon);
3503
3504         /* evict vram memory */
3505         amdgpu_bo_evict_vram(adev);
3506
3507         amdgpu_fence_driver_suspend(adev);
3508
3509         r = amdgpu_device_ip_suspend_phase2(adev);
3510
3511         /* evict remaining vram memory
3512          * This second call to evict vram is to evict the gart page table
3513          * using the CPU.
3514          */
3515         amdgpu_bo_evict_vram(adev);
3516
3517         return 0;
3518 }
3519
3520 /**
3521  * amdgpu_device_resume - initiate device resume
3522  *
3523  * @dev: drm dev pointer
3524  * @fbcon: notify the fbdev of resume
3525  *
3526  * Bring the hw back to operating state (all asics).
3527  * Returns 0 for success or an error on failure.
3528  * Called at driver resume.
3529  */
3530 int amdgpu_device_resume(struct drm_device *dev, bool fbcon)
3531 {
3532         struct drm_connector *connector;
3533         struct drm_connector_list_iter iter;
3534         struct amdgpu_device *adev = dev->dev_private;
3535         struct drm_crtc *crtc;
3536         int r = 0;
3537
3538         if (dev->switch_power_state == DRM_SWITCH_POWER_OFF)
3539                 return 0;
3540
3541         /* post card */
3542         if (amdgpu_device_need_post(adev)) {
3543                 r = amdgpu_atom_asic_init(adev->mode_info.atom_context);
3544                 if (r)
3545                         DRM_ERROR("amdgpu asic init failed\n");
3546         }
3547
3548         r = amdgpu_device_ip_resume(adev);
3549         if (r) {
3550                 DRM_ERROR("amdgpu_device_ip_resume failed (%d).\n", r);
3551                 return r;
3552         }
3553         amdgpu_fence_driver_resume(adev);
3554
3555
3556         r = amdgpu_device_ip_late_init(adev);
3557         if (r)
3558                 return r;
3559
3560         queue_delayed_work(system_wq, &adev->delayed_init_work,
3561                            msecs_to_jiffies(AMDGPU_RESUME_MS));
3562
3563         if (!amdgpu_device_has_dc_support(adev)) {
3564                 /* pin cursors */
3565                 list_for_each_entry(crtc, &dev->mode_config.crtc_list, head) {
3566                         struct amdgpu_crtc *amdgpu_crtc = to_amdgpu_crtc(crtc);
3567
3568                         if (amdgpu_crtc->cursor_bo && !adev->enable_virtual_display) {
3569                                 struct amdgpu_bo *aobj = gem_to_amdgpu_bo(amdgpu_crtc->cursor_bo);
3570                                 r = amdgpu_bo_reserve(aobj, true);
3571                                 if (r == 0) {
3572                                         r = amdgpu_bo_pin(aobj, AMDGPU_GEM_DOMAIN_VRAM);
3573                                         if (r != 0)
3574                                                 DRM_ERROR("Failed to pin cursor BO (%d)\n", r);
3575                                         amdgpu_crtc->cursor_addr = amdgpu_bo_gpu_offset(aobj);
3576                                         amdgpu_bo_unreserve(aobj);
3577                                 }
3578                         }
3579                 }
3580         }
3581         r = amdgpu_amdkfd_resume(adev, !fbcon);
3582         if (r)
3583                 return r;
3584
3585         /* Make sure IB tests flushed */
3586         flush_delayed_work(&adev->delayed_init_work);
3587
3588         /* blat the mode back in */
3589         if (fbcon) {
3590                 if (!amdgpu_device_has_dc_support(adev)) {
3591                         /* pre DCE11 */
3592                         drm_helper_resume_force_mode(dev);
3593
3594                         /* turn on display hw */
3595                         drm_modeset_lock_all(dev);
3596
3597                         drm_connector_list_iter_begin(dev, &iter);
3598                         drm_for_each_connector_iter(connector, &iter)
3599                                 drm_helper_connector_dpms(connector,
3600                                                           DRM_MODE_DPMS_ON);
3601                         drm_connector_list_iter_end(&iter);
3602
3603                         drm_modeset_unlock_all(dev);
3604                 }
3605                 amdgpu_fbdev_set_suspend(adev, 0);
3606         }
3607
3608         drm_kms_helper_poll_enable(dev);
3609
3610         amdgpu_ras_resume(adev);
3611
3612         /*
3613          * Most of the connector probing functions try to acquire runtime pm
3614          * refs to ensure that the GPU is powered on when connector polling is
3615          * performed. Since we're calling this from a runtime PM callback,
3616          * trying to acquire rpm refs will cause us to deadlock.
3617          *
3618          * Since we're guaranteed to be holding the rpm lock, it's safe to
3619          * temporarily disable the rpm helpers so this doesn't deadlock us.
3620          */
3621 #ifdef CONFIG_PM
3622         dev->dev->power.disable_depth++;
3623 #endif
3624         if (!amdgpu_device_has_dc_support(adev))
3625                 drm_helper_hpd_irq_event(dev);
3626         else
3627                 drm_kms_helper_hotplug_event(dev);
3628 #ifdef CONFIG_PM
3629         dev->dev->power.disable_depth--;
3630 #endif
3631         adev->in_suspend = false;
3632
3633         return 0;
3634 }
3635
3636 /**
3637  * amdgpu_device_ip_check_soft_reset - did soft reset succeed
3638  *
3639  * @adev: amdgpu_device pointer
3640  *
3641  * The list of all the hardware IPs that make up the asic is walked and
3642  * the check_soft_reset callbacks are run.  check_soft_reset determines
3643  * if the asic is still hung or not.
3644  * Returns true if any of the IPs are still in a hung state, false if not.
3645  */
3646 static bool amdgpu_device_ip_check_soft_reset(struct amdgpu_device *adev)
3647 {
3648         int i;
3649         bool asic_hang = false;
3650
3651         if (amdgpu_sriov_vf(adev))
3652                 return true;
3653
3654         if (amdgpu_asic_need_full_reset(adev))
3655                 return true;
3656
3657         for (i = 0; i < adev->num_ip_blocks; i++) {
3658                 if (!adev->ip_blocks[i].status.valid)
3659                         continue;
3660                 if (adev->ip_blocks[i].version->funcs->check_soft_reset)
3661                         adev->ip_blocks[i].status.hang =
3662                                 adev->ip_blocks[i].version->funcs->check_soft_reset(adev);
3663                 if (adev->ip_blocks[i].status.hang) {
3664                         DRM_INFO("IP block:%s is hung!\n", adev->ip_blocks[i].version->funcs->name);
3665                         asic_hang = true;
3666                 }
3667         }
3668         return asic_hang;
3669 }
3670
3671 /**
3672  * amdgpu_device_ip_pre_soft_reset - prepare for soft reset
3673  *
3674  * @adev: amdgpu_device pointer
3675  *
3676  * The list of all the hardware IPs that make up the asic is walked and the
3677  * pre_soft_reset callbacks are run if the block is hung.  pre_soft_reset
3678  * handles any IP specific hardware or software state changes that are
3679  * necessary for a soft reset to succeed.
3680  * Returns 0 on success, negative error code on failure.
3681  */
3682 static int amdgpu_device_ip_pre_soft_reset(struct amdgpu_device *adev)
3683 {
3684         int i, r = 0;
3685
3686         for (i = 0; i < adev->num_ip_blocks; i++) {
3687                 if (!adev->ip_blocks[i].status.valid)
3688                         continue;
3689                 if (adev->ip_blocks[i].status.hang &&
3690                     adev->ip_blocks[i].version->funcs->pre_soft_reset) {
3691                         r = adev->ip_blocks[i].version->funcs->pre_soft_reset(adev);
3692                         if (r)
3693                                 return r;
3694                 }
3695         }
3696
3697         return 0;
3698 }
3699
3700 /**
3701  * amdgpu_device_ip_need_full_reset - check if a full asic reset is needed
3702  *
3703  * @adev: amdgpu_device pointer
3704  *
3705  * Some hardware IPs cannot be soft reset.  If they are hung, a full gpu
3706  * reset is necessary to recover.
3707  * Returns true if a full asic reset is required, false if not.
3708  */
3709 static bool amdgpu_device_ip_need_full_reset(struct amdgpu_device *adev)
3710 {
3711         int i;
3712
3713         if (amdgpu_asic_need_full_reset(adev))
3714                 return true;
3715
3716         for (i = 0; i < adev->num_ip_blocks; i++) {
3717                 if (!adev->ip_blocks[i].status.valid)
3718                         continue;
3719                 if ((adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_GMC) ||
3720                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_SMC) ||
3721                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_ACP) ||
3722                     (adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_DCE) ||
3723                      adev->ip_blocks[i].version->type == AMD_IP_BLOCK_TYPE_PSP) {
3724                         if (adev->ip_blocks[i].status.hang) {
3725                                 DRM_INFO("Some block need full reset!\n");
3726                                 return true;
3727                         }
3728                 }
3729         }
3730         return false;
3731 }
3732
3733 /**
3734  * amdgpu_device_ip_soft_reset - do a soft reset
3735  *
3736  * @adev: amdgpu_device pointer
3737  *
3738  * The list of all the hardware IPs that make up the asic is walked and the
3739  * soft_reset callbacks are run if the block is hung.  soft_reset handles any
3740  * IP specific hardware or software state changes that are necessary to soft
3741  * reset the IP.
3742  * Returns 0 on success, negative error code on failure.
3743  */
3744 static int amdgpu_device_ip_soft_reset(struct amdgpu_device *adev)
3745 {
3746         int i, r = 0;
3747
3748         for (i = 0; i < adev->num_ip_blocks; i++) {
3749                 if (!adev->ip_blocks[i].status.valid)
3750                         continue;
3751                 if (adev->ip_blocks[i].status.hang &&
3752                     adev->ip_blocks[i].version->funcs->soft_reset) {
3753                         r = adev->ip_blocks[i].version->funcs->soft_reset(adev);
3754                         if (r)
3755                                 return r;
3756                 }
3757         }
3758
3759         return 0;
3760 }
3761
3762 /**
3763  * amdgpu_device_ip_post_soft_reset - clean up from soft reset
3764  *
3765  * @adev: amdgpu_device pointer
3766  *
3767  * The list of all the hardware IPs that make up the asic is walked and the
3768  * post_soft_reset callbacks are run if the asic was hung.  post_soft_reset
3769  * handles any IP specific hardware or software state changes that are
3770  * necessary after the IP has been soft reset.
3771  * Returns 0 on success, negative error code on failure.
3772  */
3773 static int amdgpu_device_ip_post_soft_reset(struct amdgpu_device *adev)
3774 {
3775         int i, r = 0;
3776
3777         for (i = 0; i < adev->num_ip_blocks; i++) {
3778                 if (!adev->ip_blocks[i].status.valid)
3779                         continue;
3780                 if (adev->ip_blocks[i].status.hang &&
3781                     adev->ip_blocks[i].version->funcs->post_soft_reset)
3782                         r = adev->ip_blocks[i].version->funcs->post_soft_reset(adev);
3783                 if (r)
3784                         return r;
3785         }
3786
3787         return 0;
3788 }
3789
3790 /**
3791  * amdgpu_device_recover_vram - Recover some VRAM contents
3792  *
3793  * @adev: amdgpu_device pointer
3794  *
3795  * Restores the contents of VRAM buffers from the shadows in GTT.  Used to
3796  * restore things like GPUVM page tables after a GPU reset where
3797  * the contents of VRAM might be lost.
3798  *
3799  * Returns:
3800  * 0 on success, negative error code on failure.
3801  */
3802 static int amdgpu_device_recover_vram(struct amdgpu_device *adev)
3803 {
3804         struct dma_fence *fence = NULL, *next = NULL;
3805         struct amdgpu_bo *shadow;
3806         long r = 1, tmo;
3807
3808         if (amdgpu_sriov_runtime(adev))
3809                 tmo = msecs_to_jiffies(8000);
3810         else
3811                 tmo = msecs_to_jiffies(100);
3812
3813         DRM_INFO("recover vram bo from shadow start\n");
3814         mutex_lock(&adev->shadow_list_lock);
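        /*
         * Kick off a shadow-to-VRAM copy for every shadow BO and pipeline
         * the copies: wait on the previous copy's fence while the next
         * copy is in flight.
         */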
3815         list_for_each_entry(shadow, &adev->shadow_list, shadow_list) {
3816
3817                 /* No need to recover an evicted BO */
3818                 if (shadow->tbo.mem.mem_type != TTM_PL_TT ||
3819                     shadow->tbo.mem.start == AMDGPU_BO_INVALID_OFFSET ||
3820                     shadow->parent->tbo.mem.mem_type != TTM_PL_VRAM)
3821                         continue;
3822
3823                 r = amdgpu_bo_restore_shadow(shadow, &next);
3824                 if (r)
3825                         break;
3826
3827                 if (fence) {
3828                         tmo = dma_fence_wait_timeout(fence, false, tmo);
3829                         dma_fence_put(fence);
3830                         fence = next;
3831                         if (tmo == 0) {
3832                                 r = -ETIMEDOUT;
3833                                 break;
3834                         } else if (tmo < 0) {
3835                                 r = tmo;
3836                                 break;
3837                         }
3838                 } else {
3839                         fence = next;
3840                 }
3841         }
3842         mutex_unlock(&adev->shadow_list_lock);
3843
3844         if (fence)
3845                 tmo = dma_fence_wait_timeout(fence, false, tmo);
3846         dma_fence_put(fence);
3847
3848         if (r < 0 || tmo <= 0) {
3849                 DRM_ERROR("recover vram bo from shadow failed, r is %ld, tmo is %ld\n", r, tmo);
3850                 return -EIO;
3851         }
3852
3853         DRM_INFO("recover vram bo from shadow done\n");
3854         return 0;
3855 }
3856
3857
3858 /**
3859  * amdgpu_device_reset_sriov - reset ASIC for SR-IOV vf
3860  *
3861  * @adev: amdgpu device pointer
3862  * @from_hypervisor: request from hypervisor
3863  *
3864  * Do a VF FLR and reinitialize the ASIC.
3865  * Returns 0 on success, an error code otherwise.
3866  */
3867 static int amdgpu_device_reset_sriov(struct amdgpu_device *adev,
3868                                      bool from_hypervisor)
3869 {
3870         int r;
3871
3872         if (from_hypervisor)
3873                 r = amdgpu_virt_request_full_gpu(adev, true);
3874         else
3875                 r = amdgpu_virt_reset_gpu(adev);
3876         if (r)
3877                 return r;
3878
3879         amdgpu_amdkfd_pre_reset(adev);
3880
3881         /* Resume IP prior to SMC */
3882         r = amdgpu_device_ip_reinit_early_sriov(adev);
3883         if (r)
3884                 goto error;
3885
3886         amdgpu_virt_init_data_exchange(adev);
3887         /* we need to recover the gart prior to running SMC/CP/SDMA resume */
3888         amdgpu_gtt_mgr_recover(&adev->mman.bdev.man[TTM_PL_TT]);
3889
3890         r = amdgpu_device_fw_loading(adev);
3891         if (r)
3892                 return r;
3893
3894         /* now we are okay to resume SMC/CP/SDMA */
3895         r = amdgpu_device_ip_reinit_late_sriov(adev);
3896         if (r)
3897                 goto error;
3898
3899         amdgpu_irq_gpu_reset_resume_helper(adev);
3900         r = amdgpu_ib_ring_tests(adev);
3901         amdgpu_amdkfd_post_reset(adev);
3902
3903 error:
3904         amdgpu_virt_release_full_gpu(adev, true);
3905         if (!r && adev->virt.gim_feature & AMDGIM_FEATURE_GIM_FLR_VRAMLOST) {
3906                 amdgpu_inc_vram_lost(adev);
3907                 r = amdgpu_device_recover_vram(adev);
3908         }
3909
3910         return r;
3911 }
3912
3913 /**
3914  * amdgpu_device_should_recover_gpu - check if we should try GPU recovery
3915  *
3916  * @adev: amdgpu device pointer
3917  *
3918  * Check amdgpu_gpu_recovery and SRIOV status to see if we should try to recover
3919  * a hung GPU.
3920  */
3921 bool amdgpu_device_should_recover_gpu(struct amdgpu_device *adev)
3922 {
3923         if (!amdgpu_device_ip_check_soft_reset(adev)) {
3924                 DRM_INFO("Timeout, but no hardware hang detected.\n");
3925                 return false;
3926         }
3927
3928         if (amdgpu_gpu_recovery == 0)
3929                 goto disabled;
3930
3931         if (amdgpu_sriov_vf(adev))
3932                 return true;
3933
3934         if (amdgpu_gpu_recovery == -1) {
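                /*
                 * With the default (auto) setting, only allow recovery on
                 * ASICs where it is known to work.
                 */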
3935                 switch (adev->asic_type) {
3936                 case CHIP_BONAIRE:
3937                 case CHIP_HAWAII:
3938                 case CHIP_TOPAZ:
3939                 case CHIP_TONGA:
3940                 case CHIP_FIJI:
3941                 case CHIP_POLARIS10:
3942                 case CHIP_POLARIS11:
3943                 case CHIP_POLARIS12:
3944                 case CHIP_VEGAM:
3945                 case CHIP_VEGA20:
3946                 case CHIP_VEGA10:
3947                 case CHIP_VEGA12:
3948                 case CHIP_RAVEN:
3949                 case CHIP_ARCTURUS:
3950                 case CHIP_RENOIR:
3951                 case CHIP_NAVI10:
3952                 case CHIP_NAVI14:
3953                 case CHIP_NAVI12:
3954                         break;
3955                 default:
3956                         goto disabled;
3957                 }
3958         }
3959
3960         return true;
3961
3962 disabled:
3963                 DRM_INFO("GPU recovery disabled.\n");
3964                 return false;
3965 }
3966
3967
3968 static int amdgpu_device_pre_asic_reset(struct amdgpu_device *adev,
3969                                         struct amdgpu_job *job,
3970                                         bool *need_full_reset_arg)
3971 {
3972         int i, r = 0;
3973         bool need_full_reset  = *need_full_reset_arg;
3974
3975         amdgpu_debugfs_wait_dump(adev);
3976
3977         /* block all schedulers and reset given job's ring */
3978         for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
3979                 struct amdgpu_ring *ring = adev->rings[i];
3980
3981                 if (!ring || !ring->sched.thread)
3982                         continue;
3983
3984                 /* after all hw jobs are reset, hw fence is meaningless, so force_completion */
3985                 amdgpu_fence_driver_force_completion(ring);
3986         }
3987
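        /*
         * Blame the job that triggered the timeout so the scheduler can drop
         * it once its karma exceeds the limit instead of resubmitting it.
         */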
3988         if (job)
3989                 drm_sched_increase_karma(&job->base);
3990
3991         /* Don't suspend on bare metal if we are not going to HW reset the ASIC */
3992         if (!amdgpu_sriov_vf(adev)) {
3993
3994                 if (!need_full_reset)
3995                         need_full_reset = amdgpu_device_ip_need_full_reset(adev);
3996
3997                 if (!need_full_reset) {
3998                         amdgpu_device_ip_pre_soft_reset(adev);
3999                         r = amdgpu_device_ip_soft_reset(adev);
4000                         amdgpu_device_ip_post_soft_reset(adev);
4001                         if (r || amdgpu_device_ip_check_soft_reset(adev)) {
4002                                 DRM_INFO("soft reset failed, will fallback to full reset!\n");
4003                                 need_full_reset = true;
4004                         }
4005                 }
4006
4007                 if (need_full_reset)
4008                         r = amdgpu_device_ip_suspend(adev);
4009
4010                 *need_full_reset_arg = need_full_reset;
4011         }
4012
4013         return r;
4014 }
4015
4016 static int amdgpu_do_asic_reset(struct amdgpu_hive_info *hive,
4017                                struct list_head *device_list_handle,
4018                                bool *need_full_reset_arg)
4019 {
4020         struct amdgpu_device *tmp_adev = NULL;
4021         bool need_full_reset = *need_full_reset_arg, vram_lost = false;
4022         int r = 0;
4023
4024         /*
4025          * ASIC reset has to be done on all XGMI hive nodes ASAP
4026          * to allow proper link negotiation in the FW (within 1 sec)
4027          */
4028         if (need_full_reset) {
4029                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4030                         /* For XGMI run all resets in parallel to speed up the process */
4031                         if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
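                                /*
                                 * queue_work() returns false when the reset
                                 * work is already pending, which would leave
                                 * this node without running its own reset.
                                 */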
4032                                 if (!queue_work(system_unbound_wq, &tmp_adev->xgmi_reset_work))
4033                                         r = -EALREADY;
4034                         } else
4035                                 r = amdgpu_asic_reset(tmp_adev);
4036
4037                         if (r) {
4038                                 DRM_ERROR("ASIC reset failed with error, %d for drm dev, %s",
4039                                          r, tmp_adev->ddev->unique);
4040                                 break;
4041                         }
4042                 }
4043
4044                 /* For XGMI wait for all resets to complete before proceed */
4045                 if (!r) {
4046                         list_for_each_entry(tmp_adev, device_list_handle,
4047                                             gmc.xgmi.head) {
4048                                 if (tmp_adev->gmc.xgmi.num_physical_nodes > 1) {
4049                                         flush_work(&tmp_adev->xgmi_reset_work);
4050                                         r = tmp_adev->asic_reset_res;
4051                                         if (r)
4052                                                 break;
4053                                 }
4054                         }
4055                 }
4056         }
4057
4058         if (!r && amdgpu_ras_intr_triggered()) {
4059                 list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4060                         if (tmp_adev->mmhub.funcs &&
4061                             tmp_adev->mmhub.funcs->reset_ras_error_count)
4062                                 tmp_adev->mmhub.funcs->reset_ras_error_count(tmp_adev);
4063                 }
4064
4065                 amdgpu_ras_intr_cleared();
4066         }
4067
4068         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4069                 if (need_full_reset) {
4070                         /* post card */
4071                         if (amdgpu_atom_asic_init(tmp_adev->mode_info.atom_context))
4072                                 DRM_WARN("asic atom init failed!");
4073
4074                         if (!r) {
4075                                 dev_info(tmp_adev->dev, "GPU reset succeeded, trying to resume\n");
4076                                 r = amdgpu_device_ip_resume_phase1(tmp_adev);
4077                                 if (r)
4078                                         goto out;
4079
4080                                 vram_lost = amdgpu_device_check_vram_lost(tmp_adev);
4081                                 if (vram_lost) {
4082                                         DRM_INFO("VRAM is lost due to GPU reset!\n");
4083                                         amdgpu_inc_vram_lost(tmp_adev);
4084                                 }
4085
4086                                 r = amdgpu_gtt_mgr_recover(
4087                                         &tmp_adev->mman.bdev.man[TTM_PL_TT]);
4088                                 if (r)
4089                                         goto out;
4090
4091                                 r = amdgpu_device_fw_loading(tmp_adev);
4092                                 if (r)
4093                                         return r;
4094
4095                                 r = amdgpu_device_ip_resume_phase2(tmp_adev);
4096                                 if (r)
4097                                         goto out;
4098
4099                                 if (vram_lost)
4100                                         amdgpu_device_fill_reset_magic(tmp_adev);
4101
4102                                 /*
4103                                  * Add this ASIC back as tracked now that the reset
4104                                  * has completed successfully.
4105                                  */
4106                                 amdgpu_register_gpu_instance(tmp_adev);
4107
4108                                 r = amdgpu_device_ip_late_init(tmp_adev);
4109                                 if (r)
4110                                         goto out;
4111
4112                                 amdgpu_fbdev_set_suspend(tmp_adev, 0);
4113
4114                                 /* must succeed. */
4115                                 amdgpu_ras_resume(tmp_adev);
4116
4117                                 /* Update PSP FW topology after reset */
4118                                 if (hive && tmp_adev->gmc.xgmi.num_physical_nodes > 1)
4119                                         r = amdgpu_xgmi_update_topology(hive, tmp_adev);
4120                         }
4121                 }
4122
4123
4124 out:
4125                 if (!r) {
4126                         amdgpu_irq_gpu_reset_resume_helper(tmp_adev);
4127                         r = amdgpu_ib_ring_tests(tmp_adev);
4128                         if (r) {
4129                                 dev_err(tmp_adev->dev, "ib ring test failed (%d).\n", r);
4130                                 r = amdgpu_device_ip_suspend(tmp_adev);
4131                                 need_full_reset = true;
4132                                 r = -EAGAIN;
4133                                 goto end;
4134                         }
4135                 }
4136
4137                 if (!r)
4138                         r = amdgpu_device_recover_vram(tmp_adev);
4139                 else
4140                         tmp_adev->asic_reset_res = r;
4141         }
4142
4143 end:
4144         *need_full_reset_arg = need_full_reset;
4145         return r;
4146 }
4147
4148 static bool amdgpu_device_lock_adev(struct amdgpu_device *adev, bool trylock)
4149 {
4150         if (trylock) {
4151                 if (!mutex_trylock(&adev->lock_reset))
4152                         return false;
4153         } else
4154                 mutex_lock(&adev->lock_reset);
4155
4156         atomic_inc(&adev->gpu_reset_counter);
4157         adev->in_gpu_reset = true;
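        /*
         * Record which kind of reset is coming so the PM code can put the
         * SMU (MP1) firmware into the matching state beforehand.
         */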
4158         switch (amdgpu_asic_reset_method(adev)) {
4159         case AMD_RESET_METHOD_MODE1:
4160                 adev->mp1_state = PP_MP1_STATE_SHUTDOWN;
4161                 break;
4162         case AMD_RESET_METHOD_MODE2:
4163                 adev->mp1_state = PP_MP1_STATE_RESET;
4164                 break;
4165         default:
4166                 adev->mp1_state = PP_MP1_STATE_NONE;
4167                 break;
4168         }
4169
4170         return true;
4171 }
4172
4173 static void amdgpu_device_unlock_adev(struct amdgpu_device *adev)
4174 {
4175         amdgpu_vf_error_trans_all(adev);
4176         adev->mp1_state = PP_MP1_STATE_NONE;
4177         adev->in_gpu_reset = false;
4178         mutex_unlock(&adev->lock_reset);
4179 }
4180
4181 static void amdgpu_device_resume_display_audio(struct amdgpu_device *adev)
4182 {
4183         struct pci_dev *p = NULL;
4184
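        /* the GPU's HDMI/DP audio controller is function 1 on the same bus/device */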
4185         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4186                         adev->pdev->bus->number, 1);
4187         if (p) {
4188                 pm_runtime_enable(&(p->dev));
4189                 pm_runtime_resume(&(p->dev));
4190         }
4191 }
4192
4193 static int amdgpu_device_suspend_display_audio(struct amdgpu_device *adev)
4194 {
4195         enum amd_reset_method reset_method;
4196         struct pci_dev *p = NULL;
4197         u64 expires;
4198
4199         /*
4200          * For now, only BACO and mode1 reset are confirmed to
4201          * suffer the audio issue if the audio device is not properly suspended.
4202          */
4203         reset_method = amdgpu_asic_reset_method(adev);
4204         if ((reset_method != AMD_RESET_METHOD_BACO) &&
4205              (reset_method != AMD_RESET_METHOD_MODE1))
4206                 return -EINVAL;
4207
4208         p = pci_get_domain_bus_and_slot(pci_domain_nr(adev->pdev->bus),
4209                         adev->pdev->bus->number, 1);
4210         if (!p)
4211                 return -ENODEV;
4212
4213         expires = pm_runtime_autosuspend_expiration(&(p->dev));
4214         if (!expires)
4215                 /*
4216                  * If we cannot get the audio device autosuspend delay,
4217                  * a fixed 4s interval is used. Since 3s is the audio
4218                  * controller's default autosuspend delay setting, the 4s
4219                  * used here is guaranteed to cover it.
4220                  */
4221                 expires = ktime_get_mono_fast_ns() + NSEC_PER_SEC * 4ULL;
4222
4223         while (!pm_runtime_status_suspended(&(p->dev))) {
4224                 if (!pm_runtime_suspend(&(p->dev)))
4225                         break;
4226
4227                 if (expires < ktime_get_mono_fast_ns()) {
4228                         dev_warn(adev->dev, "failed to suspend display audio\n");
4229                         /* TODO: abort the succeeding gpu reset? */
4230                         return -ETIMEDOUT;
4231                 }
4232         }
4233
4234         pm_runtime_disable(&(p->dev));
4235
4236         return 0;
4237 }
4238
4239 /**
4240  * amdgpu_device_gpu_recover - reset the asic and recover scheduler
4241  *
4242  * @adev: amdgpu device pointer
4243  * @job: which job trigger hang
4244  *
4245  * Attempt to reset the GPU if it has hung (all asics).
4246  * Attempt to do a soft reset or a full reset and reinitialize the ASIC.
4247  * Returns 0 for success or an error on failure.
4248  */
4249
4250 int amdgpu_device_gpu_recover(struct amdgpu_device *adev,
4251                               struct amdgpu_job *job)
4252 {
4253         struct list_head device_list, *device_list_handle =  NULL;
4254         bool need_full_reset = false;
4255         bool job_signaled = false;
4256         struct amdgpu_hive_info *hive = NULL;
4257         struct amdgpu_device *tmp_adev = NULL;
4258         int i, r = 0;
4259         bool in_ras_intr = amdgpu_ras_intr_triggered();
4260         bool use_baco =
4261                 (amdgpu_asic_reset_method(adev) == AMD_RESET_METHOD_BACO);
4263         bool audio_suspended = false;
4264
4265         /*
4266          * Flush RAM to disk so that after reboot
4267          * the user can read the log and see why the system rebooted.
4268          */
4269         if (in_ras_intr && !use_baco && amdgpu_ras_get_context(adev)->reboot) {
4270
4271                 DRM_WARN("Emergency reboot.");
4272
4273                 ksys_sync_helper();
4274                 emergency_restart();
4275         }
4276
4277         dev_info(adev->dev, "GPU %s begin!\n",
4278                 (in_ras_intr && !use_baco) ? "jobs stop":"reset");
4279
4280         /*
4281          * Here we trylock to avoid a chain of resets executing, whether
4282          * triggered by jobs on different adevs in an XGMI hive or by jobs on
4283          * different schedulers for the same device, while this TO handler runs.
4284          * We always reset all schedulers for a device and all devices in an
4285          * XGMI hive, so that should take care of them too.
4286          */
4287         hive = amdgpu_get_xgmi_hive(adev, true);
4288         if (hive && !mutex_trylock(&hive->reset_lock)) {
4289                 DRM_INFO("Bailing on TDR for s_job:%llx, hive: %llx as another already in progress",
4290                           job ? job->base.id : -1, hive->hive_id);
4291                 mutex_unlock(&hive->hive_lock);
4292                 return 0;
4293         }
4294
4295         /*
4296          * Build list of devices to reset.
4297          * If we are in XGMI hive mode, reorder the device list
4298          * so that adev is in the first position.
4299          */
4300         INIT_LIST_HEAD(&device_list);
4301         if (adev->gmc.xgmi.num_physical_nodes > 1) {
4302                 if (!hive)
4303                         return -ENODEV;
4304                 if (!list_is_first(&adev->gmc.xgmi.head, &hive->device_list))
4305                         list_rotate_to_front(&adev->gmc.xgmi.head, &hive->device_list);
4306                 device_list_handle = &hive->device_list;
4307         } else {
4308                 list_add_tail(&adev->gmc.xgmi.head, &device_list);
4309                 device_list_handle = &device_list;
4310         }
4311
4312         /* block all schedulers and reset given job's ring */
4313         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4314                 if (!amdgpu_device_lock_adev(tmp_adev, !hive)) {
4315                         DRM_INFO("Bailing on TDR for s_job:%llx, as another already in progress",
4316                                   job ? job->base.id : -1);
4317                         mutex_unlock(&hive->hive_lock);
4318                         return 0;
4319                 }
4320
4321                 /*
4322                  * Try to put the audio codec into the suspend state
4323                  * before the gpu reset starts.
4324                  *
4325                  * The power domain of the graphics device is shared
4326                  * with the AZ (audio) power domain. Without this step,
4327                  * we may change the audio hardware behind the audio
4328                  * driver's back, which would trigger audio codec
4329                  * errors.
4330                  */
4331                 if (!amdgpu_device_suspend_display_audio(tmp_adev))
4332                         audio_suspended = true;
4333
4334                 amdgpu_ras_set_error_query_ready(tmp_adev, false);
4335
4336                 cancel_delayed_work_sync(&tmp_adev->delayed_init_work);
4337
4338                 if (!amdgpu_sriov_vf(tmp_adev))
4339                         amdgpu_amdkfd_pre_reset(tmp_adev);
4340
4341                 /*
4342                  * Mark these ASICs to be reset as untracked first,
4343                  * and add them back after the reset completes.
4344                  */
4345                 amdgpu_unregister_gpu_instance(tmp_adev);
4346
4347                 amdgpu_fbdev_set_suspend(tmp_adev, 1);
4348
4349                 /* disable ras on ALL IPs */
4350                 if (!(in_ras_intr && !use_baco) &&
4351                       amdgpu_device_ip_need_full_reset(tmp_adev))
4352                         amdgpu_ras_suspend(tmp_adev);
4353
4354                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4355                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4356
4357                         if (!ring || !ring->sched.thread)
4358                                 continue;
4359
4360                         drm_sched_stop(&ring->sched, job ? &job->base : NULL);
4361
4362                         if (in_ras_intr && !use_baco)
4363                                 amdgpu_job_stop_all_jobs_on_sched(&ring->sched);
4364                 }
4365         }
4366
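             /* On a RAS fatal interrupt handled without BACO, only the jobs are
              * stopped; skip the ASIC reset and the scheduler resume below. */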
4367         if (in_ras_intr && !use_baco)
4368                 goto skip_sched_resume;
4369
4370         /*
4371          * Must check guilty signal here since after this point all old
4372          * HW fences are force signaled.
4373          *
4374          * job->base holds a reference to parent fence
4375          */
4376         if (job && job->base.s_fence->parent &&
4377             dma_fence_is_signaled(job->base.s_fence->parent)) {
4378                 job_signaled = true;
4379                 dev_info(adev->dev, "Guilty job already signaled, skipping HW reset");
4380                 goto skip_hw_reset;
4381         }
4382
4383 retry:  /* Pre-ASIC reset for the rest of the adevs in the XGMI hive. */
4384         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4385                 r = amdgpu_device_pre_asic_reset(tmp_adev,
4386                                                  NULL,
4387                                                  &need_full_reset);
4388                 /* TODO: Should we stop? */
4389                 if (r) {
4390                         DRM_ERROR("GPU pre asic reset failed with err, %d for drm dev, %s ",
4391                                   r, tmp_adev->ddev->unique);
4392                         tmp_adev->asic_reset_res = r;
4393                 }
4394         }
4395
4396         /* Actual ASIC resets if needed. */
4397         /* TODO Implement XGMI hive reset logic for SRIOV */
4398         if (amdgpu_sriov_vf(adev)) {
4399                 r = amdgpu_device_reset_sriov(adev, job ? false : true);
4400                 if (r)
4401                         adev->asic_reset_res = r;
4402         } else {
4403                 r  = amdgpu_do_asic_reset(hive, device_list_handle, &need_full_reset);
4404                 if (r == -EAGAIN)
4405                         goto retry;
4406         }
4407
4408 skip_hw_reset:
4409
4410         /* Post-ASIC reset for all devs. */
4411         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4412
4413                 for (i = 0; i < AMDGPU_MAX_RINGS; ++i) {
4414                         struct amdgpu_ring *ring = tmp_adev->rings[i];
4415
4416                         if (!ring || !ring->sched.thread)
4417                                 continue;
4418
4419                         /* No point in resubmitting jobs if we didn't do a HW reset */
4420                         if (!tmp_adev->asic_reset_res && !job_signaled)
4421                                 drm_sched_resubmit_jobs(&ring->sched);
4422
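                             /* Restart the scheduler; full recovery only if the ASIC reset succeeded. */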
4423                         drm_sched_start(&ring->sched, !tmp_adev->asic_reset_res);
4424                 }
4425
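                     /* Legacy (non-DC) display path: force a modeset so the
                      * displays come back after the reset. */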
4426                 if (!amdgpu_device_has_dc_support(tmp_adev) && !job_signaled) {
4427                         drm_helper_resume_force_mode(tmp_adev->ddev);
4428                 }
4429
4430                 tmp_adev->asic_reset_res = 0;
4431
4432                 if (r) {
4433                         /* bad news, how to tell it to userspace ? */
4434                         dev_info(tmp_adev->dev, "GPU reset(%d) failed\n", atomic_read(&tmp_adev->gpu_reset_counter));
4435                         amdgpu_vf_error_put(tmp_adev, AMDGIM_ERROR_VF_GPU_RESET_FAIL, 0, r);
4436                 } else {
4437                         dev_info(tmp_adev->dev, "GPU reset(%d) succeeded!\n", atomic_read(&tmp_adev->gpu_reset_counter));
4438                 }
4439         }
4440
4441 skip_sched_resume:
4442         list_for_each_entry(tmp_adev, device_list_handle, gmc.xgmi.head) {
4443                 /* Unlock kfd: SRIOV does it separately */
4444                 if (!(in_ras_intr && !use_baco) && !amdgpu_sriov_vf(tmp_adev))
4445                         amdgpu_amdkfd_post_reset(tmp_adev);
4446                 if (audio_suspended)
4447                         amdgpu_device_resume_display_audio(tmp_adev);
4448                 amdgpu_device_unlock_adev(tmp_adev);
4449         }
4450
4451         if (hive) {
4452                 mutex_unlock(&hive->reset_lock);
4453                 mutex_unlock(&hive->hive_lock);
4454         }
4455
4456         if (r)
4457                 dev_info(adev->dev, "GPU reset end with ret = %d\n", r);
4458         return r;
4459 }
4460
4461 /**
4462  * amdgpu_device_get_pcie_info - fetch pcie info about the PCIE slot
4463  *
4464  * @adev: amdgpu_device pointer
4465  *
4466  * Fetches and stores in the driver the PCIE capabilities (gen speed
4467  * and lanes) of the slot the device is in. Handles APUs and
4468  * virtualized environments where PCIE config space may not be available.
4469  */
4470 static void amdgpu_device_get_pcie_info(struct amdgpu_device *adev)
4471 {
4472         struct pci_dev *pdev;
4473         enum pci_bus_speed speed_cap, platform_speed_cap;
4474         enum pcie_link_width platform_link_width;
4475
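             /* Apply any forced caps from the amdgpu_pcie_gen_cap and
              * amdgpu_pcie_lane_cap overrides first. */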
4476         if (amdgpu_pcie_gen_cap)
4477                 adev->pm.pcie_gen_mask = amdgpu_pcie_gen_cap;
4478
4479         if (amdgpu_pcie_lane_cap)
4480                 adev->pm.pcie_mlw_mask = amdgpu_pcie_lane_cap;
4481
4482         /* covers APUs as well */
4483         if (pci_is_root_bus(adev->pdev->bus)) {
4484                 if (adev->pm.pcie_gen_mask == 0)
4485                         adev->pm.pcie_gen_mask = AMDGPU_DEFAULT_PCIE_GEN_MASK;
4486                 if (adev->pm.pcie_mlw_mask == 0)
4487                         adev->pm.pcie_mlw_mask = AMDGPU_DEFAULT_PCIE_MLW_MASK;
4488                 return;
4489         }
4490
4491         if (adev->pm.pcie_gen_mask && adev->pm.pcie_mlw_mask)
4492                 return;
4493
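             /* Determine the platform's supported link speed and width. */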
4494         pcie_bandwidth_available(adev->pdev, NULL,
4495                                  &platform_speed_cap, &platform_link_width);
4496
4497         if (adev->pm.pcie_gen_mask == 0) {
4498                 /* asic caps */
4499                 pdev = adev->pdev;
4500                 speed_cap = pcie_get_speed_cap(pdev);
4501                 if (speed_cap == PCI_SPEED_UNKNOWN) {
4502                         adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4503                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4504                                                   CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4505                 } else {
4506                         if (speed_cap == PCIE_SPEED_16_0GT)
4507                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4508                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4509                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4510                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN4);
4511                         else if (speed_cap == PCIE_SPEED_8_0GT)
4512                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4513                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4514                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN3);
4515                         else if (speed_cap == PCIE_SPEED_5_0GT)
4516                                 adev->pm.pcie_gen_mask |= (CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4517                                                           CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN2);
4518                         else
4519                                 adev->pm.pcie_gen_mask |= CAIL_ASIC_PCIE_LINK_SPEED_SUPPORT_GEN1;
4520                 }
4521                 /* platform caps */
4522                 if (platform_speed_cap == PCI_SPEED_UNKNOWN) {
4523                         adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4524                                                    CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4525                 } else {
4526                         if (platform_speed_cap == PCIE_SPEED_16_0GT)
4527                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4528                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4529                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3 |
4530                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN4);
4531                         else if (platform_speed_cap == PCIE_SPEED_8_0GT)
4532                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4533                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2 |
4534                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN3);
4535                         else if (platform_speed_cap == PCIE_SPEED_5_0GT)
4536                                 adev->pm.pcie_gen_mask |= (CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1 |
4537                                                            CAIL_PCIE_LINK_SPEED_SUPPORT_GEN2);
4538                         else
4539                                 adev->pm.pcie_gen_mask |= CAIL_PCIE_LINK_SPEED_SUPPORT_GEN1;
4540
4541                 }
4542         }
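             /* Derive the link-width mask from the platform cap if it has not
              * been set yet. */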
4543         if (adev->pm.pcie_mlw_mask == 0) {
4544                 if (platform_link_width == PCIE_LNK_WIDTH_UNKNOWN) {
4545                         adev->pm.pcie_mlw_mask |= AMDGPU_DEFAULT_PCIE_MLW_MASK;
4546                 } else {
4547                         switch (platform_link_width) {
4548                         case PCIE_LNK_X32:
4549                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X32 |
4550                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4551                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4552                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4553                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4554                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4555                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4556                                 break;
4557                         case PCIE_LNK_X16:
4558                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X16 |
4559                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4560                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4561                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4562                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4563                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4564                                 break;
4565                         case PCIE_LNK_X12:
4566                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X12 |
4567                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4568                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4569                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4570                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4571                                 break;
4572                         case PCIE_LNK_X8:
4573                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X8 |
4574                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4575                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4576                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4577                                 break;
4578                         case PCIE_LNK_X4:
4579                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X4 |
4580                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4581                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4582                                 break;
4583                         case PCIE_LNK_X2:
4584                                 adev->pm.pcie_mlw_mask = (CAIL_PCIE_LINK_WIDTH_SUPPORT_X2 |
4585                                                           CAIL_PCIE_LINK_WIDTH_SUPPORT_X1);
4586                                 break;
4587                         case PCIE_LNK_X1:
4588                                 adev->pm.pcie_mlw_mask = CAIL_PCIE_LINK_WIDTH_SUPPORT_X1;
4589                                 break;
4590                         default:
4591                                 break;
4592                         }
4593                 }
4594         }
4595 }
4596
4597 int amdgpu_device_baco_enter(struct drm_device *dev)
4598 {
4599         struct amdgpu_device *adev = dev->dev_private;
4600         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4601
4602         if (!amdgpu_device_supports_baco(adev->ddev))
4603                 return -ENOTSUPP;
4604
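             /* With RAS supported, keep doorbell interrupts disabled while in
              * BACO; amdgpu_device_baco_exit() re-enables them. */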
4605         if (ras && ras->supported)
4606                 adev->nbio.funcs->enable_doorbell_interrupt(adev, false);
4607
4608         return amdgpu_dpm_baco_enter(adev);
4609 }
4610
4611 int amdgpu_device_baco_exit(struct drm_device *dev)
4612 {
4613         struct amdgpu_device *adev = dev->dev_private;
4614         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
4615         int ret = 0;
4616
4617         if (!amdgpu_device_supports_baco(adev->ddev))
4618                 return -ENOTSUPP;
4619
4620         ret = amdgpu_dpm_baco_exit(adev);
4621         if (ret)
4622                 return ret;
4623
4624         if (ras && ras->supported)
4625                 adev->nbio.funcs->enable_doorbell_interrupt(adev, true);
4626
4627         return 0;
4628 }