drivers/gpu/drm/amd/amdgpu/amdgpu_ras.c

   1 /*
   2  * Copyright 2018 Advanced Micro Devices, Inc.
   3  *
   4  * Permission is hereby granted, free of charge, to any person obtaining a
   5  * copy of this software and associated documentation files (the "Software"),
   6  * to deal in the Software without restriction, including without limitation
   7  * the rights to use, copy, modify, merge, publish, distribute, sublicense,
   8  * and/or sell copies of the Software, and to permit persons to whom the
   9  * Software is furnished to do so, subject to the following conditions:
  10  *
  11  * The above copyright notice and this permission notice shall be included in
  12  * all copies or substantial portions of the Software.
  13  *
  14  * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
  15  * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
  16  * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL
  17  * THE COPYRIGHT HOLDER(S) OR AUTHOR(S) BE LIABLE FOR ANY CLAIM, DAMAGES OR
  18  * OTHER LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE,
  19  * ARISING FROM, OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR
  20  * OTHER DEALINGS IN THE SOFTWARE.
  21  *
  22  *
  23  */
  24 #include <linux/debugfs.h>
  25 #include <linux/list.h>
  26 #include <linux/module.h>
  27 #include <linux/uaccess.h>
  28
  29 #include "amdgpu.h"
  30 #include "amdgpu_ras.h"
  31 #include "amdgpu_atomfirmware.h"
  32
/*
 * Printable names of the RAS error types.  The table is read through
 * ras_err_str(), which indexes it with ffs() of the error type value:
 * entry 0 is used when no bit is set, entry N when the lowest set bit
 * is bit N-1.
 */
const char *ras_error_string[] = {
	"none",
	"parity",
	"single_correctable",
	"multi_uncorrectable",
	"poison",
};
  40
/*
 * Printable names of the RAS blocks, indexed by block id through
 * ras_block_str().  amdgpu_ras_find_block_id_by_name() performs the
 * reverse lookup, so a name's position in this table is its block id.
 */
const char *ras_block_string[] = {
	"umc",
	"sdma",
	"gfx",
	"mmhub",
	"athub",
	"pcie_bif",
	"hdp",
	"xgmi_wafl",
	"df",
	"smn",
	"sem",
	"mp0",
	"mp1",
	"fuse",
};
  57
/* Name of an error type: the table index is ffs() of the type value. */
#define ras_err_str(i) (ras_error_string[ffs(i)])
/* Name of a block: i indexes ras_block_string[] directly, with no range check. */
#define ras_block_str(i) (ras_block_string[i])

/*
 * Flag bits; amdgpu_ras_feature_enable_on_boot() tests
 * AMDGPU_RAS_FLAG_INIT_BY_VBIOS against the flags field of the RAS context.
 */
#define AMDGPU_RAS_FLAG_INIT_BY_VBIOS           1
#define AMDGPU_RAS_FLAG_INIT_NEED_RESET         2
#define RAS_DEFAULT_FLAGS (AMDGPU_RAS_FLAG_INIT_BY_VBIOS)
  64
/*
 * Forward declarations for the VRAM reserve/release helpers used by the
 * debugfs control write handler; both are static, so their definitions
 * live further down in this file.
 */
static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
		uint64_t offset, uint64_t size,
		struct amdgpu_bo **bo_ptr);
static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
		struct amdgpu_bo **bo_ptr);
  70
  71 static ssize_t amdgpu_ras_debugfs_read(struct file *f, char __user *buf,
  72                                         size_t size, loff_t *pos)
  73 {
  74         struct ras_manager *obj = (struct ras_manager *)file_inode(f)->i_private;
  75         struct ras_query_if info = {
  76                 .head = obj->head,
  77         };
  78         ssize_t s;
  79         char val[128];
  80
  81         if (amdgpu_ras_error_query(obj->adev, &info))
  82                 return -EINVAL;
  83
  84         s = snprintf(val, sizeof(val), "%s: %lu\n%s: %lu\n",
  85                         "ue", info.ue_count,
  86                         "ce", info.ce_count);
  87         if (*pos >= s)
  88                 return 0;
  89
  90         s -= *pos;
  91         s = min_t(u64, s, size);
  92
  93
  94         if (copy_to_user(buf, &val[*pos], s))
  95                 return -EINVAL;
  96
  97         *pos += s;
  98
  99         return s;
 100 }
 101
 102 static const struct file_operations amdgpu_ras_debugfs_ops = {
 103         .owner = THIS_MODULE,
 104         .read = amdgpu_ras_debugfs_read,
 105         .write = NULL,
 106         .llseek = default_llseek
 107 };
 108
 109 static int amdgpu_ras_find_block_id_by_name(const char *name, int *block_id)
 110 {
 111         int i;
 112
 113         for (i = 0; i < ARRAY_SIZE(ras_block_string); i++) {
 114                 *block_id = i;
 115                 if (strcmp(name, ras_block_str(i)) == 0)
 116                         return 0;
 117         }
 118         return -EINVAL;
 119 }
 120
 121 static int amdgpu_ras_debugfs_ctrl_parse_data(struct file *f,
 122                 const char __user *buf, size_t size,
 123                 loff_t *pos, struct ras_debug_if *data)
 124 {
 125         ssize_t s = min_t(u64, 64, size);
 126         char str[65];
 127         char block_name[33];
 128         char err[9] = "ue";
 129         int op = -1;
 130         int block_id;
 131         u64 address, value;
 132
 133         if (*pos)
 134                 return -EINVAL;
 135         *pos = size;
 136
 137         memset(str, 0, sizeof(str));
 138         memset(data, 0, sizeof(*data));
 139
 140         if (copy_from_user(str, buf, s))
 141                 return -EINVAL;
 142
 143         if (sscanf(str, "disable %32s", block_name) == 1)
 144                 op = 0;
 145         else if (sscanf(str, "enable %32s %8s", block_name, err) == 2)
 146                 op = 1;
 147         else if (sscanf(str, "inject %32s %8s", block_name, err) == 2)
 148                 op = 2;
 149         else if (str[0] && str[1] && str[2] && str[3])
 150                 /* ascii string, but commands are not matched. */
 151                 return -EINVAL;
 152
 153         if (op != -1) {
 154                 if (amdgpu_ras_find_block_id_by_name(block_name, &block_id))
 155                         return -EINVAL;
 156
 157                 data->head.block = block_id;
 158                 data->head.type = memcmp("ue", err, 2) == 0 ?
 159                         AMDGPU_RAS_ERROR__MULTI_UNCORRECTABLE :
 160                         AMDGPU_RAS_ERROR__SINGLE_CORRECTABLE;
 161                 data->op = op;
 162
 163                 if (op == 2) {
 164                         if (sscanf(str, "%*s %*s %*s %llu %llu",
 165                                                 &address, &value) != 2)
 166                                 if (sscanf(str, "%*s %*s %*s 0x%llx 0x%llx",
 167                                                         &address, &value) != 2)
 168                                         return -EINVAL;
 169                         data->inject.address = address;
 170                         data->inject.value = value;
 171                 }
 172         } else {
 173                 if (size < sizeof(*data))
 174                         return -EINVAL;
 175
 176                 if (copy_from_user(data, buf, sizeof(*data)))
 177                         return -EINVAL;
 178         }
 179
 180         return 0;
 181 }
 182 /**
 183  * DOC: AMDGPU RAS debugfs control interface
 184  *
 * It accepts struct ras_debug_if, which has two members.
 186  *
 187  * First member: ras_debug_if::head or ras_debug_if::inject.
 188  *
 189  * head is used to indicate which IP block will be under control.
 190  *
 191  * head has four members, they are block, type, sub_block_index, name.
 192  * block: which IP will be under control.
 193  * type: what kind of error will be enabled/disabled/injected.
 * sub_block_index: some IPs have subcomponents, e.g. GFX, SDMA.
 195  * name: the name of IP.
 196  *
 197  * inject has two more members than head, they are address, value.
 198  * As their names indicate, inject operation will write the
 199  * value to the address.
 200  *
 201  * Second member: struct ras_debug_if::op.
 202  * It has three kinds of operations.
 203  *  0: disable RAS on the block. Take ::head as its data.
 204  *  1: enable RAS on the block. Take ::head as its data.
 205  *  2: inject errors on the block. Take ::inject as its data.
 206  *
 207  * How to use the interface?
 208  * programs:
 209  * copy the struct ras_debug_if in your codes and initialize it.
 210  * write the struct to the control node.
 211  *
 212  * bash:
 213  * echo op block [error [address value]] > .../ras/ras_ctrl
 214  *      op: disable, enable, inject
 215  *              disable: only block is needed
 216  *              enable: block and error are needed
 217  *              inject: error, address, value are needed
 *      block: umc, sdma, gfx, .........
 219  *              see ras_block_string[] for details
 220  *      error: ue, ce
 221  *              ue: multi_uncorrectable
 222  *              ce: single_correctable
 223  *
 224  * here are some examples for bash commands,
 225  *      echo inject umc ue 0x0 0x0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 226  *      echo inject umc ce 0 0 > /sys/kernel/debug/dri/0/ras/ras_ctrl
 227  *      echo disable umc > /sys/kernel/debug/dri/0/ras/ras_ctrl
 228  *
 229  * How to check the result?
 230  *
 231  * For disable/enable, please check ras features at
 232  * /sys/class/drm/card[0/1/2...]/device/ras/features
 233  *
 234  * For inject, please check corresponding err count at
 235  * /sys/class/drm/card[0/1/2...]/device/ras/[gfx/sdma/...]_err_count
 236  *
 237  * NOTE: operation is only allowed on blocks which are supported.
 238  * Please check ras mask at /sys/module/amdgpu/parameters/ras_mask
 239  */
 240 static ssize_t amdgpu_ras_debugfs_ctrl_write(struct file *f, const char __user *buf,
 241                 size_t size, loff_t *pos)
 242 {
 243         struct amdgpu_device *adev = (struct amdgpu_device *)file_inode(f)->i_private;
 244         struct ras_debug_if data;
 245         struct amdgpu_bo *bo;
 246         int ret = 0;
 247
 248         ret = amdgpu_ras_debugfs_ctrl_parse_data(f, buf, size, pos, &data);
 249         if (ret)
 250                 return -EINVAL;
 251
 252         if (!amdgpu_ras_is_supported(adev, data.head.block))
 253                 return -EINVAL;
 254
 255         switch (data.op) {
 256         case 0:
 257                 ret = amdgpu_ras_feature_enable(adev, &data.head, 0);
 258                 break;
 259         case 1:
 260                 ret = amdgpu_ras_feature_enable(adev, &data.head, 1);
 261                 break;
 262         case 2:
 263                 ret = amdgpu_ras_reserve_vram(adev,
 264                                 data.inject.address, PAGE_SIZE, &bo);
 265                 if (ret) {
 266                         /* address was offset, now it is absolute.*/
 267                         data.inject.address += adev->gmc.vram_start;
 268                         if (data.inject.address > adev->gmc.vram_end)
 269                                 break;
 270                 } else
 271                         data.inject.address = amdgpu_bo_gpu_offset(bo);
 272                 ret = amdgpu_ras_error_inject(adev, &data.inject);
 273                 amdgpu_ras_release_vram(adev, &bo);
 274                 break;
 275         default:
 276                 ret = -EINVAL;
 277                 break;
 278         };
 279
 280         if (ret)
 281                 return -EINVAL;
 282
 283         return size;
 284 }
 285
 286 static const struct file_operations amdgpu_ras_debugfs_ctrl_ops = {
 287         .owner = THIS_MODULE,
 288         .read = NULL,
 289         .write = amdgpu_ras_debugfs_ctrl_write,
 290         .llseek = default_llseek
 291 };
 292
 293 static ssize_t amdgpu_ras_sysfs_read(struct device *dev,
 294                 struct device_attribute *attr, char *buf)
 295 {
 296         struct ras_manager *obj = container_of(attr, struct ras_manager, sysfs_attr);
 297         struct ras_query_if info = {
 298                 .head = obj->head,
 299         };
 300
 301         if (amdgpu_ras_error_query(obj->adev, &info))
 302                 return -EINVAL;
 303
 304         return snprintf(buf, PAGE_SIZE, "%s: %lu\n%s: %lu\n",
 305                         "ue", info.ue_count,
 306                         "ce", info.ce_count);
 307 }
 308
 309 /* obj begin */
 310
 311 #define get_obj(obj) do { (obj)->use++; } while (0)
 312 #define alive_obj(obj) ((obj)->use)
 313
 314 static inline void put_obj(struct ras_manager *obj)
 315 {
 316         if (obj && --obj->use == 0)
 317                 list_del(&obj->node);
 318         if (obj && obj->use < 0) {
 319                  DRM_ERROR("RAS ERROR: Unbalance obj(%s) use\n", obj->head.name);
 320         }
 321 }
 322
 323 /* make one obj and return it. */
 324 static struct ras_manager *amdgpu_ras_create_obj(struct amdgpu_device *adev,
 325                 struct ras_common_if *head)
 326 {
 327         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 328         struct ras_manager *obj;
 329
 330         if (!con)
 331                 return NULL;
 332
 333         if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
 334                 return NULL;
 335
 336         obj = &con->objs[head->block];
 337         /* already exist. return obj? */
 338         if (alive_obj(obj))
 339                 return NULL;
 340
 341         obj->head = *head;
 342         obj->adev = adev;
 343         list_add(&obj->node, &con->head);
 344         get_obj(obj);
 345
 346         return obj;
 347 }
 348
 349 /* return an obj equal to head, or the first when head is NULL */
 350 static struct ras_manager *amdgpu_ras_find_obj(struct amdgpu_device *adev,
 351                 struct ras_common_if *head)
 352 {
 353         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 354         struct ras_manager *obj;
 355         int i;
 356
 357         if (!con)
 358                 return NULL;
 359
 360         if (head) {
 361                 if (head->block >= AMDGPU_RAS_BLOCK_COUNT)
 362                         return NULL;
 363
 364                 obj = &con->objs[head->block];
 365
 366                 if (alive_obj(obj)) {
 367                         WARN_ON(head->block != obj->head.block);
 368                         return obj;
 369                 }
 370         } else {
 371                 for (i = 0; i < AMDGPU_RAS_BLOCK_COUNT; i++) {
 372                         obj = &con->objs[i];
 373                         if (alive_obj(obj)) {
 374                                 WARN_ON(i != obj->head.block);
 375                                 return obj;
 376                         }
 377                 }
 378         }
 379
 380         return NULL;
 381 }
 382 /* obj end */
 383
 384 /* feature ctl begin */
 385 static int amdgpu_ras_is_feature_allowed(struct amdgpu_device *adev,
 386                 struct ras_common_if *head)
 387 {
 388         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 389
 390         return con->hw_supported & BIT(head->block);
 391 }
 392
 393 static int amdgpu_ras_is_feature_enabled(struct amdgpu_device *adev,
 394                 struct ras_common_if *head)
 395 {
 396         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 397
 398         return con->features & BIT(head->block);
 399 }
 400
 401 /*
 402  * if obj is not created, then create one.
 403  * set feature enable flag.
 404  */
 405 static int __amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 406                 struct ras_common_if *head, int enable)
 407 {
 408         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 409         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 410
 411         /* If hardware does not support ras, then do not create obj.
 412          * But if hardware support ras, we can create the obj.
 413          * Ras framework checks con->hw_supported to see if it need do
 414          * corresponding initialization.
 415          * IP checks con->support to see if it need disable ras.
 416          */
 417         if (!amdgpu_ras_is_feature_allowed(adev, head))
 418                 return 0;
 419         if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
 420                 return 0;
 421
 422         if (enable) {
 423                 if (!obj) {
 424                         obj = amdgpu_ras_create_obj(adev, head);
 425                         if (!obj)
 426                                 return -EINVAL;
 427                 } else {
 428                         /* In case we create obj somewhere else */
 429                         get_obj(obj);
 430                 }
 431                 con->features |= BIT(head->block);
 432         } else {
 433                 if (obj && amdgpu_ras_is_feature_enabled(adev, head)) {
 434                         con->features &= ~BIT(head->block);
 435                         put_obj(obj);
 436                 }
 437         }
 438
 439         return 0;
 440 }
 441
 442 /* wrapper of psp_ras_enable_features */
 443 int amdgpu_ras_feature_enable(struct amdgpu_device *adev,
 444                 struct ras_common_if *head, bool enable)
 445 {
 446         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 447         union ta_ras_cmd_input info;
 448         int ret;
 449
 450         if (!con)
 451                 return -EINVAL;
 452
 453         if (!enable) {
 454                 info.disable_features = (struct ta_ras_disable_features_input) {
 455                         .block_id =  amdgpu_ras_block_to_ta(head->block),
 456                         .error_type = amdgpu_ras_error_to_ta(head->type),
 457                 };
 458         } else {
 459                 info.enable_features = (struct ta_ras_enable_features_input) {
 460                         .block_id =  amdgpu_ras_block_to_ta(head->block),
 461                         .error_type = amdgpu_ras_error_to_ta(head->type),
 462                 };
 463         }
 464
 465         /* Do not enable if it is not allowed. */
 466         WARN_ON(enable && !amdgpu_ras_is_feature_allowed(adev, head));
 467         /* Are we alerady in that state we are going to set? */
 468         if (!(!!enable ^ !!amdgpu_ras_is_feature_enabled(adev, head)))
 469                 return 0;
 470
 471         ret = psp_ras_enable_features(&adev->psp, &info, enable);
 472         if (ret) {
 473                 DRM_ERROR("RAS ERROR: %s %s feature failed ret %d\n",
 474                                 enable ? "enable":"disable",
 475                                 ras_block_str(head->block),
 476                                 ret);
 477                 if (ret == TA_RAS_STATUS__RESET_NEEDED)
 478                         return -EAGAIN;
 479                 return -EINVAL;
 480         }
 481
 482         /* setup the obj */
 483         __amdgpu_ras_feature_enable(adev, head, enable);
 484
 485         return 0;
 486 }
 487
 488 /* Only used in device probe stage and called only once. */
 489 int amdgpu_ras_feature_enable_on_boot(struct amdgpu_device *adev,
 490                 struct ras_common_if *head, bool enable)
 491 {
 492         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 493         int ret;
 494
 495         if (!con)
 496                 return -EINVAL;
 497
 498         if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
 499                 if (enable) {
 500                         /* There is no harm to issue a ras TA cmd regardless of
 501                          * the currecnt ras state.
 502                          * If current state == target state, it will do nothing
 503                          * But sometimes it requests driver to reset and repost
 504                          * with error code -EAGAIN.
 505                          */
 506                         ret = amdgpu_ras_feature_enable(adev, head, 1);
 507                         /* With old ras TA, we might fail to enable ras.
 508                          * Log it and just setup the object.
 509                          * TODO need remove this WA in the future.
 510                          */
 511                         if (ret == -EINVAL) {
 512                                 ret = __amdgpu_ras_feature_enable(adev, head, 1);
 513                                 if (!ret)
 514                                         DRM_INFO("RAS INFO: %s setup object\n",
 515                                                 ras_block_str(head->block));
 516                         }
 517                 } else {
 518                         /* setup the object then issue a ras TA disable cmd.*/
 519                         ret = __amdgpu_ras_feature_enable(adev, head, 1);
 520                         if (ret)
 521                                 return ret;
 522
 523                         ret = amdgpu_ras_feature_enable(adev, head, 0);
 524                 }
 525         } else
 526                 ret = amdgpu_ras_feature_enable(adev, head, enable);
 527
 528         return ret;
 529 }
 530
 531 static int amdgpu_ras_disable_all_features(struct amdgpu_device *adev,
 532                 bool bypass)
 533 {
 534         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 535         struct ras_manager *obj, *tmp;
 536
 537         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 538                 /* bypass psp.
 539                  * aka just release the obj and corresponding flags
 540                  */
 541                 if (bypass) {
 542                         if (__amdgpu_ras_feature_enable(adev, &obj->head, 0))
 543                                 break;
 544                 } else {
 545                         if (amdgpu_ras_feature_enable(adev, &obj->head, 0))
 546                                 break;
 547                 }
 548         }
 549
 550         return con->features;
 551 }
 552
 553 static int amdgpu_ras_enable_all_features(struct amdgpu_device *adev,
 554                 bool bypass)
 555 {
 556         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 557         int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
 558         int i;
 559         const enum amdgpu_ras_error_type default_ras_type =
 560                 AMDGPU_RAS_ERROR__NONE;
 561
 562         for (i = 0; i < ras_block_count; i++) {
 563                 struct ras_common_if head = {
 564                         .block = i,
 565                         .type = default_ras_type,
 566                         .sub_block_index = 0,
 567                 };
 568                 strcpy(head.name, ras_block_str(i));
 569                 if (bypass) {
 570                         /*
 571                          * bypass psp. vbios enable ras for us.
 572                          * so just create the obj
 573                          */
 574                         if (__amdgpu_ras_feature_enable(adev, &head, 1))
 575                                 break;
 576                 } else {
 577                         if (amdgpu_ras_feature_enable(adev, &head, 1))
 578                                 break;
 579                 }
 580         }
 581
 582         return con->features;
 583 }
 584 /* feature ctl end */
 585
 586 /* query/inject/cure begin */
 587 int amdgpu_ras_error_query(struct amdgpu_device *adev,
 588                 struct ras_query_if *info)
 589 {
 590         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 591         struct ras_err_data err_data = {0, 0};
 592
 593         if (!obj)
 594                 return -EINVAL;
 595
 596         switch (info->head.block) {
 597         case AMDGPU_RAS_BLOCK__UMC:
 598                 if (adev->umc.funcs->query_ras_error_count)
 599                         adev->umc.funcs->query_ras_error_count(adev, &err_data);
 600                 break;
 601         default:
 602                 break;
 603         }
 604
 605         obj->err_data.ue_count += err_data.ue_count;
 606         obj->err_data.ce_count += err_data.ce_count;
 607
 608         info->ue_count = obj->err_data.ue_count;
 609         info->ce_count = obj->err_data.ce_count;
 610
 611         if (err_data.ce_count)
 612                 dev_info(adev->dev, "%ld correctable errors detected in %s block\n",
 613                          obj->err_data.ce_count, ras_block_str(info->head.block));
 614         if (err_data.ue_count)
 615                 dev_info(adev->dev, "%ld uncorrectable errors detected in %s block\n",
 616                          obj->err_data.ue_count, ras_block_str(info->head.block));
 617
 618         return 0;
 619 }
 620
 621 /* wrapper of psp_ras_trigger_error */
 622 int amdgpu_ras_error_inject(struct amdgpu_device *adev,
 623                 struct ras_inject_if *info)
 624 {
 625         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
 626         struct ta_ras_trigger_error_input block_info = {
 627                 .block_id =  amdgpu_ras_block_to_ta(info->head.block),
 628                 .inject_error_type = amdgpu_ras_error_to_ta(info->head.type),
 629                 .sub_block_index = info->head.sub_block_index,
 630                 .address = info->address,
 631                 .value = info->value,
 632         };
 633         int ret = 0;
 634
 635         if (!obj)
 636                 return -EINVAL;
 637
 638         if (block_info.block_id != TA_RAS_BLOCK__UMC) {
 639                 DRM_INFO("%s error injection is not supported yet\n",
 640                          ras_block_str(info->head.block));
 641                 return -EINVAL;
 642         }
 643
 644         ret = psp_ras_trigger_error(&adev->psp, &block_info);
 645         if (ret)
 646                 DRM_ERROR("RAS ERROR: inject %s error failed ret %d\n",
 647                                 ras_block_str(info->head.block),
 648                                 ret);
 649
 650         return ret;
 651 }
 652
 653 int amdgpu_ras_error_cure(struct amdgpu_device *adev,
 654                 struct ras_cure_if *info)
 655 {
 656         /* psp fw has no cure interface for now. */
 657         return 0;
 658 }
 659
 660 /* get the total error counts on all IPs */
 661 int amdgpu_ras_query_error_count(struct amdgpu_device *adev,
 662                 bool is_ce)
 663 {
 664         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 665         struct ras_manager *obj;
 666         struct ras_err_data data = {0, 0};
 667
 668         if (!con)
 669                 return -EINVAL;
 670
 671         list_for_each_entry(obj, &con->head, node) {
 672                 struct ras_query_if info = {
 673                         .head = obj->head,
 674                 };
 675
 676                 if (amdgpu_ras_error_query(adev, &info))
 677                         return -EINVAL;
 678
 679                 data.ce_count += info.ce_count;
 680                 data.ue_count += info.ue_count;
 681         }
 682
 683         return is_ce ? data.ce_count : data.ue_count;
 684 }
 685 /* query/inject/cure end */
 686
 687
 688 /* sysfs begin */
 689
/*
 * Forward declaration: used by amdgpu_ras_sysfs_badpages_read(), which
 * passes in a NULL array pointer and a zero count, treats a non-zero
 * return as failure and kfree()s the returned array afterwards.
 */
static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
		struct ras_badpage **bps, unsigned int *count);
 692
 693 static char *amdgpu_ras_badpage_flags_str(unsigned int flags)
 694 {
 695         switch (flags) {
 696         case 0:
 697                 return "R";
 698         case 1:
 699                 return "P";
 700         case 2:
 701         default:
 702                 return "F";
 703         };
 704 }
 705
 706 /*
 707  * DOC: ras sysfs gpu_vram_bad_pages interface
 708  *
 709  * It allows user to read the bad pages of vram on the gpu through
 710  * /sys/class/drm/card[0/1/2...]/device/ras/gpu_vram_bad_pages
 711  *
 712  * It outputs multiple lines, and each line stands for one gpu page.
 713  *
 714  * The format of one line is below,
 715  * gpu pfn : gpu page size : flags
 716  *
 717  * gpu pfn and gpu page size are printed in hex format.
 718  * flags can be one of below character,
 719  * R: reserved, this gpu page is reserved and not able to use.
 720  * P: pending for reserve, this gpu page is marked as bad, will be reserved
 721  *    in next window of page_reserve.
 722  * F: unable to reserve. this gpu page can't be reserved due to some reasons.
 723  *
 724  * examples:
 725  * 0x00000001 : 0x00001000 : R
 726  * 0x00000002 : 0x00001000 : P
 727  */
 728
 729 static ssize_t amdgpu_ras_sysfs_badpages_read(struct file *f,
 730                 struct kobject *kobj, struct bin_attribute *attr,
 731                 char *buf, loff_t ppos, size_t count)
 732 {
 733         struct amdgpu_ras *con =
 734                 container_of(attr, struct amdgpu_ras, badpages_attr);
 735         struct amdgpu_device *adev = con->adev;
 736         const unsigned int element_size =
 737                 sizeof("0xabcdabcd : 0x12345678 : R\n") - 1;
 738         unsigned int start = div64_ul(ppos + element_size - 1, element_size);
 739         unsigned int end = div64_ul(ppos + count - 1, element_size);
 740         ssize_t s = 0;
 741         struct ras_badpage *bps = NULL;
 742         unsigned int bps_count = 0;
 743
 744         memset(buf, 0, count);
 745
 746         if (amdgpu_ras_badpages_read(adev, &bps, &bps_count))
 747                 return 0;
 748
 749         for (; start < end && start < bps_count; start++)
 750                 s += scnprintf(&buf[s], element_size + 1,
 751                                 "0x%08x : 0x%08x : %1s\n",
 752                                 bps[start].bp,
 753                                 bps[start].size,
 754                                 amdgpu_ras_badpage_flags_str(bps[start].flags));
 755
 756         kfree(bps);
 757
 758         return s;
 759 }
 760
 761 static ssize_t amdgpu_ras_sysfs_features_read(struct device *dev,
 762                 struct device_attribute *attr, char *buf)
 763 {
 764         struct amdgpu_ras *con =
 765                 container_of(attr, struct amdgpu_ras, features_attr);
 766         struct drm_device *ddev = dev_get_drvdata(dev);
 767         struct amdgpu_device *adev = ddev->dev_private;
 768         struct ras_common_if head;
 769         int ras_block_count = AMDGPU_RAS_BLOCK_COUNT;
 770         int i;
 771         ssize_t s;
 772         struct ras_manager *obj;
 773
 774         s = scnprintf(buf, PAGE_SIZE, "feature mask: 0x%x\n", con->features);
 775
 776         for (i = 0; i < ras_block_count; i++) {
 777                 head.block = i;
 778
 779                 if (amdgpu_ras_is_feature_enabled(adev, &head)) {
 780                         obj = amdgpu_ras_find_obj(adev, &head);
 781                         s += scnprintf(&buf[s], PAGE_SIZE - s,
 782                                         "%s: %s\n",
 783                                         ras_block_str(i),
 784                                         ras_err_str(obj->head.type));
 785                 } else
 786                         s += scnprintf(&buf[s], PAGE_SIZE - s,
 787                                         "%s: disabled\n",
 788                                         ras_block_str(i));
 789         }
 790
 791         return s;
 792 }
 793
 794 static int amdgpu_ras_sysfs_create_feature_node(struct amdgpu_device *adev)
 795 {
 796         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 797         struct attribute *attrs[] = {
 798                 &con->features_attr.attr,
 799                 NULL
 800         };
 801         struct bin_attribute *bin_attrs[] = {
 802                 &con->badpages_attr,
 803                 NULL
 804         };
 805         struct attribute_group group = {
 806                 .name = "ras",
 807                 .attrs = attrs,
 808                 .bin_attrs = bin_attrs,
 809         };
 810
 811         con->features_attr = (struct device_attribute) {
 812                 .attr = {
 813                         .name = "features",
 814                         .mode = S_IRUGO,
 815                 },
 816                         .show = amdgpu_ras_sysfs_features_read,
 817         };
 818
 819         con->badpages_attr = (struct bin_attribute) {
 820                 .attr = {
 821                         .name = "gpu_vram_bad_pages",
 822                         .mode = S_IRUGO,
 823                 },
 824                 .size = 0,
 825                 .private = NULL,
 826                 .read = amdgpu_ras_sysfs_badpages_read,
 827         };
 828
 829         sysfs_attr_init(attrs[0]);
 830         sysfs_bin_attr_init(bin_attrs[0]);
 831
 832         return sysfs_create_group(&adev->dev->kobj, &group);
 833 }
 834
 835 static int amdgpu_ras_sysfs_remove_feature_node(struct amdgpu_device *adev)
 836 {
 837         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 838         struct attribute *attrs[] = {
 839                 &con->features_attr.attr,
 840                 NULL
 841         };
 842         struct bin_attribute *bin_attrs[] = {
 843                 &con->badpages_attr,
 844                 NULL
 845         };
 846         struct attribute_group group = {
 847                 .name = "ras",
 848                 .attrs = attrs,
 849                 .bin_attrs = bin_attrs,
 850         };
 851
 852         sysfs_remove_group(&adev->dev->kobj, &group);
 853
 854         return 0;
 855 }
 856
 857 int amdgpu_ras_sysfs_create(struct amdgpu_device *adev,
 858                 struct ras_fs_if *head)
 859 {
 860         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
 861
 862         if (!obj || obj->attr_inuse)
 863                 return -EINVAL;
 864
 865         get_obj(obj);
 866
 867         memcpy(obj->fs_data.sysfs_name,
 868                         head->sysfs_name,
 869                         sizeof(obj->fs_data.sysfs_name));
 870
 871         obj->sysfs_attr = (struct device_attribute){
 872                 .attr = {
 873                         .name = obj->fs_data.sysfs_name,
 874                         .mode = S_IRUGO,
 875                 },
 876                         .show = amdgpu_ras_sysfs_read,
 877         };
 878         sysfs_attr_init(&obj->sysfs_attr.attr);
 879
 880         if (sysfs_add_file_to_group(&adev->dev->kobj,
 881                                 &obj->sysfs_attr.attr,
 882                                 "ras")) {
 883                 put_obj(obj);
 884                 return -EINVAL;
 885         }
 886
 887         obj->attr_inuse = 1;
 888
 889         return 0;
 890 }
 891
 892 int amdgpu_ras_sysfs_remove(struct amdgpu_device *adev,
 893                 struct ras_common_if *head)
 894 {
 895         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 896
 897         if (!obj || !obj->attr_inuse)
 898                 return -EINVAL;
 899
 900         sysfs_remove_file_from_group(&adev->dev->kobj,
 901                                 &obj->sysfs_attr.attr,
 902                                 "ras");
 903         obj->attr_inuse = 0;
 904         put_obj(obj);
 905
 906         return 0;
 907 }
 908
 909 static int amdgpu_ras_sysfs_remove_all(struct amdgpu_device *adev)
 910 {
 911         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 912         struct ras_manager *obj, *tmp;
 913
 914         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 915                 amdgpu_ras_sysfs_remove(adev, &obj->head);
 916         }
 917
 918         amdgpu_ras_sysfs_remove_feature_node(adev);
 919
 920         return 0;
 921 }
 922 /* sysfs end */
 923
 924 /* debugfs begin */
 925 static void amdgpu_ras_debugfs_create_ctrl_node(struct amdgpu_device *adev)
 926 {
 927         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 928         struct drm_minor *minor = adev->ddev->primary;
 929
 930         con->dir = debugfs_create_dir("ras", minor->debugfs_root);
 931         con->ent = debugfs_create_file("ras_ctrl", S_IWUGO | S_IRUGO, con->dir,
 932                                        adev, &amdgpu_ras_debugfs_ctrl_ops);
 933 }
 934
 935 void amdgpu_ras_debugfs_create(struct amdgpu_device *adev,
 936                 struct ras_fs_if *head)
 937 {
 938         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 939         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &head->head);
 940
 941         if (!obj || obj->ent)
 942                 return;
 943
 944         get_obj(obj);
 945
 946         memcpy(obj->fs_data.debugfs_name,
 947                         head->debugfs_name,
 948                         sizeof(obj->fs_data.debugfs_name));
 949
 950         obj->ent = debugfs_create_file(obj->fs_data.debugfs_name,
 951                                        S_IWUGO | S_IRUGO, con->dir, obj,
 952                                        &amdgpu_ras_debugfs_ops);
 953 }
 954
 955 void amdgpu_ras_debugfs_remove(struct amdgpu_device *adev,
 956                 struct ras_common_if *head)
 957 {
 958         struct ras_manager *obj = amdgpu_ras_find_obj(adev, head);
 959
 960         if (!obj || !obj->ent)
 961                 return;
 962
 963         debugfs_remove(obj->ent);
 964         obj->ent = NULL;
 965         put_obj(obj);
 966 }
 967
 968 static void amdgpu_ras_debugfs_remove_all(struct amdgpu_device *adev)
 969 {
 970         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
 971         struct ras_manager *obj, *tmp;
 972
 973         list_for_each_entry_safe(obj, tmp, &con->head, node) {
 974                 amdgpu_ras_debugfs_remove(adev, &obj->head);
 975         }
 976
 977         debugfs_remove(con->ent);
 978         debugfs_remove(con->dir);
 979         con->dir = NULL;
 980         con->ent = NULL;
 981 }
 982 /* debugfs end */
 983
 984 /* ras fs */
 985
 986 static int amdgpu_ras_fs_init(struct amdgpu_device *adev)
 987 {
 988         amdgpu_ras_sysfs_create_feature_node(adev);
 989         amdgpu_ras_debugfs_create_ctrl_node(adev);
 990
 991         return 0;
 992 }
 993
 994 static int amdgpu_ras_fs_fini(struct amdgpu_device *adev)
 995 {
 996         amdgpu_ras_debugfs_remove_all(adev);
 997         amdgpu_ras_sysfs_remove_all(adev);
 998         return 0;
 999 }
1000 /* ras fs end */
1001
1002 /* ih begin */
1003 static void amdgpu_ras_interrupt_handler(struct ras_manager *obj)
1004 {
1005         struct ras_ih_data *data = &obj->ih_data;
1006         struct amdgpu_iv_entry entry;
1007         int ret;
1008         struct ras_err_data err_data = {0, 0};
1009
1010         while (data->rptr != data->wptr) {
1011                 rmb();
1012                 memcpy(&entry, &data->ring[data->rptr],
1013                                 data->element_size);
1014
1015                 wmb();
1016                 data->rptr = (data->aligned_element_size +
1017                                 data->rptr) % data->ring_size;
1018
1019                 /* Let IP handle its data, maybe we need get the output
1020                  * from the callback to udpate the error type/count, etc
1021                  */
1022                 if (data->cb) {
1023                         ret = data->cb(obj->adev, &entry);
1024                         /* ue will trigger an interrupt, and in that case
1025                          * we need do a reset to recovery the whole system.
1026                          * But leave IP do that recovery, here we just dispatch
1027                          * the error.
1028                          */
1029                         if (ret == AMDGPU_RAS_UE) {
1030                                 obj->err_data.ue_count++;
1031                         }
1032                         /* Might need get ce count by register, but not all IP
1033                          * saves ce count, some IP just use one bit or two bits
1034                          * to indicate ce happened.
1035                          */
1036                 }
1037         }
1038 }
1039
1040 static void amdgpu_ras_interrupt_process_handler(struct work_struct *work)
1041 {
1042         struct ras_ih_data *data =
1043                 container_of(work, struct ras_ih_data, ih_work);
1044         struct ras_manager *obj =
1045                 container_of(data, struct ras_manager, ih_data);
1046
1047         amdgpu_ras_interrupt_handler(obj);
1048 }
1049
1050 int amdgpu_ras_interrupt_dispatch(struct amdgpu_device *adev,
1051                 struct ras_dispatch_if *info)
1052 {
1053         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1054         struct ras_ih_data *data = &obj->ih_data;
1055
1056         if (!obj)
1057                 return -EINVAL;
1058
1059         if (data->inuse == 0)
1060                 return 0;
1061
1062         /* Might be overflow... */
1063         memcpy(&data->ring[data->wptr], info->entry,
1064                         data->element_size);
1065
1066         wmb();
1067         data->wptr = (data->aligned_element_size +
1068                         data->wptr) % data->ring_size;
1069
1070         schedule_work(&data->ih_work);
1071
1072         return 0;
1073 }
1074
1075 int amdgpu_ras_interrupt_remove_handler(struct amdgpu_device *adev,
1076                 struct ras_ih_if *info)
1077 {
1078         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1079         struct ras_ih_data *data;
1080
1081         if (!obj)
1082                 return -EINVAL;
1083
1084         data = &obj->ih_data;
1085         if (data->inuse == 0)
1086                 return 0;
1087
1088         cancel_work_sync(&data->ih_work);
1089
1090         kfree(data->ring);
1091         memset(data, 0, sizeof(*data));
1092         put_obj(obj);
1093
1094         return 0;
1095 }
1096
1097 int amdgpu_ras_interrupt_add_handler(struct amdgpu_device *adev,
1098                 struct ras_ih_if *info)
1099 {
1100         struct ras_manager *obj = amdgpu_ras_find_obj(adev, &info->head);
1101         struct ras_ih_data *data;
1102
1103         if (!obj) {
1104                 /* in case we registe the IH before enable ras feature */
1105                 obj = amdgpu_ras_create_obj(adev, &info->head);
1106                 if (!obj)
1107                         return -EINVAL;
1108         } else
1109                 get_obj(obj);
1110
1111         data = &obj->ih_data;
1112         /* add the callback.etc */
1113         *data = (struct ras_ih_data) {
1114                 .inuse = 0,
1115                 .cb = info->cb,
1116                 .element_size = sizeof(struct amdgpu_iv_entry),
1117                 .rptr = 0,
1118                 .wptr = 0,
1119         };
1120
1121         INIT_WORK(&data->ih_work, amdgpu_ras_interrupt_process_handler);
1122
1123         data->aligned_element_size = ALIGN(data->element_size, 8);
1124         /* the ring can store 64 iv entries. */
1125         data->ring_size = 64 * data->aligned_element_size;
1126         data->ring = kmalloc(data->ring_size, GFP_KERNEL);
1127         if (!data->ring) {
1128                 put_obj(obj);
1129                 return -ENOMEM;
1130         }
1131
1132         /* IH is ready */
1133         data->inuse = 1;
1134
1135         return 0;
1136 }
1137
1138 static int amdgpu_ras_interrupt_remove_all(struct amdgpu_device *adev)
1139 {
1140         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1141         struct ras_manager *obj, *tmp;
1142
1143         list_for_each_entry_safe(obj, tmp, &con->head, node) {
1144                 struct ras_ih_if info = {
1145                         .head = obj->head,
1146                 };
1147                 amdgpu_ras_interrupt_remove_handler(adev, &info);
1148         }
1149
1150         return 0;
1151 }
1152 /* ih end */
1153
1154 /* recovery begin */
1155
1156 /* return 0 on success.
1157  * caller need free bps.
1158  */
1159 static int amdgpu_ras_badpages_read(struct amdgpu_device *adev,
1160                 struct ras_badpage **bps, unsigned int *count)
1161 {
1162         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1163         struct ras_err_handler_data *data;
1164         int i = 0;
1165         int ret = 0;
1166
1167         if (!con || !con->eh_data || !bps || !count)
1168                 return -EINVAL;
1169
1170         mutex_lock(&con->recovery_lock);
1171         data = con->eh_data;
1172         if (!data || data->count == 0) {
1173                 *bps = NULL;
1174                 goto out;
1175         }
1176
1177         *bps = kmalloc(sizeof(struct ras_badpage) * data->count, GFP_KERNEL);
1178         if (!*bps) {
1179                 ret = -ENOMEM;
1180                 goto out;
1181         }
1182
1183         for (; i < data->count; i++) {
1184                 (*bps)[i] = (struct ras_badpage){
1185                         .bp = data->bps[i].bp,
1186                         .size = AMDGPU_GPU_PAGE_SIZE,
1187                         .flags = 0,
1188                 };
1189
1190                 if (data->last_reserved <= i)
1191                         (*bps)[i].flags = 1;
1192                 else if (data->bps[i].bo == NULL)
1193                         (*bps)[i].flags = 2;
1194         }
1195
1196         *count = data->count;
1197 out:
1198         mutex_unlock(&con->recovery_lock);
1199         return ret;
1200 }
1201
1202 static void amdgpu_ras_do_recovery(struct work_struct *work)
1203 {
1204         struct amdgpu_ras *ras =
1205                 container_of(work, struct amdgpu_ras, recovery_work);
1206
1207         amdgpu_device_gpu_recover(ras->adev, 0);
1208         atomic_set(&ras->in_recovery, 0);
1209 }
1210
1211 static int amdgpu_ras_release_vram(struct amdgpu_device *adev,
1212                 struct amdgpu_bo **bo_ptr)
1213 {
1214         /* no need to free it actually. */
1215         amdgpu_bo_free_kernel(bo_ptr, NULL, NULL);
1216         return 0;
1217 }
1218
1219 /* reserve vram with size@offset */
1220 static int amdgpu_ras_reserve_vram(struct amdgpu_device *adev,
1221                 uint64_t offset, uint64_t size,
1222                 struct amdgpu_bo **bo_ptr)
1223 {
1224         struct ttm_operation_ctx ctx = { false, false };
1225         struct amdgpu_bo_param bp;
1226         int r = 0;
1227         int i;
1228         struct amdgpu_bo *bo;
1229
1230         if (bo_ptr)
1231                 *bo_ptr = NULL;
1232         memset(&bp, 0, sizeof(bp));
1233         bp.size = size;
1234         bp.byte_align = PAGE_SIZE;
1235         bp.domain = AMDGPU_GEM_DOMAIN_VRAM;
1236         bp.flags = AMDGPU_GEM_CREATE_VRAM_CONTIGUOUS |
1237                 AMDGPU_GEM_CREATE_NO_CPU_ACCESS;
1238         bp.type = ttm_bo_type_kernel;
1239         bp.resv = NULL;
1240
1241         r = amdgpu_bo_create(adev, &bp, &bo);
1242         if (r)
1243                 return -EINVAL;
1244
1245         r = amdgpu_bo_reserve(bo, false);
1246         if (r)
1247                 goto error_reserve;
1248
1249         offset = ALIGN(offset, PAGE_SIZE);
1250         for (i = 0; i < bo->placement.num_placement; ++i) {
1251                 bo->placements[i].fpfn = offset >> PAGE_SHIFT;
1252                 bo->placements[i].lpfn = (offset + size) >> PAGE_SHIFT;
1253         }
1254
1255         ttm_bo_mem_put(&bo->tbo, &bo->tbo.mem);
1256         r = ttm_bo_mem_space(&bo->tbo, &bo->placement, &bo->tbo.mem, &ctx);
1257         if (r)
1258                 goto error_pin;
1259
1260         r = amdgpu_bo_pin_restricted(bo,
1261                         AMDGPU_GEM_DOMAIN_VRAM,
1262                         offset,
1263                         offset + size);
1264         if (r)
1265                 goto error_pin;
1266
1267         if (bo_ptr)
1268                 *bo_ptr = bo;
1269
1270         amdgpu_bo_unreserve(bo);
1271         return r;
1272
1273 error_pin:
1274         amdgpu_bo_unreserve(bo);
1275 error_reserve:
1276         amdgpu_bo_unref(&bo);
1277         return r;
1278 }
1279
1280 /* alloc/realloc bps array */
1281 static int amdgpu_ras_realloc_eh_data_space(struct amdgpu_device *adev,
1282                 struct ras_err_handler_data *data, int pages)
1283 {
1284         unsigned int old_space = data->count + data->space_left;
1285         unsigned int new_space = old_space + pages;
1286         unsigned int align_space = ALIGN(new_space, 1024);
1287         void *tmp = kmalloc(align_space * sizeof(*data->bps), GFP_KERNEL);
1288
1289         if (!tmp)
1290                 return -ENOMEM;
1291
1292         if (data->bps) {
1293                 memcpy(tmp, data->bps,
1294                                 data->count * sizeof(*data->bps));
1295                 kfree(data->bps);
1296         }
1297
1298         data->bps = tmp;
1299         data->space_left += align_space - old_space;
1300         return 0;
1301 }
1302
1303 /* it deal with vram only. */
1304 int amdgpu_ras_add_bad_pages(struct amdgpu_device *adev,
1305                 unsigned long *bps, int pages)
1306 {
1307         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1308         struct ras_err_handler_data *data;
1309         int i = pages;
1310         int ret = 0;
1311
1312         if (!con || !con->eh_data || !bps || pages <= 0)
1313                 return 0;
1314
1315         mutex_lock(&con->recovery_lock);
1316         data = con->eh_data;
1317         if (!data)
1318                 goto out;
1319
1320         if (data->space_left <= pages)
1321                 if (amdgpu_ras_realloc_eh_data_space(adev, data, pages)) {
1322                         ret = -ENOMEM;
1323                         goto out;
1324                 }
1325
1326         while (i--)
1327                 data->bps[data->count++].bp = bps[i];
1328
1329         data->space_left -= pages;
1330 out:
1331         mutex_unlock(&con->recovery_lock);
1332
1333         return ret;
1334 }
1335
1336 /* called in gpu recovery/init */
1337 int amdgpu_ras_reserve_bad_pages(struct amdgpu_device *adev)
1338 {
1339         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1340         struct ras_err_handler_data *data;
1341         uint64_t bp;
1342         struct amdgpu_bo *bo;
1343         int i;
1344
1345         if (!con || !con->eh_data)
1346                 return 0;
1347
1348         mutex_lock(&con->recovery_lock);
1349         data = con->eh_data;
1350         if (!data)
1351                 goto out;
1352         /* reserve vram at driver post stage. */
1353         for (i = data->last_reserved; i < data->count; i++) {
1354                 bp = data->bps[i].bp;
1355
1356                 if (amdgpu_ras_reserve_vram(adev, bp << PAGE_SHIFT,
1357                                         PAGE_SIZE, &bo))
1358                         DRM_ERROR("RAS ERROR: reserve vram %llx fail\n", bp);
1359
1360                 data->bps[i].bo = bo;
1361                 data->last_reserved = i + 1;
1362         }
1363 out:
1364         mutex_unlock(&con->recovery_lock);
1365         return 0;
1366 }
1367
1368 /* called when driver unload */
1369 static int amdgpu_ras_release_bad_pages(struct amdgpu_device *adev)
1370 {
1371         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1372         struct ras_err_handler_data *data;
1373         struct amdgpu_bo *bo;
1374         int i;
1375
1376         if (!con || !con->eh_data)
1377                 return 0;
1378
1379         mutex_lock(&con->recovery_lock);
1380         data = con->eh_data;
1381         if (!data)
1382                 goto out;
1383
1384         for (i = data->last_reserved - 1; i >= 0; i--) {
1385                 bo = data->bps[i].bo;
1386
1387                 amdgpu_ras_release_vram(adev, &bo);
1388
1389                 data->bps[i].bo = bo;
1390                 data->last_reserved = i;
1391         }
1392 out:
1393         mutex_unlock(&con->recovery_lock);
1394         return 0;
1395 }
1396
/* Placeholder for persisting the bad-page list. Currently does nothing and
 * returns 0.
 */
static int amdgpu_ras_save_bad_pages(struct amdgpu_device *adev)
{
	/* TODO: write the array to eeprom when SMU disabled. */
	return 0;
}
1404
/* Placeholder for restoring the bad-page list. Currently does nothing and
 * returns 0.
 */
static int amdgpu_ras_load_bad_pages(struct amdgpu_device *adev)
{
	/* TODO: read the array from eeprom when SMU disabled. */
	return 0;
}
1412
1413 static int amdgpu_ras_recovery_init(struct amdgpu_device *adev)
1414 {
1415         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1416         struct ras_err_handler_data **data = &con->eh_data;
1417
1418         *data = kmalloc(sizeof(**data),
1419                         GFP_KERNEL|__GFP_ZERO);
1420         if (!*data)
1421                 return -ENOMEM;
1422
1423         mutex_init(&con->recovery_lock);
1424         INIT_WORK(&con->recovery_work, amdgpu_ras_do_recovery);
1425         atomic_set(&con->in_recovery, 0);
1426         con->adev = adev;
1427
1428         amdgpu_ras_load_bad_pages(adev);
1429         amdgpu_ras_reserve_bad_pages(adev);
1430
1431         return 0;
1432 }
1433
1434 static int amdgpu_ras_recovery_fini(struct amdgpu_device *adev)
1435 {
1436         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1437         struct ras_err_handler_data *data = con->eh_data;
1438
1439         cancel_work_sync(&con->recovery_work);
1440         amdgpu_ras_save_bad_pages(adev);
1441         amdgpu_ras_release_bad_pages(adev);
1442
1443         mutex_lock(&con->recovery_lock);
1444         con->eh_data = NULL;
1445         kfree(data->bps);
1446         kfree(data);
1447         mutex_unlock(&con->recovery_lock);
1448
1449         return 0;
1450 }
1451 /* recovery end */
1452
1453 /* return 0 if ras will reset gpu and repost.*/
1454 int amdgpu_ras_request_reset_on_boot(struct amdgpu_device *adev,
1455                 unsigned int block)
1456 {
1457         struct amdgpu_ras *ras = amdgpu_ras_get_context(adev);
1458
1459         if (!ras)
1460                 return -EINVAL;
1461
1462         ras->flags |= AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1463         return 0;
1464 }
1465
1466 /*
1467  * check hardware's ras ability which will be saved in hw_supported.
1468  * if hardware does not support ras, we can skip some ras initializtion and
1469  * forbid some ras operations from IP.
1470  * if software itself, say boot parameter, limit the ras ability. We still
1471  * need allow IP do some limited operations, like disable. In such case,
1472  * we have to initialize ras as normal. but need check if operation is
1473  * allowed or not in each function.
1474  */
1475 static void amdgpu_ras_check_supported(struct amdgpu_device *adev,
1476                 uint32_t *hw_supported, uint32_t *supported)
1477 {
1478         *hw_supported = 0;
1479         *supported = 0;
1480
1481         if (amdgpu_sriov_vf(adev) ||
1482                         adev->asic_type != CHIP_VEGA20)
1483                 return;
1484
1485         if (adev->is_atom_fw &&
1486                         (amdgpu_atomfirmware_mem_ecc_supported(adev) ||
1487                          amdgpu_atomfirmware_sram_ecc_supported(adev)))
1488                 *hw_supported = AMDGPU_RAS_BLOCK_MASK;
1489
1490         *supported = amdgpu_ras_enable == 0 ?
1491                                 0 : *hw_supported & amdgpu_ras_mask;
1492 }
1493
1494 int amdgpu_ras_init(struct amdgpu_device *adev)
1495 {
1496         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1497
1498         if (con)
1499                 return 0;
1500
1501         con = kmalloc(sizeof(struct amdgpu_ras) +
1502                         sizeof(struct ras_manager) * AMDGPU_RAS_BLOCK_COUNT,
1503                         GFP_KERNEL|__GFP_ZERO);
1504         if (!con)
1505                 return -ENOMEM;
1506
1507         con->objs = (struct ras_manager *)(con + 1);
1508
1509         amdgpu_ras_set_context(adev, con);
1510
1511         amdgpu_ras_check_supported(adev, &con->hw_supported,
1512                         &con->supported);
1513         if (!con->hw_supported) {
1514                 amdgpu_ras_set_context(adev, NULL);
1515                 kfree(con);
1516                 return 0;
1517         }
1518
1519         con->features = 0;
1520         INIT_LIST_HEAD(&con->head);
1521         /* Might need get this flag from vbios. */
1522         con->flags = RAS_DEFAULT_FLAGS;
1523
1524         if (amdgpu_ras_recovery_init(adev))
1525                 goto recovery_out;
1526
1527         amdgpu_ras_mask &= AMDGPU_RAS_BLOCK_MASK;
1528
1529         if (amdgpu_ras_fs_init(adev))
1530                 goto fs_out;
1531
1532         DRM_INFO("RAS INFO: ras initialized successfully, "
1533                         "hardware ability[%x] ras_mask[%x]\n",
1534                         con->hw_supported, con->supported);
1535         return 0;
1536 fs_out:
1537         amdgpu_ras_recovery_fini(adev);
1538 recovery_out:
1539         amdgpu_ras_set_context(adev, NULL);
1540         kfree(con);
1541
1542         return -EINVAL;
1543 }
1544
1545 /* do some init work after IP late init as dependence.
1546  * and it runs in resume/gpu reset/booting up cases.
1547  */
1548 void amdgpu_ras_resume(struct amdgpu_device *adev)
1549 {
1550         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1551         struct ras_manager *obj, *tmp;
1552
1553         if (!con)
1554                 return;
1555
1556         if (con->flags & AMDGPU_RAS_FLAG_INIT_BY_VBIOS) {
1557                 /* Set up all other IPs which are not implemented. There is a
1558                  * tricky thing that IP's actual ras error type should be
1559                  * MULTI_UNCORRECTABLE, but as driver does not handle it, so
1560                  * ERROR_NONE make sense anyway.
1561                  */
1562                 amdgpu_ras_enable_all_features(adev, 1);
1563
1564                 /* We enable ras on all hw_supported block, but as boot
1565                  * parameter might disable some of them and one or more IP has
1566                  * not implemented yet. So we disable them on behalf.
1567                  */
1568                 list_for_each_entry_safe(obj, tmp, &con->head, node) {
1569                         if (!amdgpu_ras_is_supported(adev, obj->head.block)) {
1570                                 amdgpu_ras_feature_enable(adev, &obj->head, 0);
1571                                 /* there should be no any reference. */
1572                                 WARN_ON(alive_obj(obj));
1573                         }
1574                 }
1575         }
1576
1577         if (con->flags & AMDGPU_RAS_FLAG_INIT_NEED_RESET) {
1578                 con->flags &= ~AMDGPU_RAS_FLAG_INIT_NEED_RESET;
1579                 /* setup ras obj state as disabled.
1580                  * for init_by_vbios case.
1581                  * if we want to enable ras, just enable it in a normal way.
1582                  * If we want do disable it, need setup ras obj as enabled,
1583                  * then issue another TA disable cmd.
1584                  * See feature_enable_on_boot
1585                  */
1586                 amdgpu_ras_disable_all_features(adev, 1);
1587                 amdgpu_ras_reset_gpu(adev, 0);
1588         }
1589 }
1590
1591 void amdgpu_ras_suspend(struct amdgpu_device *adev)
1592 {
1593         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1594
1595         if (!con)
1596                 return;
1597
1598         amdgpu_ras_disable_all_features(adev, 0);
1599         /* Make sure all ras objects are disabled. */
1600         if (con->features)
1601                 amdgpu_ras_disable_all_features(adev, 1);
1602 }
1603
/* Do some fini work before IP fini as dependence.
 *
 * Without a ras context this is a no-op. Otherwise ras is disabled on all
 * IPs and the recovery state is torn down. Always returns 0.
 */
int amdgpu_ras_pre_fini(struct amdgpu_device *adev)
{
	struct amdgpu_ras *con = amdgpu_ras_get_context(adev);

	if (con) {
		/* Need disable ras on all IPs here before ip [hw/sw]fini */
		amdgpu_ras_disable_all_features(adev, 0);
		amdgpu_ras_recovery_fini(adev);
	}

	return 0;
}
1617
1618 int amdgpu_ras_fini(struct amdgpu_device *adev)
1619 {
1620         struct amdgpu_ras *con = amdgpu_ras_get_context(adev);
1621
1622         if (!con)
1623                 return 0;
1624
1625         amdgpu_ras_fs_fini(adev);
1626         amdgpu_ras_interrupt_remove_all(adev);
1627
1628         WARN(con->features, "Feature mask is not cleared");
1629
1630         if (con->features)
1631                 amdgpu_ras_disable_all_features(adev, 1);
1632
1633         amdgpu_ras_set_context(adev, NULL);
1634         kfree(con);
1635
1636         return 0;
1637 }