Merge branch 'drm-next-4.13' of git://people.freedesktop.org/~agd5f/linux into drm...
diff --git a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
index 404d127858535970b2ec0266f45992157e2ff7cb..142924212b4395a606b67f41b98e0bb2823eb216 100644
--- a/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
+++ b/drivers/gpu/drm/amd/amdgpu/gfx_v8_0.c
@@ -52,7 +52,7 @@
 #include "smu/smu_7_1_3_d.h"
 
 #define GFX8_NUM_GFX_RINGS     1
-#define GFX8_NUM_COMPUTE_RINGS 8
+#define GFX8_MEC_HPD_SIZE 2048
 
 #define TOPAZ_GB_ADDR_CONFIG_GOLDEN 0x22010001
 #define CARRIZO_GB_ADDR_CONFIG_GOLDEN 0x22010001
@@ -657,10 +657,8 @@ static void gfx_v8_0_set_gds_init(struct amdgpu_device *adev);
 static void gfx_v8_0_set_rlc_funcs(struct amdgpu_device *adev);
 static u32 gfx_v8_0_get_csb_size(struct amdgpu_device *adev);
 static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev);
-static void gfx_v8_0_ring_emit_ce_meta_init(struct amdgpu_ring *ring, uint64_t addr);
-static void gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring, uint64_t addr);
-static int gfx_v8_0_compute_mqd_sw_init(struct amdgpu_device *adev);
-static void gfx_v8_0_compute_mqd_sw_fini(struct amdgpu_device *adev);
+static void gfx_v8_0_ring_emit_ce_meta(struct amdgpu_ring *ring);
+static void gfx_v8_0_ring_emit_de_meta(struct amdgpu_ring *ring);
 
 static void gfx_v8_0_init_golden_registers(struct amdgpu_device *adev)
 {
@@ -859,7 +857,8 @@ err1:
 }
 
 
-static void gfx_v8_0_free_microcode(struct amdgpu_device *adev) {
+static void gfx_v8_0_free_microcode(struct amdgpu_device *adev)
+{
        release_firmware(adev->gfx.pfp_fw);
        adev->gfx.pfp_fw = NULL;
        release_firmware(adev->gfx.me_fw);
@@ -941,12 +940,6 @@ static int gfx_v8_0_init_microcode(struct amdgpu_device *adev)
        cp_hdr = (const struct gfx_firmware_header_v1_0 *)adev->gfx.me_fw->data;
        adev->gfx.me_fw_version = le32_to_cpu(cp_hdr->header.ucode_version);
 
-       /* chain ib ucode isn't formal released, just disable it by far
-        * TODO: when ucod ready we should use ucode version to judge if
-        * chain-ib support or not.
-        */
-       adev->virt.chained_ib_support = false;
-
        adev->gfx.me_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version);
 
        snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_ce.bin", chip_name);
@@ -960,6 +953,17 @@ static int gfx_v8_0_init_microcode(struct amdgpu_device *adev)
        adev->gfx.ce_fw_version = le32_to_cpu(cp_hdr->header.ucode_version);
        adev->gfx.ce_feature_version = le32_to_cpu(cp_hdr->ucode_feature_version);
 
+       /*
+        * Support for MCBP/virtualization in combination with chained IBs
+        * was formally released in firmware feature version #46.
+        */
+       if (adev->gfx.ce_feature_version >= 46 &&
+           adev->gfx.pfp_feature_version >= 46) {
+               adev->virt.chained_ib_support = true;
+               DRM_INFO("Chained IB support enabled!\n");
+       } else
+               adev->virt.chained_ib_support = false;
+
        snprintf(fw_name, sizeof(fw_name), "amdgpu/%s_rlc.bin", chip_name);
        err = request_firmware(&adev->gfx.rlc_fw, fw_name, adev->dev);
        if (err)
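
The feature-version gate above keys off values parsed from the CP firmware headers earlier in gfx_v8_0_init_microcode(). A minimal sketch of the same condition expressed as a predicate (hypothetical helper, shown only to make the gate explicit):

        /* hypothetical helper: chained IBs need CE and PFP feature version >= 46 */
        static bool gfx_v8_0_chained_ib_supported(struct amdgpu_device *adev)
        {
                return adev->gfx.ce_feature_version >= 46 &&
                       adev->gfx.pfp_feature_version >= 46;
        }
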
@@ -1373,64 +1377,22 @@ static void gfx_v8_0_mec_fini(struct amdgpu_device *adev)
        }
 }
 
-static int gfx_v8_0_kiq_init_ring(struct amdgpu_device *adev,
-                                 struct amdgpu_ring *ring,
-                                 struct amdgpu_irq_src *irq)
-{
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-       int r = 0;
-
-       r = amdgpu_wb_get(adev, &adev->virt.reg_val_offs);
-       if (r)
-               return r;
-
-       ring->adev = NULL;
-       ring->ring_obj = NULL;
-       ring->use_doorbell = true;
-       ring->doorbell_index = AMDGPU_DOORBELL_KIQ;
-       if (adev->gfx.mec2_fw) {
-               ring->me = 2;
-               ring->pipe = 0;
-       } else {
-               ring->me = 1;
-               ring->pipe = 1;
-       }
-
-       ring->queue = 0;
-       ring->eop_gpu_addr = kiq->eop_gpu_addr;
-       sprintf(ring->name, "kiq %d.%d.%d", ring->me, ring->pipe, ring->queue);
-       r = amdgpu_ring_init(adev, ring, 1024,
-                            irq, AMDGPU_CP_KIQ_IRQ_DRIVER0);
-       if (r)
-               dev_warn(adev->dev, "(%d) failed to init kiq ring\n", r);
-
-       return r;
-}
-static void gfx_v8_0_kiq_free_ring(struct amdgpu_ring *ring,
-                                  struct amdgpu_irq_src *irq)
-{
-       amdgpu_wb_free(ring->adev, ring->adev->virt.reg_val_offs);
-       amdgpu_ring_fini(ring);
-}
-
-#define MEC_HPD_SIZE 2048
-
 static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
 {
        int r;
        u32 *hpd;
+       size_t mec_hpd_size;
 
-       /*
-        * we assign only 1 pipe because all other pipes will
-        * be handled by KFD
-        */
-       adev->gfx.mec.num_mec = 1;
-       adev->gfx.mec.num_pipe = 1;
-       adev->gfx.mec.num_queue = adev->gfx.mec.num_mec * adev->gfx.mec.num_pipe * 8;
+       bitmap_zero(adev->gfx.mec.queue_bitmap, AMDGPU_MAX_COMPUTE_QUEUES);
+
+       /* take ownership of the relevant compute queues */
+       amdgpu_gfx_compute_queue_acquire(adev);
+
+       mec_hpd_size = adev->gfx.num_compute_rings * GFX8_MEC_HPD_SIZE;
 
        if (adev->gfx.mec.hpd_eop_obj == NULL) {
                r = amdgpu_bo_create(adev,
-                                    adev->gfx.mec.num_queue * MEC_HPD_SIZE,
+                                    mec_hpd_size,
                                     PAGE_SIZE, true,
                                     AMDGPU_GEM_DOMAIN_GTT, 0, NULL, NULL,
                                     &adev->gfx.mec.hpd_eop_obj);
@@ -1459,7 +1421,7 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
                return r;
        }
 
-       memset(hpd, 0, adev->gfx.mec.num_queue * MEC_HPD_SIZE);
+       memset(hpd, 0, mec_hpd_size);
 
        amdgpu_bo_kunmap(adev->gfx.mec.hpd_eop_obj);
        amdgpu_bo_unreserve(adev->gfx.mec.hpd_eop_obj);
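
The size math above stays small; a quick tally, assuming the common case of eight acquired compute rings (the ring count here is an illustrative assumption):

        size_t mec_hpd_size = 8 * GFX8_MEC_HPD_SIZE;    /* 8 * 2048 B = 16 KiB */
        /* amdgpu_bo_create() above requests PAGE_SIZE alignment, so the
         * GTT BO spans four 4 KiB pages on x86 */
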
@@ -1467,38 +1429,6 @@ static int gfx_v8_0_mec_init(struct amdgpu_device *adev)
        return 0;
 }
 
-static void gfx_v8_0_kiq_fini(struct amdgpu_device *adev)
-{
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
-       amdgpu_bo_free_kernel(&kiq->eop_obj, &kiq->eop_gpu_addr, NULL);
-}
-
-static int gfx_v8_0_kiq_init(struct amdgpu_device *adev)
-{
-       int r;
-       u32 *hpd;
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
-
-       r = amdgpu_bo_create_kernel(adev, MEC_HPD_SIZE, PAGE_SIZE,
-                                   AMDGPU_GEM_DOMAIN_GTT, &kiq->eop_obj,
-                                   &kiq->eop_gpu_addr, (void **)&hpd);
-       if (r) {
-               dev_warn(adev->dev, "failed to create KIQ bo (%d).\n", r);
-               return r;
-       }
-
-       memset(hpd, 0, MEC_HPD_SIZE);
-
-       r = amdgpu_bo_reserve(kiq->eop_obj, true);
-       if (unlikely(r != 0))
-               dev_warn(adev->dev, "(%d) reserve kiq eop bo failed\n", r);
-       amdgpu_bo_kunmap(kiq->eop_obj);
-       amdgpu_bo_unreserve(kiq->eop_obj);
-
-       return 0;
-}
-
 static const u32 vgpr_init_compute_shader[] =
 {
        0x7e000209, 0x7e020208,
@@ -1907,46 +1837,7 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev)
                adev->gfx.config.max_tile_pipes = 2;
                adev->gfx.config.max_sh_per_se = 1;
                adev->gfx.config.max_backends_per_se = 2;
-
-               switch (adev->pdev->revision) {
-               case 0xc4:
-               case 0x84:
-               case 0xc8:
-               case 0xcc:
-               case 0xe1:
-               case 0xe3:
-                       /* B10 */
-                       adev->gfx.config.max_cu_per_sh = 8;
-                       break;
-               case 0xc5:
-               case 0x81:
-               case 0x85:
-               case 0xc9:
-               case 0xcd:
-               case 0xe2:
-               case 0xe4:
-                       /* B8 */
-                       adev->gfx.config.max_cu_per_sh = 6;
-                       break;
-               case 0xc6:
-               case 0xca:
-               case 0xce:
-               case 0x88:
-               case 0xe6:
-                       /* B6 */
-                       adev->gfx.config.max_cu_per_sh = 6;
-                       break;
-               case 0xc7:
-               case 0x87:
-               case 0xcb:
-               case 0xe5:
-               case 0x89:
-               default:
-                       /* B4 */
-                       adev->gfx.config.max_cu_per_sh = 4;
-                       break;
-               }
-
+               adev->gfx.config.max_cu_per_sh = 8;
                adev->gfx.config.max_texture_channel_caches = 2;
                adev->gfx.config.max_gprs = 256;
                adev->gfx.config.max_gs_threads = 32;
@@ -1963,35 +1854,7 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev)
                adev->gfx.config.max_tile_pipes = 2;
                adev->gfx.config.max_sh_per_se = 1;
                adev->gfx.config.max_backends_per_se = 1;
-
-               switch (adev->pdev->revision) {
-               case 0x80:
-               case 0x81:
-               case 0xc0:
-               case 0xc1:
-               case 0xc2:
-               case 0xc4:
-               case 0xc8:
-               case 0xc9:
-               case 0xd6:
-               case 0xda:
-               case 0xe9:
-               case 0xea:
-                       adev->gfx.config.max_cu_per_sh = 3;
-                       break;
-               case 0x83:
-               case 0xd0:
-               case 0xd1:
-               case 0xd2:
-               case 0xd4:
-               case 0xdb:
-               case 0xe1:
-               case 0xe2:
-               default:
-                       adev->gfx.config.max_cu_per_sh = 2;
-                       break;
-               }
-
+               adev->gfx.config.max_cu_per_sh = 3;
                adev->gfx.config.max_texture_channel_caches = 2;
                adev->gfx.config.max_gprs = 256;
                adev->gfx.config.max_gs_threads = 16;
@@ -2083,13 +1946,67 @@ static int gfx_v8_0_gpu_early_init(struct amdgpu_device *adev)
        return 0;
 }
 
+static int gfx_v8_0_compute_ring_init(struct amdgpu_device *adev, int ring_id,
+                                       int mec, int pipe, int queue)
+{
+       int r;
+       unsigned irq_type;
+       struct amdgpu_ring *ring = &adev->gfx.compute_ring[ring_id];
+
+       /* mec0 is me1 */
+       ring->me = mec + 1;
+       ring->pipe = pipe;
+       ring->queue = queue;
+
+       ring->ring_obj = NULL;
+       ring->use_doorbell = true;
+       ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + ring_id;
+       ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr
+                               + (ring_id * GFX8_MEC_HPD_SIZE);
+       sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
+
+       irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
+               + ((ring->me - 1) * adev->gfx.mec.num_pipe_per_mec)
+               + ring->pipe;
+
+       /* type-2 packets are deprecated on MEC, use type-3 instead */
+       r = amdgpu_ring_init(adev, ring, 1024,
+                       &adev->gfx.eop_irq, irq_type);
+       if (r)
+               return r;
+
+       return 0;
+}
+
 static int gfx_v8_0_sw_init(void *handle)
 {
-       int i, r;
+       int i, j, k, r, ring_id;
        struct amdgpu_ring *ring;
        struct amdgpu_kiq *kiq;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
+       switch (adev->asic_type) {
+       case CHIP_FIJI:
+       case CHIP_TONGA:
+       case CHIP_POLARIS11:
+       case CHIP_POLARIS12:
+       case CHIP_POLARIS10:
+       case CHIP_CARRIZO:
+               adev->gfx.mec.num_mec = 2;
+               break;
+       case CHIP_TOPAZ:
+       case CHIP_STONEY:
+       default:
+               adev->gfx.mec.num_mec = 1;
+               break;
+       }
+
+       adev->gfx.mec.num_pipe_per_mec = 4;
+       adev->gfx.mec.num_queue_per_pipe = 8;
+
        /* KIQ event */
        r = amdgpu_irq_add_id(adev, AMDGPU_IH_CLIENTID_LEGACY, 178, &adev->gfx.kiq.irq);
        if (r)
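
gfx_v8_0_compute_ring_init() above derives the EOP interrupt source from the (me, pipe) pair; this works because the AMDGPU_CP_IRQ_COMPUTE_MEC*_PIPE*_EOP enumerators are laid out consecutively, pipe-major. A worked example under that layout:

        /* ring on the second MEC (ring->me == 2), pipe 3, num_pipe_per_mec == 4 */
        irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP
                + ((2 - 1) * 4) /* skip the four MEC1 pipe sources */
                + 3;            /* pipe within the MEC */
        /* resolves to AMDGPU_CP_IRQ_COMPUTE_MEC2_PIPE3_EOP */
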
@@ -2151,49 +2068,41 @@ static int gfx_v8_0_sw_init(void *handle)
                        return r;
        }
 
-       /* set up the compute queues */
-       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-               unsigned irq_type;
 
-               /* max 32 queues per MEC */
-               if ((i >= 32) || (i >= AMDGPU_MAX_COMPUTE_RINGS)) {
-                       DRM_ERROR("Too many (%d) compute rings!\n", i);
-                       break;
+       /* set up the compute queues - allocate horizontally across pipes */
+       ring_id = 0;
+       for (i = 0; i < adev->gfx.mec.num_mec; ++i) {
+               for (j = 0; j < adev->gfx.mec.num_queue_per_pipe; j++) {
+                       for (k = 0; k < adev->gfx.mec.num_pipe_per_mec; k++) {
+                               if (!amdgpu_gfx_is_mec_queue_enabled(adev, i, k, j))
+                                       continue;
+
+                               r = gfx_v8_0_compute_ring_init(adev,
+                                                               ring_id,
+                                                               i, k, j);
+                               if (r)
+                                       return r;
+
+                               ring_id++;
+                       }
                }
-               ring = &adev->gfx.compute_ring[i];
-               ring->ring_obj = NULL;
-               ring->use_doorbell = true;
-               ring->doorbell_index = AMDGPU_DOORBELL_MEC_RING0 + i;
-               ring->me = 1; /* first MEC */
-               ring->pipe = i / 8;
-               ring->queue = i % 8;
-               ring->eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (i * MEC_HPD_SIZE);
-               sprintf(ring->name, "comp_%d.%d.%d", ring->me, ring->pipe, ring->queue);
-               irq_type = AMDGPU_CP_IRQ_COMPUTE_MEC1_PIPE0_EOP + ring->pipe;
-               /* type-2 packets are deprecated on MEC, use type-3 instead */
-               r = amdgpu_ring_init(adev, ring, 1024, &adev->gfx.eop_irq,
-                                    irq_type);
-               if (r)
-                       return r;
        }
 
-       if (amdgpu_sriov_vf(adev)) {
-               r = gfx_v8_0_kiq_init(adev);
-               if (r) {
-                       DRM_ERROR("Failed to init KIQ BOs!\n");
-                       return r;
-               }
+       r = amdgpu_gfx_kiq_init(adev, GFX8_MEC_HPD_SIZE);
+       if (r) {
+               DRM_ERROR("Failed to init KIQ BOs!\n");
+               return r;
+       }
 
-               kiq = &adev->gfx.kiq;
-               r = gfx_v8_0_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
-               if (r)
-                       return r;
+       kiq = &adev->gfx.kiq;
+       r = amdgpu_gfx_kiq_init_ring(adev, &kiq->ring, &kiq->irq);
+       if (r)
+               return r;
 
-               /* create MQD for all compute queues as wel as KIQ for SRIOV case */
-               r = gfx_v8_0_compute_mqd_sw_init(adev);
-               if (r)
-                       return r;
-       }
+       /* create MQD for all compute queues as well as KIQ for SRIOV case */
+       r = amdgpu_gfx_compute_mqd_sw_init(adev, sizeof(struct vi_mqd));
+       if (r)
+               return r;
 
        /* reserve GDS, GWS and OA resource for gfx */
        r = amdgpu_bo_create_kernel(adev, adev->gds.mem.gfx_partition_size,
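
The loop nest above walks mec (i), then queue (j), then pipe (k), so consecutive ring ids land on different pipes before any pipe receives a second queue. Assuming every slot passes amdgpu_gfx_is_mec_queue_enabled(), the first assignments come out as:

        /*
         * ring 0 -> me 1, pipe 0, queue 0
         * ring 1 -> me 1, pipe 1, queue 0
         * ring 2 -> me 1, pipe 2, queue 0
         * ring 3 -> me 1, pipe 3, queue 0
         * ring 4 -> me 1, pipe 0, queue 1
         * ...
         */
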
@@ -2237,11 +2146,9 @@ static int gfx_v8_0_sw_fini(void *handle)
        for (i = 0; i < adev->gfx.num_compute_rings; i++)
                amdgpu_ring_fini(&adev->gfx.compute_ring[i]);
 
-       if (amdgpu_sriov_vf(adev)) {
-               gfx_v8_0_compute_mqd_sw_fini(adev);
-               gfx_v8_0_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
-               gfx_v8_0_kiq_fini(adev);
-       }
+       amdgpu_gfx_compute_mqd_sw_fini(adev);
+       amdgpu_gfx_kiq_free_ring(&adev->gfx.kiq.ring, &adev->gfx.kiq.irq);
+       amdgpu_gfx_kiq_fini(adev);
 
        gfx_v8_0_mec_fini(adev);
        gfx_v8_0_rlc_fini(adev);
@@ -3594,11 +3501,6 @@ static void gfx_v8_0_select_se_sh(struct amdgpu_device *adev,
        WREG32(mmGRBM_GFX_INDEX, data);
 }
 
-static u32 gfx_v8_0_create_bitmask(u32 bit_width)
-{
-       return (u32)((1ULL << bit_width) - 1);
-}
-
 static u32 gfx_v8_0_get_rb_active_bitmap(struct amdgpu_device *adev)
 {
        u32 data, mask;
@@ -3608,8 +3510,8 @@ static u32 gfx_v8_0_get_rb_active_bitmap(struct amdgpu_device *adev)
 
        data = REG_GET_FIELD(data, GC_USER_RB_BACKEND_DISABLE, BACKEND_DISABLE);
 
-       mask = gfx_v8_0_create_bitmask(adev->gfx.config.max_backends_per_se /
-                                      adev->gfx.config.max_sh_per_se);
+       mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_backends_per_se /
+                                        adev->gfx.config.max_sh_per_se);
 
        return (~data) & mask;
 }
@@ -3823,7 +3725,7 @@ static void gfx_v8_0_setup_rb(struct amdgpu_device *adev)
 /**
  * gfx_v8_0_init_compute_vmid - gart enable
  *
- * @rdev: amdgpu_device pointer
+ * @adev: amdgpu_device pointer
  *
  * Initialize compute vmid sh_mem registers
  *
@@ -4481,6 +4383,39 @@ static int gfx_v8_0_cp_gfx_start(struct amdgpu_device *adev)
 
        return 0;
 }
+
+static void gfx_v8_0_set_cpg_door_bell(struct amdgpu_device *adev,
+                                      struct amdgpu_ring *ring)
+{
+       u32 tmp;
+
+       /* no gfx doorbells on iceland */
+       if (adev->asic_type == CHIP_TOPAZ)
+               return;
+
+       tmp = RREG32(mmCP_RB_DOORBELL_CONTROL);
+
+       if (ring->use_doorbell) {
+               tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
+                               DOORBELL_OFFSET, ring->doorbell_index);
+               tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
+                                               DOORBELL_HIT, 0);
+               tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
+                                           DOORBELL_EN, 1);
+       } else {
+               tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL, DOORBELL_EN, 0);
+       }
+
+       WREG32(mmCP_RB_DOORBELL_CONTROL, tmp);
+
+       if (adev->flags & AMD_IS_APU)
+               return;
+
+       tmp = REG_SET_FIELD(0, CP_RB_DOORBELL_RANGE_LOWER,
+                                       DOORBELL_RANGE_LOWER,
+                                       AMDGPU_DOORBELL_GFX_RING0);
+       WREG32(mmCP_RB_DOORBELL_RANGE_LOWER, tmp);
+
+       WREG32(mmCP_RB_DOORBELL_RANGE_UPPER,
+               CP_RB_DOORBELL_RANGE_UPPER__DOORBELL_RANGE_UPPER_MASK);
+}
 
 static int gfx_v8_0_cp_gfx_resume(struct amdgpu_device *adev)
 {
@@ -4528,34 +4463,7 @@ static int gfx_v8_0_cp_gfx_resume(struct amdgpu_device *adev)
        WREG32(mmCP_RB0_BASE, rb_addr);
        WREG32(mmCP_RB0_BASE_HI, upper_32_bits(rb_addr));
 
-       /* no gfx doorbells on iceland */
-       if (adev->asic_type != CHIP_TOPAZ) {
-               tmp = RREG32(mmCP_RB_DOORBELL_CONTROL);
-               if (ring->use_doorbell) {
-                       tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
-                                           DOORBELL_OFFSET, ring->doorbell_index);
-                       tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
-                                           DOORBELL_HIT, 0);
-                       tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
-                                           DOORBELL_EN, 1);
-               } else {
-                       tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,
-                                           DOORBELL_EN, 0);
-               }
-               WREG32(mmCP_RB_DOORBELL_CONTROL, tmp);
-
-               if (adev->asic_type == CHIP_TONGA) {
-                       tmp = REG_SET_FIELD(0, CP_RB_DOORBELL_RANGE_LOWER,
-                                           DOORBELL_RANGE_LOWER,
-                                           AMDGPU_DOORBELL_GFX_RING0);
-                       WREG32(mmCP_RB_DOORBELL_RANGE_LOWER, tmp);
-
-                       WREG32(mmCP_RB_DOORBELL_RANGE_UPPER,
-                              CP_RB_DOORBELL_RANGE_UPPER__DOORBELL_RANGE_UPPER_MASK);
-               }
-
-       }
-
+       gfx_v8_0_set_cpg_door_bell(adev, ring);
+
        /* start the ring */
        amdgpu_ring_clear_ring(ring);
        gfx_v8_0_cp_gfx_start(adev);
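
The extracted helper keeps the file's usual read-modify-write idiom for doorbell programming; the pattern, in sketch form:

        u32 tmp = RREG32(mmCP_RB_DOORBELL_CONTROL);             /* read */
        tmp = REG_SET_FIELD(tmp, CP_RB_DOORBELL_CONTROL,        /* modify */
                            DOORBELL_OFFSET, ring->doorbell_index);
        WREG32(mmCP_RB_DOORBELL_CONTROL, tmp);                  /* write */
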
@@ -4628,29 +4536,6 @@ static int gfx_v8_0_cp_compute_load_microcode(struct amdgpu_device *adev)
        return 0;
 }
 
-static void gfx_v8_0_cp_compute_fini(struct amdgpu_device *adev)
-{
-       int i, r;
-
-       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-               struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
-
-               if (ring->mqd_obj) {
-                       r = amdgpu_bo_reserve(ring->mqd_obj, false);
-                       if (unlikely(r != 0))
-                               dev_warn(adev->dev, "(%d) reserve MQD bo failed\n", r);
-
-                       amdgpu_bo_unpin(ring->mqd_obj);
-                       amdgpu_bo_unreserve(ring->mqd_obj);
-
-                       amdgpu_bo_unref(&ring->mqd_obj);
-                       ring->mqd_obj = NULL;
-                       ring->mqd_ptr = NULL;
-                       ring->mqd_gpu_addr = 0;
-               }
-       }
-}
-
 /* KIQ functions */
 static void gfx_v8_0_kiq_setting(struct amdgpu_ring *ring)
 {
@@ -4666,45 +4551,161 @@ static void gfx_v8_0_kiq_setting(struct amdgpu_ring *ring)
        WREG32(mmRLC_CP_SCHEDULERS, tmp);
 }
 
-static void gfx_v8_0_kiq_enable(struct amdgpu_ring *ring)
+static int gfx_v8_0_kiq_kcq_enable(struct amdgpu_device *adev)
 {
-       amdgpu_ring_alloc(ring, 8);
+       struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
+       uint32_t scratch, tmp = 0;
+       uint64_t queue_mask = 0;
+       int r, i;
+
+       for (i = 0; i < AMDGPU_MAX_COMPUTE_QUEUES; ++i) {
+               if (!test_bit(i, adev->gfx.mec.queue_bitmap))
+                       continue;
+
+               /* This situation may be hit in the future if a new HW
+                * generation exposes more than 64 queues. If so, the
+                * definition of queue_mask needs updating */
+               if (WARN_ON(i >= (sizeof(queue_mask)*8))) {
+                       DRM_ERROR("Invalid KCQ enabled: %d\n", i);
+                       break;
+               }
+
+               queue_mask |= (1ull << i);
+       }
+
+       r = amdgpu_gfx_scratch_get(adev, &scratch);
+       if (r) {
+               DRM_ERROR("Failed to get scratch reg (%d).\n", r);
+               return r;
+       }
+       WREG32(scratch, 0xCAFEDEAD);
+
+       r = amdgpu_ring_alloc(kiq_ring, (8 * adev->gfx.num_compute_rings) + 11);
+       if (r) {
+               DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+               amdgpu_gfx_scratch_free(adev, scratch);
+               return r;
+       }
        /* set resources */
-       amdgpu_ring_write(ring, PACKET3(PACKET3_SET_RESOURCES, 6));
-       amdgpu_ring_write(ring, 0);     /* vmid_mask:0 queue_type:0 (KIQ) */
-       amdgpu_ring_write(ring, 0x000000FF);    /* queue mask lo */
-       amdgpu_ring_write(ring, 0);     /* queue mask hi */
-       amdgpu_ring_write(ring, 0);     /* gws mask lo */
-       amdgpu_ring_write(ring, 0);     /* gws mask hi */
-       amdgpu_ring_write(ring, 0);     /* oac mask */
-       amdgpu_ring_write(ring, 0);     /* gds heap base:0, gds heap size:0 */
-       amdgpu_ring_commit(ring);
-       udelay(50);
+       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_RESOURCES, 6));
+       amdgpu_ring_write(kiq_ring, 0); /* vmid_mask:0 queue_type:0 (KIQ) */
+       amdgpu_ring_write(kiq_ring, lower_32_bits(queue_mask)); /* queue mask lo */
+       amdgpu_ring_write(kiq_ring, upper_32_bits(queue_mask)); /* queue mask hi */
+       amdgpu_ring_write(kiq_ring, 0); /* gws mask lo */
+       amdgpu_ring_write(kiq_ring, 0); /* gws mask hi */
+       amdgpu_ring_write(kiq_ring, 0); /* oac mask */
+       amdgpu_ring_write(kiq_ring, 0); /* gds heap base:0, gds heap size:0 */
+       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
+               struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
+               uint64_t mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
+               uint64_t wptr_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4);
+
+               /* map queues */
+               amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
+               /* Q_sel: 0, vmid: 0, vidmem: 1, engine: 0, num_Q: 1 */
+               amdgpu_ring_write(kiq_ring,
+                                 PACKET3_MAP_QUEUES_NUM_QUEUES(1));
+               amdgpu_ring_write(kiq_ring,
+                                 PACKET3_MAP_QUEUES_DOORBELL_OFFSET(ring->doorbell_index) |
+                                 PACKET3_MAP_QUEUES_QUEUE(ring->queue) |
+                                 PACKET3_MAP_QUEUES_PIPE(ring->pipe) |
+                                 PACKET3_MAP_QUEUES_ME(ring->me == 1 ? 0 : 1)); /* doorbell */
+               amdgpu_ring_write(kiq_ring, lower_32_bits(mqd_addr));
+               amdgpu_ring_write(kiq_ring, upper_32_bits(mqd_addr));
+               amdgpu_ring_write(kiq_ring, lower_32_bits(wptr_addr));
+               amdgpu_ring_write(kiq_ring, upper_32_bits(wptr_addr));
+       }
+       /* write to scratch for completion */
+       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
+       amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
+       amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
+       amdgpu_ring_commit(kiq_ring);
+
+       for (i = 0; i < adev->usec_timeout; i++) {
+               tmp = RREG32(scratch);
+               if (tmp == 0xDEADBEEF)
+                       break;
+               DRM_UDELAY(1);
+       }
+       if (i >= adev->usec_timeout) {
+               DRM_ERROR("KCQ enable failed (scratch(0x%04X)=0x%08X)\n",
+                         scratch, tmp);
+               r = -EINVAL;
+       }
+       amdgpu_gfx_scratch_free(adev, scratch);
+
+       return r;
 }
 
-static void gfx_v8_0_map_queue_enable(struct amdgpu_ring *kiq_ring,
-                                  struct amdgpu_ring *ring)
+static int gfx_v8_0_kiq_kcq_disable(struct amdgpu_device *adev)
 {
-       struct amdgpu_device *adev = kiq_ring->adev;
-       uint64_t mqd_addr, wptr_addr;
+       struct amdgpu_ring *kiq_ring = &adev->gfx.kiq.ring;
+       uint32_t scratch, tmp = 0;
+       int r, i;
 
-       mqd_addr = amdgpu_bo_gpu_offset(ring->mqd_obj);
-       wptr_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4);
-       amdgpu_ring_alloc(kiq_ring, 8);
+       r = amdgpu_gfx_scratch_get(adev, &scratch);
+       if (r) {
+               DRM_ERROR("Failed to get scratch reg (%d).\n", r);
+               return r;
+       }
+       WREG32(scratch, 0xCAFEDEAD);
 
-       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_MAP_QUEUES, 5));
-       /* Q_sel:0, vmid:0, vidmem: 1, engine:0, num_Q:1*/
-       amdgpu_ring_write(kiq_ring, 0x21010000);
-       amdgpu_ring_write(kiq_ring, (ring->doorbell_index << 2) |
-                       (ring->queue << 26) |
-                       (ring->pipe << 29) |
-                       ((ring->me == 1 ? 0 : 1) << 31)); /* doorbell */
-       amdgpu_ring_write(kiq_ring, lower_32_bits(mqd_addr));
-       amdgpu_ring_write(kiq_ring, upper_32_bits(mqd_addr));
-       amdgpu_ring_write(kiq_ring, lower_32_bits(wptr_addr));
-       amdgpu_ring_write(kiq_ring, upper_32_bits(wptr_addr));
+       r = amdgpu_ring_alloc(kiq_ring, 6 + 3);
+       if (r) {
+               DRM_ERROR("Failed to lock KIQ (%d).\n", r);
+               amdgpu_gfx_scratch_free(adev, scratch);
+               return r;
+       }
+       /* unmap queues */
+       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_UNMAP_QUEUES, 4));
+       amdgpu_ring_write(kiq_ring,
+                         PACKET3_UNMAP_QUEUES_ACTION(1) | /* RESET_QUEUES */
+                         PACKET3_UNMAP_QUEUES_QUEUE_SEL(2)); /* select all queues */
+       amdgpu_ring_write(kiq_ring, 0);
+       amdgpu_ring_write(kiq_ring, 0);
+       amdgpu_ring_write(kiq_ring, 0);
+       amdgpu_ring_write(kiq_ring, 0);
+       /* write to scratch for completion */
+       amdgpu_ring_write(kiq_ring, PACKET3(PACKET3_SET_UCONFIG_REG, 1));
+       amdgpu_ring_write(kiq_ring, (scratch - PACKET3_SET_UCONFIG_REG_START));
+       amdgpu_ring_write(kiq_ring, 0xDEADBEEF);
        amdgpu_ring_commit(kiq_ring);
-       udelay(50);
+
+       for (i = 0; i < adev->usec_timeout; i++) {
+               tmp = RREG32(scratch);
+               if (tmp == 0xDEADBEEF)
+                       break;
+               DRM_UDELAY(1);
+       }
+       if (i >= adev->usec_timeout) {
+               DRM_ERROR("KCQ disable failed (scratch(0x%04X)=0x%08X)\n",
+                         scratch, tmp);
+               r = -EINVAL;
+       }
+       amdgpu_gfx_scratch_free(adev, scratch);
+
+       return r;
+}
+
+static int gfx_v8_0_deactivate_hqd(struct amdgpu_device *adev, u32 req)
+{
+       int i, r = 0;
+
+       if (RREG32(mmCP_HQD_ACTIVE) & CP_HQD_ACTIVE__ACTIVE_MASK) {
+               WREG32_FIELD(CP_HQD_DEQUEUE_REQUEST, DEQUEUE_REQ, req);
+               for (i = 0; i < adev->usec_timeout; i++) {
+                       if (!(RREG32(mmCP_HQD_ACTIVE) & CP_HQD_ACTIVE__ACTIVE_MASK))
+                               break;
+                       udelay(1);
+               }
+               if (i == adev->usec_timeout)
+                       r = -ETIMEDOUT;
+       }
+       WREG32(mmCP_HQD_DEQUEUE_REQUEST, 0);
+       WREG32(mmCP_HQD_PQ_RPTR, 0);
+       WREG32(mmCP_HQD_PQ_WPTR, 0);
+
+       return r;
 }
 
 static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring)
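
The ring-space budget in gfx_v8_0_kiq_kcq_enable() can be checked against the packets it actually emits (count the amdgpu_ring_write() calls above): SET_RESOURCES is 8 dwords, each MAP_QUEUES is 7, and the SET_UCONFIG_REG scratch write is 3, so the reservation is a safe upper bound with one spare dword per queue:

        /*
         * SET_RESOURCES:   1 header + 7 payload          =  8 dwords
         * MAP_QUEUES:      1 header + 6 payload, per KCQ =  7 dwords each
         * SET_UCONFIG_REG: 1 header + 2 payload          =  3 dwords
         * reserved: (8 * num_compute_rings) + 11
         *        >= emitted: (7 * num_compute_rings) + 11
         */
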
@@ -4714,6 +4715,9 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring)
        uint64_t hqd_gpu_addr, wb_gpu_addr, eop_base_addr;
        uint32_t tmp;
 
+       /* init the mqd struct */
+       memset(mqd, 0, sizeof(struct vi_mqd));
+
        mqd->header = 0xC0310800;
        mqd->compute_pipelinestat_enable = 0x00000001;
        mqd->compute_static_thread_mgmt_se0 = 0xffffffff;
@@ -4729,7 +4733,7 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring)
        /* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
        tmp = RREG32(mmCP_HQD_EOP_CONTROL);
        tmp = REG_SET_FIELD(tmp, CP_HQD_EOP_CONTROL, EOP_SIZE,
-                       (order_base_2(MEC_HPD_SIZE / 4) - 1));
+                       (order_base_2(GFX8_MEC_HPD_SIZE / 4) - 1));
 
        mqd->cp_hqd_eop_control = tmp;
 
@@ -4741,11 +4745,6 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring)
 
        mqd->cp_hqd_pq_doorbell_control = tmp;
 
-       /* disable the queue if it's active */
-       mqd->cp_hqd_dequeue_request = 0;
-       mqd->cp_hqd_pq_rptr = 0;
-       mqd->cp_hqd_pq_wptr = 0;
-
        /* set the pointer to the MQD */
        mqd->cp_mqd_base_addr_lo = ring->mqd_gpu_addr & 0xfffffffc;
        mqd->cp_mqd_base_addr_hi = upper_32_bits(ring->mqd_gpu_addr);
@@ -4815,149 +4814,170 @@ static int gfx_v8_0_mqd_init(struct amdgpu_ring *ring)
        tmp = REG_SET_FIELD(tmp, CP_HQD_PERSISTENT_STATE, PRELOAD_SIZE, 0x53);
        mqd->cp_hqd_persistent_state = tmp;
 
+       /* set MTYPE and MIN_IB_AVAIL_SIZE */
+       tmp = RREG32(mmCP_HQD_IB_CONTROL);
+       tmp = REG_SET_FIELD(tmp, CP_HQD_IB_CONTROL, MIN_IB_AVAIL_SIZE, 3);
+       tmp = REG_SET_FIELD(tmp, CP_HQD_IB_CONTROL, MTYPE, 3);
+       mqd->cp_hqd_ib_control = tmp;
+
+       tmp = RREG32(mmCP_HQD_IQ_TIMER);
+       tmp = REG_SET_FIELD(tmp, CP_HQD_IQ_TIMER, MTYPE, 3);
+       mqd->cp_hqd_iq_timer = tmp;
+
+       tmp = RREG32(mmCP_HQD_CTX_SAVE_CONTROL);
+       tmp = REG_SET_FIELD(tmp, CP_HQD_CTX_SAVE_CONTROL, MTYPE, 3);
+       mqd->cp_hqd_ctx_save_control = tmp;
+
+       /* defaults */
+       mqd->cp_hqd_eop_rptr = RREG32(mmCP_HQD_EOP_RPTR);
+       mqd->cp_hqd_eop_wptr = RREG32(mmCP_HQD_EOP_WPTR);
+       mqd->cp_hqd_pipe_priority = RREG32(mmCP_HQD_PIPE_PRIORITY);
+       mqd->cp_hqd_queue_priority = RREG32(mmCP_HQD_QUEUE_PRIORITY);
+       mqd->cp_hqd_quantum = RREG32(mmCP_HQD_QUANTUM);
+       mqd->cp_hqd_ctx_save_base_addr_lo = RREG32(mmCP_HQD_CTX_SAVE_BASE_ADDR_LO);
+       mqd->cp_hqd_ctx_save_base_addr_hi = RREG32(mmCP_HQD_CTX_SAVE_BASE_ADDR_HI);
+       mqd->cp_hqd_cntl_stack_offset = RREG32(mmCP_HQD_CNTL_STACK_OFFSET);
+       mqd->cp_hqd_cntl_stack_size = RREG32(mmCP_HQD_CNTL_STACK_SIZE);
+       mqd->cp_hqd_wg_state_offset = RREG32(mmCP_HQD_WG_STATE_OFFSET);
+       mqd->cp_hqd_ctx_save_size = RREG32(mmCP_HQD_CTX_SAVE_SIZE);
+       mqd->cp_hqd_eop_done_events = RREG32(mmCP_HQD_EOP_EVENTS);
+       mqd->cp_hqd_error = RREG32(mmCP_HQD_ERROR);
+       mqd->cp_hqd_eop_wptr_mem = RREG32(mmCP_HQD_EOP_WPTR_MEM);
+       mqd->cp_hqd_eop_dones = RREG32(mmCP_HQD_EOP_DONES);
+
        /* activate the queue */
        mqd->cp_hqd_active = 1;
 
        return 0;
 }
 
-static int gfx_v8_0_kiq_init_register(struct amdgpu_ring *ring)
+int gfx_v8_0_mqd_commit(struct amdgpu_device *adev,
+                       struct vi_mqd *mqd)
 {
-       struct amdgpu_device *adev = ring->adev;
-       struct vi_mqd *mqd = ring->mqd_ptr;
-       int j;
+       uint32_t mqd_reg;
+       uint32_t *mqd_data;
+
+       /* HQD registers extend from mmCP_MQD_BASE_ADDR to mmCP_HQD_ERROR */
+       mqd_data = &mqd->cp_mqd_base_addr_lo;
 
        /* disable wptr polling */
        WREG32_FIELD(CP_PQ_WPTR_POLL_CNTL, EN, 0);
 
-       WREG32(mmCP_HQD_EOP_BASE_ADDR, mqd->cp_hqd_eop_base_addr_lo);
-       WREG32(mmCP_HQD_EOP_BASE_ADDR_HI, mqd->cp_hqd_eop_base_addr_hi);
-
-       /* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
-       WREG32(mmCP_HQD_EOP_CONTROL, mqd->cp_hqd_eop_control);
-
-       /* enable doorbell? */
-       WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, mqd->cp_hqd_pq_doorbell_control);
+       /* program all HQD registers */
+       for (mqd_reg = mmCP_HQD_VMID; mqd_reg <= mmCP_HQD_EOP_CONTROL; mqd_reg++)
+               WREG32(mqd_reg, mqd_data[mqd_reg - mmCP_MQD_BASE_ADDR]);
 
-       /* disable the queue if it's active */
-       if (RREG32(mmCP_HQD_ACTIVE) & CP_HQD_ACTIVE__ACTIVE_MASK) {
-               WREG32(mmCP_HQD_DEQUEUE_REQUEST, 1);
-               for (j = 0; j < adev->usec_timeout; j++) {
-                       if (!(RREG32(mmCP_HQD_ACTIVE) & CP_HQD_ACTIVE__ACTIVE_MASK))
-                               break;
-                       udelay(1);
-               }
-               WREG32(mmCP_HQD_DEQUEUE_REQUEST, mqd->cp_hqd_dequeue_request);
-               WREG32(mmCP_HQD_PQ_RPTR, mqd->cp_hqd_pq_rptr);
-               WREG32(mmCP_HQD_PQ_WPTR, mqd->cp_hqd_pq_wptr);
+       /* Tonga errata: EOP RPTR/WPTR should be left unmodified.
+        * This is safe since EOP RPTR==WPTR for any inactive HQD
+        * on ASICs that do not support context-save.
+        * EOP writes/reads can start anywhere in the ring.
+        */
+       if (adev->asic_type != CHIP_TONGA) {
+               WREG32(mmCP_HQD_EOP_RPTR, mqd->cp_hqd_eop_rptr);
+               WREG32(mmCP_HQD_EOP_WPTR, mqd->cp_hqd_eop_wptr);
+               WREG32(mmCP_HQD_EOP_WPTR_MEM, mqd->cp_hqd_eop_wptr_mem);
        }
 
-       /* set the pointer to the MQD */
-       WREG32(mmCP_MQD_BASE_ADDR, mqd->cp_mqd_base_addr_lo);
-       WREG32(mmCP_MQD_BASE_ADDR_HI, mqd->cp_mqd_base_addr_hi);
+       for (mqd_reg = mmCP_HQD_EOP_EVENTS; mqd_reg <= mmCP_HQD_ERROR; mqd_reg++)
+               WREG32(mqd_reg, mqd_data[mqd_reg - mmCP_MQD_BASE_ADDR]);
 
-       /* set MQD vmid to 0 */
-       WREG32(mmCP_MQD_CONTROL, mqd->cp_mqd_control);
+       /* activate the HQD */
+       for (mqd_reg = mmCP_MQD_BASE_ADDR; mqd_reg <= mmCP_HQD_ACTIVE; mqd_reg++)
+               WREG32(mqd_reg, mqd_data[mqd_reg - mmCP_MQD_BASE_ADDR]);
 
-       /* set the pointer to the HQD, this is similar CP_RB0_BASE/_HI */
-       WREG32(mmCP_HQD_PQ_BASE, mqd->cp_hqd_pq_base_lo);
-       WREG32(mmCP_HQD_PQ_BASE_HI, mqd->cp_hqd_pq_base_hi);
+       return 0;
+}
 
-       /* set up the HQD, this is similar to CP_RB0_CNTL */
-       WREG32(mmCP_HQD_PQ_CONTROL, mqd->cp_hqd_pq_control);
+static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
+{
+       int r = 0;
+       struct amdgpu_device *adev = ring->adev;
+       struct vi_mqd *mqd = ring->mqd_ptr;
+       int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
 
-       /* set the wb address whether it's enabled or not */
-       WREG32(mmCP_HQD_PQ_RPTR_REPORT_ADDR,
-                               mqd->cp_hqd_pq_rptr_report_addr_lo);
-       WREG32(mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI,
-                               mqd->cp_hqd_pq_rptr_report_addr_hi);
+       gfx_v8_0_kiq_setting(ring);
 
-       /* only used if CP_PQ_WPTR_POLL_CNTL.CP_PQ_WPTR_POLL_CNTL__EN_MASK=1 */
-       WREG32(mmCP_HQD_PQ_WPTR_POLL_ADDR, mqd->cp_hqd_pq_wptr_poll_addr_lo);
-       WREG32(mmCP_HQD_PQ_WPTR_POLL_ADDR_HI, mqd->cp_hqd_pq_wptr_poll_addr_hi);
+       if (adev->gfx.in_reset) { /* for GPU_RESET case */
+               /* reset MQD to a clean status */
+               if (adev->gfx.mec.mqd_backup[mqd_idx])
+                       memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
 
-       /* enable the doorbell if requested */
-       if (ring->use_doorbell) {
-               if ((adev->asic_type == CHIP_CARRIZO) ||
-                               (adev->asic_type == CHIP_FIJI) ||
-                               (adev->asic_type == CHIP_STONEY)) {
-                       WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER,
-                                               AMDGPU_DOORBELL_KIQ << 2);
-                       WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER,
-                                               AMDGPU_DOORBELL_MEC_RING7 << 2);
+               /* reset ring buffer */
+               ring->wptr = 0;
+               amdgpu_ring_clear_ring(ring);
+               mutex_lock(&adev->srbm_mutex);
+               vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+               r = gfx_v8_0_deactivate_hqd(adev, 1);
+               if (r) {
+                       dev_err(adev->dev, "failed to deactivate ring %s\n", ring->name);
+                       goto out_unlock;
                }
-       }
-       WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, mqd->cp_hqd_pq_doorbell_control);
-
-       /* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
-       WREG32(mmCP_HQD_PQ_WPTR, mqd->cp_hqd_pq_wptr);
-
-       /* set the vmid for the queue */
-       WREG32(mmCP_HQD_VMID, mqd->cp_hqd_vmid);
-
-       WREG32(mmCP_HQD_PERSISTENT_STATE, mqd->cp_hqd_persistent_state);
+               gfx_v8_0_mqd_commit(adev, mqd);
+               vi_srbm_select(adev, 0, 0, 0, 0);
+               mutex_unlock(&adev->srbm_mutex);
+       } else {
+               mutex_lock(&adev->srbm_mutex);
+               vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+               gfx_v8_0_mqd_init(ring);
+               r = gfx_v8_0_deactivate_hqd(adev, 1);
+               if (r) {
+                       dev_err(adev->dev, "failed to deactivate ring %s\n", ring->name);
+                       goto out_unlock;
+               }
+               gfx_v8_0_mqd_commit(adev, mqd);
+               vi_srbm_select(adev, 0, 0, 0, 0);
+               mutex_unlock(&adev->srbm_mutex);
 
-       /* activate the queue */
-       WREG32(mmCP_HQD_ACTIVE, mqd->cp_hqd_active);
+               if (adev->gfx.mec.mqd_backup[mqd_idx])
+                       memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
+       }
 
-       if (ring->use_doorbell)
-               WREG32_FIELD(CP_PQ_STATUS, DOORBELL_ENABLE, 1);
+       return r;
 
-       return 0;
+out_unlock:
+       vi_srbm_select(adev, 0, 0, 0, 0);
+       mutex_unlock(&adev->srbm_mutex);
+       return r;
 }
 
-static int gfx_v8_0_kiq_init_queue(struct amdgpu_ring *ring)
+static int gfx_v8_0_kcq_init_queue(struct amdgpu_ring *ring)
 {
        struct amdgpu_device *adev = ring->adev;
-       struct amdgpu_kiq *kiq = &adev->gfx.kiq;
        struct vi_mqd *mqd = ring->mqd_ptr;
-       bool is_kiq = (ring->funcs->type == AMDGPU_RING_TYPE_KIQ);
-       int mqd_idx = AMDGPU_MAX_COMPUTE_RINGS;
-
-       if (is_kiq) {
-               gfx_v8_0_kiq_setting(&kiq->ring);
-       } else {
-               mqd_idx = ring - &adev->gfx.compute_ring[0];
-       }
+       int mqd_idx = ring - &adev->gfx.compute_ring[0];
 
-       if (!adev->gfx.in_reset) {
-               memset((void *)mqd, 0, sizeof(*mqd));
+       if (!adev->gfx.in_reset && !adev->gfx.in_suspend) {
                mutex_lock(&adev->srbm_mutex);
                vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
                gfx_v8_0_mqd_init(ring);
-               if (is_kiq)
-                       gfx_v8_0_kiq_init_register(ring);
                vi_srbm_select(adev, 0, 0, 0, 0);
                mutex_unlock(&adev->srbm_mutex);
 
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(adev->gfx.mec.mqd_backup[mqd_idx], mqd, sizeof(*mqd));
-       } else { /* for GPU_RESET case */
+       } else if (adev->gfx.in_reset) { /* for GPU_RESET case */
                /* reset MQD to a clean status */
                if (adev->gfx.mec.mqd_backup[mqd_idx])
                        memcpy(mqd, adev->gfx.mec.mqd_backup[mqd_idx], sizeof(*mqd));
-
                /* reset ring buffer */
                ring->wptr = 0;
                amdgpu_ring_clear_ring(ring);
-
-               if (is_kiq) {
-                   mutex_lock(&adev->srbm_mutex);
-                   vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-                   gfx_v8_0_kiq_init_register(ring);
-                   vi_srbm_select(adev, 0, 0, 0, 0);
-                   mutex_unlock(&adev->srbm_mutex);
-               }
+       } else {
+               amdgpu_ring_clear_ring(ring);
        }
-
-       if (is_kiq)
-               gfx_v8_0_kiq_enable(ring);
-       else
-               gfx_v8_0_map_queue_enable(&kiq->ring, ring);
-
        return 0;
 }
 
+static void gfx_v8_0_set_mec_doorbell_range(struct amdgpu_device *adev)
+{
+       if (adev->asic_type > CHIP_TONGA) {
+               WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER, AMDGPU_DOORBELL_KIQ << 2);
+               WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER, AMDGPU_DOORBELL_MEC_RING7 << 2);
+       }
+       /* enable doorbells */
+       WREG32_FIELD(CP_PQ_STATUS, DOORBELL_ENABLE, 1);
+}
+
 static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
 {
        struct amdgpu_ring *ring = NULL;
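
gfx_v8_0_mqd_commit() above relies on the vi_mqd layout mirroring the CP register block: the dword destined for a given register sits at the same offset from cp_mqd_base_addr_lo as that register has from mmCP_MQD_BASE_ADDR. A sketch of a single lookup under that layout assumption:

        uint32_t *mqd_data = &mqd->cp_mqd_base_addr_lo;

        /* the MQD slot for mmCP_HQD_PQ_CONTROL sits at the matching offset */
        WREG32(mmCP_HQD_PQ_CONTROL,
               mqd_data[mmCP_HQD_PQ_CONTROL - mmCP_MQD_BASE_ADDR]);
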
@@ -4981,13 +5001,6 @@ static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
        if (r)
                goto done;
 
-       ring->ready = true;
-       r = amdgpu_ring_test_ring(ring);
-       if (r) {
-               ring->ready = false;
-               goto done;
-       }
-
        for (i = 0; i < adev->gfx.num_compute_rings; i++) {
                ring = &adev->gfx.compute_ring[i];
 
@@ -4996,272 +5009,41 @@ static int gfx_v8_0_kiq_resume(struct amdgpu_device *adev)
                        goto done;
                r = amdgpu_bo_kmap(ring->mqd_obj, &ring->mqd_ptr);
                if (!r) {
-                       r = gfx_v8_0_kiq_init_queue(ring);
+                       r = gfx_v8_0_kcq_init_queue(ring);
                        amdgpu_bo_kunmap(ring->mqd_obj);
                        ring->mqd_ptr = NULL;
                }
                amdgpu_bo_unreserve(ring->mqd_obj);
                if (r)
                        goto done;
-
-               ring->ready = true;
-               r = amdgpu_ring_test_ring(ring);
-               if (r)
-                       ring->ready = false;
        }
 
-done:
-       return r;
-}
+       gfx_v8_0_set_mec_doorbell_range(adev);
 
-static int gfx_v8_0_cp_compute_resume(struct amdgpu_device *adev)
-{
-       int r, i, j;
-       u32 tmp;
-       bool use_doorbell = true;
-       u64 hqd_gpu_addr;
-       u64 mqd_gpu_addr;
-       u64 eop_gpu_addr;
-       u64 wb_gpu_addr;
-       u32 *buf;
-       struct vi_mqd *mqd;
-
-       /* init the queues.  */
-       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-               struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
-
-               if (ring->mqd_obj == NULL) {
-                       r = amdgpu_bo_create(adev,
-                                            sizeof(struct vi_mqd),
-                                            PAGE_SIZE, true,
-                                            AMDGPU_GEM_DOMAIN_GTT, 0, NULL,
-                                            NULL, &ring->mqd_obj);
-                       if (r) {
-                               dev_warn(adev->dev, "(%d) create MQD bo failed\n", r);
-                               return r;
-                       }
-               }
-
-               r = amdgpu_bo_reserve(ring->mqd_obj, false);
-               if (unlikely(r != 0)) {
-                       gfx_v8_0_cp_compute_fini(adev);
-                       return r;
-               }
-               r = amdgpu_bo_pin(ring->mqd_obj, AMDGPU_GEM_DOMAIN_GTT,
-                                 &mqd_gpu_addr);
-               if (r) {
-                       dev_warn(adev->dev, "(%d) pin MQD bo failed\n", r);
-                       gfx_v8_0_cp_compute_fini(adev);
-                       return r;
-               }
-               r = amdgpu_bo_kmap(ring->mqd_obj, (void **)&buf);
-               if (r) {
-                       dev_warn(adev->dev, "(%d) map MQD bo failed\n", r);
-                       gfx_v8_0_cp_compute_fini(adev);
-                       return r;
-               }
-
-               /* init the mqd struct */
-               memset(buf, 0, sizeof(struct vi_mqd));
-
-               mqd = (struct vi_mqd *)buf;
-               mqd->header = 0xC0310800;
-               mqd->compute_pipelinestat_enable = 0x00000001;
-               mqd->compute_static_thread_mgmt_se0 = 0xffffffff;
-               mqd->compute_static_thread_mgmt_se1 = 0xffffffff;
-               mqd->compute_static_thread_mgmt_se2 = 0xffffffff;
-               mqd->compute_static_thread_mgmt_se3 = 0xffffffff;
-               mqd->compute_misc_reserved = 0x00000003;
-
-               mutex_lock(&adev->srbm_mutex);
-               vi_srbm_select(adev, ring->me,
-                              ring->pipe,
-                              ring->queue, 0);
-
-               eop_gpu_addr = adev->gfx.mec.hpd_eop_gpu_addr + (i * MEC_HPD_SIZE);
-               eop_gpu_addr >>= 8;
-
-               /* write the EOP addr */
-               WREG32(mmCP_HQD_EOP_BASE_ADDR, eop_gpu_addr);
-               WREG32(mmCP_HQD_EOP_BASE_ADDR_HI, upper_32_bits(eop_gpu_addr));
-
-               /* set the VMID assigned */
-               WREG32(mmCP_HQD_VMID, 0);
-
-               /* set the EOP size, register value is 2^(EOP_SIZE+1) dwords */
-               tmp = RREG32(mmCP_HQD_EOP_CONTROL);
-               tmp = REG_SET_FIELD(tmp, CP_HQD_EOP_CONTROL, EOP_SIZE,
-                                   (order_base_2(MEC_HPD_SIZE / 4) - 1));
-               WREG32(mmCP_HQD_EOP_CONTROL, tmp);
-
-               /* disable wptr polling */
-               tmp = RREG32(mmCP_PQ_WPTR_POLL_CNTL);
-               tmp = REG_SET_FIELD(tmp, CP_PQ_WPTR_POLL_CNTL, EN, 0);
-               WREG32(mmCP_PQ_WPTR_POLL_CNTL, tmp);
-
-               mqd->cp_hqd_eop_base_addr_lo =
-                       RREG32(mmCP_HQD_EOP_BASE_ADDR);
-               mqd->cp_hqd_eop_base_addr_hi =
-                       RREG32(mmCP_HQD_EOP_BASE_ADDR_HI);
-
-               /* enable doorbell? */
-               tmp = RREG32(mmCP_HQD_PQ_DOORBELL_CONTROL);
-               if (use_doorbell) {
-                       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
-               } else {
-                       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 0);
-               }
-               WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL, tmp);
-               mqd->cp_hqd_pq_doorbell_control = tmp;
-
-               /* disable the queue if it's active */
-               mqd->cp_hqd_dequeue_request = 0;
-               mqd->cp_hqd_pq_rptr = 0;
-               mqd->cp_hqd_pq_wptr= 0;
-               if (RREG32(mmCP_HQD_ACTIVE) & 1) {
-                       WREG32(mmCP_HQD_DEQUEUE_REQUEST, 1);
-                       for (j = 0; j < adev->usec_timeout; j++) {
-                               if (!(RREG32(mmCP_HQD_ACTIVE) & 1))
-                                       break;
-                               udelay(1);
-                       }
-                       WREG32(mmCP_HQD_DEQUEUE_REQUEST, mqd->cp_hqd_dequeue_request);
-                       WREG32(mmCP_HQD_PQ_RPTR, mqd->cp_hqd_pq_rptr);
-                       WREG32(mmCP_HQD_PQ_WPTR, mqd->cp_hqd_pq_wptr);
-               }
-
-               /* set the pointer to the MQD */
-               mqd->cp_mqd_base_addr_lo = mqd_gpu_addr & 0xfffffffc;
-               mqd->cp_mqd_base_addr_hi = upper_32_bits(mqd_gpu_addr);
-               WREG32(mmCP_MQD_BASE_ADDR, mqd->cp_mqd_base_addr_lo);
-               WREG32(mmCP_MQD_BASE_ADDR_HI, mqd->cp_mqd_base_addr_hi);
-
-               /* set MQD vmid to 0 */
-               tmp = RREG32(mmCP_MQD_CONTROL);
-               tmp = REG_SET_FIELD(tmp, CP_MQD_CONTROL, VMID, 0);
-               WREG32(mmCP_MQD_CONTROL, tmp);
-               mqd->cp_mqd_control = tmp;
-
-               /* set the pointer to the HQD, this is similar CP_RB0_BASE/_HI */
-               hqd_gpu_addr = ring->gpu_addr >> 8;
-               mqd->cp_hqd_pq_base_lo = hqd_gpu_addr;
-               mqd->cp_hqd_pq_base_hi = upper_32_bits(hqd_gpu_addr);
-               WREG32(mmCP_HQD_PQ_BASE, mqd->cp_hqd_pq_base_lo);
-               WREG32(mmCP_HQD_PQ_BASE_HI, mqd->cp_hqd_pq_base_hi);
-
-               /* set up the HQD, this is similar to CP_RB0_CNTL */
-               tmp = RREG32(mmCP_HQD_PQ_CONTROL);
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, QUEUE_SIZE,
-                                   (order_base_2(ring->ring_size / 4) - 1));
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, RPTR_BLOCK_SIZE,
-                              ((order_base_2(AMDGPU_GPU_PAGE_SIZE / 4) - 1) << 8));
-#ifdef __BIG_ENDIAN
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ENDIAN_SWAP, 1);
-#endif
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, UNORD_DISPATCH, 0);
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, ROQ_PQ_IB_FLIP, 0);
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, PRIV_STATE, 1);
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_CONTROL, KMD_QUEUE, 1);
-               WREG32(mmCP_HQD_PQ_CONTROL, tmp);
-               mqd->cp_hqd_pq_control = tmp;
-
-               /* set the wb address wether it's enabled or not */
-               wb_gpu_addr = adev->wb.gpu_addr + (ring->rptr_offs * 4);
-               mqd->cp_hqd_pq_rptr_report_addr_lo = wb_gpu_addr & 0xfffffffc;
-               mqd->cp_hqd_pq_rptr_report_addr_hi =
-                       upper_32_bits(wb_gpu_addr) & 0xffff;
-               WREG32(mmCP_HQD_PQ_RPTR_REPORT_ADDR,
-                      mqd->cp_hqd_pq_rptr_report_addr_lo);
-               WREG32(mmCP_HQD_PQ_RPTR_REPORT_ADDR_HI,
-                      mqd->cp_hqd_pq_rptr_report_addr_hi);
-
-               /* only used if CP_PQ_WPTR_POLL_CNTL.CP_PQ_WPTR_POLL_CNTL__EN_MASK=1 */
-               wb_gpu_addr = adev->wb.gpu_addr + (ring->wptr_offs * 4);
-               mqd->cp_hqd_pq_wptr_poll_addr_lo = wb_gpu_addr & 0xfffffffc;
-               mqd->cp_hqd_pq_wptr_poll_addr_hi = upper_32_bits(wb_gpu_addr) & 0xffff;
-               WREG32(mmCP_HQD_PQ_WPTR_POLL_ADDR, mqd->cp_hqd_pq_wptr_poll_addr_lo);
-               WREG32(mmCP_HQD_PQ_WPTR_POLL_ADDR_HI,
-                      mqd->cp_hqd_pq_wptr_poll_addr_hi);
-
-               /* enable the doorbell if requested */
-               if (use_doorbell) {
-                       if ((adev->asic_type == CHIP_CARRIZO) ||
-                           (adev->asic_type == CHIP_FIJI) ||
-                           (adev->asic_type == CHIP_STONEY) ||
-                           (adev->asic_type == CHIP_POLARIS11) ||
-                           (adev->asic_type == CHIP_POLARIS10) ||
-                           (adev->asic_type == CHIP_POLARIS12)) {
-                               WREG32(mmCP_MEC_DOORBELL_RANGE_LOWER,
-                                      AMDGPU_DOORBELL_KIQ << 2);
-                               WREG32(mmCP_MEC_DOORBELL_RANGE_UPPER,
-                                      AMDGPU_DOORBELL_MEC_RING7 << 2);
-                       }
-                       tmp = RREG32(mmCP_HQD_PQ_DOORBELL_CONTROL);
-                       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL,
-                                           DOORBELL_OFFSET, ring->doorbell_index);
-                       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_EN, 1);
-                       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_SOURCE, 0);
-                       tmp = REG_SET_FIELD(tmp, CP_HQD_PQ_DOORBELL_CONTROL, DOORBELL_HIT, 0);
-                       mqd->cp_hqd_pq_doorbell_control = tmp;
-
-               } else {
-                       mqd->cp_hqd_pq_doorbell_control = 0;
-               }
-               WREG32(mmCP_HQD_PQ_DOORBELL_CONTROL,
-                      mqd->cp_hqd_pq_doorbell_control);
-
-               /* reset read and write pointers, similar to CP_RB0_WPTR/_RPTR */
-               ring->wptr = 0;
-               mqd->cp_hqd_pq_wptr = lower_32_bits(ring->wptr);
-               WREG32(mmCP_HQD_PQ_WPTR, mqd->cp_hqd_pq_wptr);
-               mqd->cp_hqd_pq_rptr = RREG32(mmCP_HQD_PQ_RPTR);
-
-               /* set the vmid for the queue */
-               mqd->cp_hqd_vmid = 0;
-               WREG32(mmCP_HQD_VMID, mqd->cp_hqd_vmid);
-
-               tmp = RREG32(mmCP_HQD_PERSISTENT_STATE);
-               tmp = REG_SET_FIELD(tmp, CP_HQD_PERSISTENT_STATE, PRELOAD_SIZE, 0x53);
-               WREG32(mmCP_HQD_PERSISTENT_STATE, tmp);
-               mqd->cp_hqd_persistent_state = tmp;
-               if (adev->asic_type == CHIP_STONEY ||
-                       adev->asic_type == CHIP_POLARIS11 ||
-                       adev->asic_type == CHIP_POLARIS10 ||
-                       adev->asic_type == CHIP_POLARIS12) {
-                       tmp = RREG32(mmCP_ME1_PIPE3_INT_CNTL);
-                       tmp = REG_SET_FIELD(tmp, CP_ME1_PIPE3_INT_CNTL, GENERIC2_INT_ENABLE, 1);
-                       WREG32(mmCP_ME1_PIPE3_INT_CNTL, tmp);
-               }
-
-               /* activate the queue */
-               mqd->cp_hqd_active = 1;
-               WREG32(mmCP_HQD_ACTIVE, mqd->cp_hqd_active);
-
-               vi_srbm_select(adev, 0, 0, 0, 0);
-               mutex_unlock(&adev->srbm_mutex);
-
-               amdgpu_bo_kunmap(ring->mqd_obj);
-               amdgpu_bo_unreserve(ring->mqd_obj);
-       }
+       r = gfx_v8_0_kiq_kcq_enable(adev);
+       if (r)
+               goto done;
 
-       if (use_doorbell) {
-               tmp = RREG32(mmCP_PQ_STATUS);
-               tmp = REG_SET_FIELD(tmp, CP_PQ_STATUS, DOORBELL_ENABLE, 1);
-               WREG32(mmCP_PQ_STATUS, tmp);
+       /* Test KIQ */
+       ring = &adev->gfx.kiq.ring;
+       ring->ready = true;
+       r = amdgpu_ring_test_ring(ring);
+       if (r) {
+               ring->ready = false;
+               goto done;
        }
 
-       gfx_v8_0_cp_compute_enable(adev, true);
-
+       /* Test KCQs */
        for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-               struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
-
+               ring = &adev->gfx.compute_ring[i];
                ring->ready = true;
                r = amdgpu_ring_test_ring(ring);
                if (r)
                        ring->ready = false;
        }
 
-       return 0;
+done:
+       return r;
 }
 
 static int gfx_v8_0_cp_resume(struct amdgpu_device *adev)
@@ -5314,10 +5096,7 @@ static int gfx_v8_0_cp_resume(struct amdgpu_device *adev)
        if (r)
                return r;
 
-       if (amdgpu_sriov_vf(adev))
-               r = gfx_v8_0_kiq_resume(adev);
-       else
-               r = gfx_v8_0_cp_compute_resume(adev);
+       r = gfx_v8_0_kiq_resume(adev);
        if (r)
                return r;
 
@@ -5359,9 +5138,9 @@ static int gfx_v8_0_hw_fini(void *handle)
                pr_debug("For SRIOV client, shouldn't do anything.\n");
                return 0;
        }
+       gfx_v8_0_kiq_kcq_disable(adev);
        gfx_v8_0_cp_enable(adev, false);
        gfx_v8_0_rlc_stop(adev);
-       gfx_v8_0_cp_compute_fini(adev);
 
        amdgpu_set_powergating_state(adev,
                        AMD_IP_BLOCK_TYPE_GFX, AMD_PG_STATE_UNGATE);
@@ -5372,15 +5151,18 @@ static int gfx_v8_0_hw_fini(void *handle)
 static int gfx_v8_0_suspend(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
-
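+       /* checked by the next hw_init so queue init restores the saved
+        * MQDs instead of starting from scratch
+        */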
+       adev->gfx.in_suspend = true;
        return gfx_v8_0_hw_fini(adev);
 }
 
 static int gfx_v8_0_resume(void *handle)
 {
+       int r;
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
-       return gfx_v8_0_hw_init(adev);
+       r = gfx_v8_0_hw_init(adev);
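+       /* only drop the flag once hw_init has consumed it */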
+       adev->gfx.in_suspend = false;
+       return r;
 }
 
 static bool gfx_v8_0_is_idle(void *handle)
@@ -5469,25 +5251,6 @@ static bool gfx_v8_0_check_soft_reset(void *handle)
        }
 }
 
-static void gfx_v8_0_inactive_hqd(struct amdgpu_device *adev,
-                                 struct amdgpu_ring *ring)
-{
-       int i;
-
-       mutex_lock(&adev->srbm_mutex);
-       vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-       if (RREG32(mmCP_HQD_ACTIVE) & CP_HQD_ACTIVE__ACTIVE_MASK) {
-               WREG32_FIELD(CP_HQD_DEQUEUE_REQUEST, DEQUEUE_REQ, 2);
-               for (i = 0; i < adev->usec_timeout; i++) {
-                       if (!(RREG32(mmCP_HQD_ACTIVE) & CP_HQD_ACTIVE__ACTIVE_MASK))
-                               break;
-                       udelay(1);
-               }
-       }
-       vi_srbm_select(adev, 0, 0, 0, 0);
-       mutex_unlock(&adev->srbm_mutex);
-}
-
 static int gfx_v8_0_pre_soft_reset(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -5517,7 +5280,11 @@ static int gfx_v8_0_pre_soft_reset(void *handle)
                for (i = 0; i < adev->gfx.num_compute_rings; i++) {
                        struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
 
-                       gfx_v8_0_inactive_hqd(adev, ring);
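+                       /* request a dequeue, then wait for the HQD to go idle */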
+                       mutex_lock(&adev->srbm_mutex);
+                       vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+                       gfx_v8_0_deactivate_hqd(adev, 2);
+                       vi_srbm_select(adev, 0, 0, 0, 0);
+                       mutex_unlock(&adev->srbm_mutex);
                }
                /* Disable MEC parsing/prefetching */
                gfx_v8_0_cp_compute_enable(adev, false);
@@ -5588,18 +5355,6 @@ static int gfx_v8_0_soft_reset(void *handle)
        return 0;
 }
 
-static void gfx_v8_0_init_hqd(struct amdgpu_device *adev,
-                             struct amdgpu_ring *ring)
-{
-       mutex_lock(&adev->srbm_mutex);
-       vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
-       WREG32(mmCP_HQD_DEQUEUE_REQUEST, 0);
-       WREG32(mmCP_HQD_PQ_RPTR, 0);
-       WREG32(mmCP_HQD_PQ_WPTR, 0);
-       vi_srbm_select(adev, 0, 0, 0, 0);
-       mutex_unlock(&adev->srbm_mutex);
-}
-
 static int gfx_v8_0_post_soft_reset(void *handle)
 {
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
@@ -5625,9 +5380,13 @@ static int gfx_v8_0_post_soft_reset(void *handle)
                for (i = 0; i < adev->gfx.num_compute_rings; i++) {
                        struct amdgpu_ring *ring = &adev->gfx.compute_ring[i];
 
-                       gfx_v8_0_init_hqd(adev, ring);
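+                       /* drain any stale HQD state before the KIQ remaps it */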
+                       mutex_lock(&adev->srbm_mutex);
+                       vi_srbm_select(adev, ring->me, ring->pipe, ring->queue, 0);
+                       gfx_v8_0_deactivate_hqd(adev, 2);
+                       vi_srbm_select(adev, 0, 0, 0, 0);
+                       mutex_unlock(&adev->srbm_mutex);
                }
-               gfx_v8_0_cp_compute_resume(adev);
+               gfx_v8_0_kiq_resume(adev);
        }
        gfx_v8_0_rlc_start(adev);
 
@@ -5773,7 +5532,7 @@ static int gfx_v8_0_early_init(void *handle)
        struct amdgpu_device *adev = (struct amdgpu_device *)handle;
 
        adev->gfx.num_gfx_rings = GFX8_NUM_GFX_RINGS;
-       adev->gfx.num_compute_rings = GFX8_NUM_COMPUTE_RINGS;
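+       /* rely on the common compute ring limit rather than a private count */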
+       adev->gfx.num_compute_rings = AMDGPU_MAX_COMPUTE_RINGS;
        adev->gfx.funcs = &gfx_v8_0_gfx_funcs;
        gfx_v8_0_set_ring_funcs(adev);
        gfx_v8_0_set_irq_funcs(adev);
@@ -6265,6 +6024,8 @@ static void gfx_v8_0_update_coarse_grain_clock_gating(struct amdgpu_device *adev
                          RLC_CGCG_CGLS_CTRL__CGLS_EN_MASK);
                if (temp != data)
                        WREG32(mmRLC_CGCG_CGLS_CTRL, data);
+               /* enable interrupts again for PG */
+               gfx_v8_0_enable_gui_idle_interrupt(adev, true);
        }
 
        gfx_v8_0_wait_for_rlc_serdes(adev);
@@ -6568,9 +6329,13 @@ static void gfx_v8_0_ring_emit_ib_gfx(struct amdgpu_ring *ring,
 
        control |= ib->length_dw | (vm_id << 24);
 
-       if (amdgpu_sriov_vf(ring->adev) && ib->flags & AMDGPU_IB_FLAG_PREEMPT)
+       if (amdgpu_sriov_vf(ring->adev) && (ib->flags & AMDGPU_IB_FLAG_PREEMPT)) {
                control |= INDIRECT_BUFFER_PRE_ENB(1);
 
+               if (!(ib->flags & AMDGPU_IB_FLAG_CE))
+                       gfx_v8_0_ring_emit_de_meta(ring);
+       }
+
        amdgpu_ring_write(ring, header);
        amdgpu_ring_write(ring,
 #ifdef __BIG_ENDIAN
@@ -6753,8 +6518,7 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
        uint32_t dw2 = 0;
 
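+       /* under SRIOV the CE meta data goes out before CONTEXT_CONTROL so
+        * preempted CE state can be saved to the CSA
+        */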
        if (amdgpu_sriov_vf(ring->adev))
-               gfx_v8_0_ring_emit_ce_meta_init(ring,
-                       (flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
+               gfx_v8_0_ring_emit_ce_meta(ring);
 
        dw2 |= 0x80000000; /* set load_enable otherwise this package is just NOPs */
        if (flags & AMDGPU_HAVE_CTX_SWITCH) {
@@ -6780,10 +6544,6 @@ static void gfx_v8_ring_emit_cntxcntl(struct amdgpu_ring *ring, uint32_t flags)
        amdgpu_ring_write(ring, PACKET3(PACKET3_CONTEXT_CONTROL, 1));
        amdgpu_ring_write(ring, dw2);
        amdgpu_ring_write(ring, 0);
-
-       if (amdgpu_sriov_vf(ring->adev))
-               gfx_v8_0_ring_emit_de_meta_init(ring,
-                       (flags & AMDGPU_VM_DOMAIN) ? AMDGPU_CSA_VADDR : ring->adev->virt.csa_vmid0_addr);
 }
 
 static unsigned gfx_v8_0_ring_emit_init_cond_exec(struct amdgpu_ring *ring)
@@ -6813,7 +6573,6 @@ static void gfx_v8_0_ring_emit_patch_cond_exec(struct amdgpu_ring *ring, unsigne
                ring->ring[offset] = (ring->ring_size >> 2) - offset + cur;
 }
 
-
 static void gfx_v8_0_ring_emit_rreg(struct amdgpu_ring *ring, uint32_t reg)
 {
        struct amdgpu_device *adev = ring->adev;
@@ -6851,15 +6610,27 @@ static void gfx_v8_0_set_compute_eop_interrupt_state(struct amdgpu_device *adev,
                                                     int me, int pipe,
                                                     enum amdgpu_interrupt_state state)
 {
+       u32 mec_int_cntl, mec_int_cntl_reg;
+
        /*
-        * amdgpu controls only pipe 0 of MEC1. That's why this function only
-        * handles the setting of interrupts for this specific pipe. All other
+        * amdgpu controls only the first MEC, so this function configures
+        * interrupts for that MEC's pipes alone. All other
         * pipes' interrupts are set by amdkfd.
         */
 
        if (me == 1) {
                switch (pipe) {
                case 0:
+                       mec_int_cntl_reg = mmCP_ME1_PIPE0_INT_CNTL;
+                       break;
+               case 1:
+                       mec_int_cntl_reg = mmCP_ME1_PIPE1_INT_CNTL;
+                       break;
+               case 2:
+                       mec_int_cntl_reg = mmCP_ME1_PIPE2_INT_CNTL;
+                       break;
+               case 3:
+                       mec_int_cntl_reg = mmCP_ME1_PIPE3_INT_CNTL;
                        break;
                default:
                        DRM_DEBUG("invalid pipe %d\n", pipe);
@@ -6870,8 +6641,20 @@ static void gfx_v8_0_set_compute_eop_interrupt_state(struct amdgpu_device *adev,
                return;
        }
 
-       WREG32_FIELD(CP_ME1_PIPE0_INT_CNTL, TIME_STAMP_INT_ENABLE,
-                    state == AMDGPU_IRQ_STATE_DISABLE ? 0 : 1);
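+       /* read-modify-write so only the EOP timestamp enable bit of the
+        * selected pipe's INT_CNTL changes
+        */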
+       switch (state) {
+       case AMDGPU_IRQ_STATE_DISABLE:
+               mec_int_cntl = RREG32(mec_int_cntl_reg);
+               mec_int_cntl &= ~CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK;
+               WREG32(mec_int_cntl_reg, mec_int_cntl);
+               break;
+       case AMDGPU_IRQ_STATE_ENABLE:
+               mec_int_cntl = RREG32(mec_int_cntl_reg);
+               mec_int_cntl |= CP_INT_CNTL_RING0__TIME_STAMP_INT_ENABLE_MASK;
+               WREG32(mec_int_cntl_reg, mec_int_cntl);
+               break;
+       default:
+               break;
+       }
 }
 
 static int gfx_v8_0_set_priv_reg_fault_state(struct amdgpu_device *adev,
@@ -6992,8 +6775,6 @@ static int gfx_v8_0_kiq_set_interrupt_state(struct amdgpu_device *adev,
 {
        struct amdgpu_ring *ring = &(adev->gfx.kiq.ring);
 
-       BUG_ON(ring->funcs->type != AMDGPU_RING_TYPE_KIQ);
-
        switch (type) {
        case AMDGPU_CP_KIQ_IRQ_DRIVER0:
                WREG32_FIELD(CPC_INT_CNTL, GENERIC2_INT_ENABLE,
@@ -7023,8 +6804,6 @@ static int gfx_v8_0_kiq_irq(struct amdgpu_device *adev,
        u8 me_id, pipe_id, queue_id;
        struct amdgpu_ring *ring = &(adev->gfx.kiq.ring);
 
-       BUG_ON(ring->funcs->type != AMDGPU_RING_TYPE_KIQ);
-
        me_id = (entry->ring_id & 0x0c) >> 2;
        pipe_id = (entry->ring_id & 0x03) >> 0;
        queue_id = (entry->ring_id & 0x70) >> 4;
@@ -7257,7 +7036,7 @@ static u32 gfx_v8_0_get_cu_active_bitmap(struct amdgpu_device *adev)
        data =  RREG32(mmCC_GC_SHADER_ARRAY_CONFIG) |
                RREG32(mmGC_USER_SHADER_ARRAY_CONFIG);
 
-       mask = gfx_v8_0_create_bitmask(adev->gfx.config.max_cu_per_sh);
+       mask = amdgpu_gfx_create_bitmask(adev->gfx.config.max_cu_per_sh);
 
        return ~REG_GET_FIELD(data, CC_GC_SHADER_ARRAY_CONFIG, INACTIVE_CUS) & mask;
 }
@@ -7268,9 +7047,15 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev)
        u32 mask, bitmap, ao_bitmap, ao_cu_mask = 0;
        struct amdgpu_cu_info *cu_info = &adev->gfx.cu_info;
        unsigned disable_masks[4 * 2];
+       u32 ao_cu_num;
 
        memset(cu_info, 0, sizeof(*cu_info));
 
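+       /* APUs keep only the first 2 CUs of each SH always on; dGPUs can
+        * count every active CU toward the always-on mask
+        */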
+       if (adev->flags & AMD_IS_APU)
+               ao_cu_num = 2;
+       else
+               ao_cu_num = adev->gfx.config.max_cu_per_sh;
+
        amdgpu_gfx_parse_disable_cu(disable_masks, 4, 2);
 
        mutex_lock(&adev->grbm_idx_mutex);
@@ -7286,9 +7071,9 @@ static void gfx_v8_0_get_cu_info(struct amdgpu_device *adev)
                        bitmap = gfx_v8_0_get_cu_active_bitmap(adev);
                        cu_info->bitmap[i][j] = bitmap;
 
-                       for (k = 0; k < 16; k ++) {
+                       for (k = 0; k < adev->gfx.config.max_cu_per_sh; k ++) {
                                if (bitmap & mask) {
-                                       if (counter < 2)
+                                       if (counter < ao_cu_num)
                                                ao_bitmap |= mask;
                                        counter ++;
                                }
@@ -7323,7 +7108,7 @@ const struct amdgpu_ip_block_version gfx_v8_1_ip_block =
        .funcs = &gfx_v8_0_ip_funcs,
 };
 
-static void gfx_v8_0_ring_emit_ce_meta_init(struct amdgpu_ring *ring, uint64_t csa_addr)
+static void gfx_v8_0_ring_emit_ce_meta(struct amdgpu_ring *ring)
 {
        uint64_t ce_payload_addr;
        int cnt_ce;
@@ -7333,10 +7118,12 @@ static void gfx_v8_0_ring_emit_ce_meta_init(struct amdgpu_ring *ring, uint64_t c
        } ce_payload = {};
 
        if (ring->adev->virt.chained_ib_support) {
-               ce_payload_addr = csa_addr + offsetof(struct vi_gfx_meta_data_chained_ib, ce_payload);
+               ce_payload_addr = AMDGPU_VA_RESERVED_SIZE - 2 * 4096 +
+                                                 offsetof(struct vi_gfx_meta_data_chained_ib, ce_payload);
                cnt_ce = (sizeof(ce_payload.chained) >> 2) + 4 - 2;
        } else {
-               ce_payload_addr = csa_addr + offsetof(struct vi_gfx_meta_data, ce_payload);
+               ce_payload_addr = AMDGPU_VA_RESERVED_SIZE - 2 * 4096 +
+                                                 offsetof(struct vi_gfx_meta_data, ce_payload);
                cnt_ce = (sizeof(ce_payload.regular) >> 2) + 4 - 2;
        }
 
@@ -7350,15 +7137,16 @@ static void gfx_v8_0_ring_emit_ce_meta_init(struct amdgpu_ring *ring, uint64_t c
        amdgpu_ring_write_multiple(ring, (void *)&ce_payload, cnt_ce - 2);
 }
 
-static void gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring, uint64_t csa_addr)
+static void gfx_v8_0_ring_emit_de_meta(struct amdgpu_ring *ring)
 {
-       uint64_t de_payload_addr, gds_addr;
+       uint64_t de_payload_addr, gds_addr, csa_addr;
        int cnt_de;
        static union {
                struct vi_de_ib_state regular;
                struct vi_de_ib_state_chained_ib chained;
        } de_payload = {};
 
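+       /* the CSA lives in the last two pages of the reserved VA space:
+        * meta data first, GDS backup in the page after it
+        */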
+       csa_addr = AMDGPU_VA_RESERVED_SIZE - 2 * 4096;
        gds_addr = csa_addr + 4096;
        if (ring->adev->virt.chained_ib_support) {
                de_payload.chained.gds_backup_addrlo = lower_32_bits(gds_addr);
@@ -7381,68 +7169,3 @@ static void gfx_v8_0_ring_emit_de_meta_init(struct amdgpu_ring *ring, uint64_t c
        amdgpu_ring_write(ring, upper_32_bits(de_payload_addr));
        amdgpu_ring_write_multiple(ring, (void *)&de_payload, cnt_de - 2);
 }
-
-/* create MQD for each compute queue */
-static int gfx_v8_0_compute_mqd_sw_init(struct amdgpu_device *adev)
-{
-       struct amdgpu_ring *ring = NULL;
-       int r, i;
-
-       /* create MQD for KIQ */
-       ring = &adev->gfx.kiq.ring;
-       if (!ring->mqd_obj) {
-               r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd), PAGE_SIZE,
-                                           AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
-                                           &ring->mqd_gpu_addr, &ring->mqd_ptr);
-               if (r) {
-                       dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
-                       return r;
-               }
-
-               /* prepare MQD backup */
-               adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS] = kmalloc(sizeof(struct vi_mqd), GFP_KERNEL);
-               if (!adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS])
-                               dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
-       }
-
-       /* create MQD for each KCQ */
-       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-               ring = &adev->gfx.compute_ring[i];
-               if (!ring->mqd_obj) {
-                       r = amdgpu_bo_create_kernel(adev, sizeof(struct vi_mqd), PAGE_SIZE,
-                                                   AMDGPU_GEM_DOMAIN_GTT, &ring->mqd_obj,
-                                                   &ring->mqd_gpu_addr, &ring->mqd_ptr);
-                       if (r) {
-                               dev_warn(adev->dev, "failed to create ring mqd ob (%d)", r);
-                               return r;
-                       }
-
-                       /* prepare MQD backup */
-                       adev->gfx.mec.mqd_backup[i] = kmalloc(sizeof(struct vi_mqd), GFP_KERNEL);
-                       if (!adev->gfx.mec.mqd_backup[i])
-                               dev_warn(adev->dev, "no memory to create MQD backup for ring %s\n", ring->name);
-               }
-       }
-
-       return 0;
-}
-
-static void gfx_v8_0_compute_mqd_sw_fini(struct amdgpu_device *adev)
-{
-       struct amdgpu_ring *ring = NULL;
-       int i;
-
-       for (i = 0; i < adev->gfx.num_compute_rings; i++) {
-               ring = &adev->gfx.compute_ring[i];
-               kfree(adev->gfx.mec.mqd_backup[i]);
-               amdgpu_bo_free_kernel(&ring->mqd_obj,
-                                     &ring->mqd_gpu_addr,
-                                     &ring->mqd_ptr);
-       }
-
-       ring = &adev->gfx.kiq.ring;
-       kfree(adev->gfx.mec.mqd_backup[AMDGPU_MAX_COMPUTE_RINGS]);
-       amdgpu_bo_free_kernel(&ring->mqd_obj,
-                             &ring->mqd_gpu_addr,
-                             &ring->mqd_ptr);
-}