- cu_per_se[se] += hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
-
- /* Symmetrically map cu_mask to all SEs:
- * cu_mask[0] bit0 -> se_mask[0] bit0;
- * cu_mask[0] bit1 -> se_mask[1] bit0;
- * ... (if # SE is 4)
- * cu_mask[0] bit4 -> se_mask[0] bit1;
+ cu_per_sh[se][sh] = hweight32(cu_info.cu_bitmap[se % 4][sh + (se / 4)]);
+
+ /* Symmetrically map cu_mask to all SEs & SHs:
+ * se_mask programs up to 2 SH in the upper and lower 16 bits.
+ *
+ * Examples
+ * Assuming 1 SH/SE, 4 SEs:
+ * cu_mask[0] bit0 -> se_mask[0] bit0
+ * cu_mask[0] bit1 -> se_mask[1] bit0
+ * ...
+ * cu_mask[0] bit4 -> se_mask[0] bit1
+ * ...
+ *
+ * Assuming 2 SH/SE, 4 SEs
+ * cu_mask[0] bit0 -> se_mask[0] bit0 (SE0,SH0,CU0)
+ * cu_mask[0] bit1 -> se_mask[1] bit0 (SE1,SH0,CU0)
+ * ...
+ * cu_mask[0] bit4 -> se_mask[0] bit16 (SE0,SH1,CU0)
+ * cu_mask[0] bit5 -> se_mask[1] bit16 (SE1,SH1,CU0)
+ * ...
+ * cu_mask[0] bit8 -> se_mask[0] bit1 (SE0,SH0,CU1)