A fix is available
APAR status
Closed as program error.
Error description
When using the __popcnt8() builtin, there is a performance concern that needed to be addressed via this APAR. The following test case demonstrates the issue: ===== COMPILE COMMAND: /bgsys/drivers/ppcfloor/comm/xl/bin/mpixlcxx_r -O5 -qsmp=omp -qthreaded test.c -I/bgsys/drivers/V1R2M2/ppc64 ===== TESTCASE: % cat test.c #include <stdio.h> #include <stdlib.h> #include <stdio.h> #include <inttypes.h> #include "hwi/include/bqc/A2_inlines.h" #define size (128*1024*1024) volatile int64_t value[size]; inline int popcnt8(uint64_t data) { int ldz; asm volatile ("popcntd %0, %1\n\t" : "=r" (ldz) : "r" (data) ); return ldz; } char BitsSetTable256[256]; int popcount(long long b) { b = (b & 0x5555555555555555LU) + (b >> 1 & 0x5555555555555555LU); b = (b & 0x3333333333333333LU) + (b >> 2 & 0x3333333333333333LU); b = b + (b >> 4) & 0x0F0F0F0F0F0F0F0FLU; b = b + (b >> 8); b = b + (b >> 16); b = b + (b >> 32) & 0x0000007F; return (int) b; } int popcount_table(long long v) { int c = BitsSetTable256[v & 0xff] + BitsSetTable256[(v >> 8) & 0xff] + BitsSetTable256[(v >> 16) & 0xff] + BitsSetTable256[v >> 24]; return c; } int counttz(long long v) { int c; if (v) { v = (v ^
(v - 1)) >> 1; // Set v's trailing 0s to 1s and zero rest for (c = 0; v; c++) v >>= 1; } else c = 8 * sizeof(v); } int main(int argc, char * argv[]) { BitsSetTable256[0] = 0; for (int i = 0; i < 256; i++) BitsSetTable256[i] = (i & 1) + BitsSetTable256[i / 2]; uint64_t t2, t1, software_cycles, hardware_instruction_cycles; uint64_t thr_count[64]; int nthreads; #pragma omp parallel for for(int i=0;i<size;i++) value[i] = i; t1 = GetTimeBase(); #pragma omp parallel { int i, tid = omp_get_thread_num(); if(tid ==0) nthreads = omp_get_num_threads(); uint64_t count = 0; thr_count[tid] = 0; #pragma omp for for(i=0;i<size;i++) count += popcount_table(value[i]); thr_count[tid] = count; } t2 = GetTimeBase(); software_cycles = t2 - t1; for(int i = 1;i<nthreads;i++) thr_count[0] += thr_count[i]; printf("Software popcnt cycles: %llu, count %llu\n",software_cycles, thr_count[0]); t1 = GetTimeBase(); #pragma omp parallel { int i, tid = omp_get_thread_num(); uint64_t count = 0; thr_count[tid] = 0; #pragma omp for for(i=0;i<size;i++) count += __popcnt8(value[i]); thr_count[tid] = count; } t2 = GetTimeBase(); hardware_instruction_cycles = t2 - t1; for(int i = 1;i<nthreads;i++) thr_count[0] += thr_count[i]; printf("hardware_instruction popcnt cycles: %llu, count %llu\n", hardware_instruction_cycles, thr_count[0]); } % ===== ACTUAL OUTPUT: % /bgsys/drivers/ppcfloor/hlcs/bin/runjob --exe a.out --block R00-M0-N04 --np 1 -p 1 Software popcnt cycles: 115842638, count 1811939328 hardware_instruction popcnt cycles: 242737020, count 1811939328 %
Local fix
N/A
Problem summary
USERS AFFECTED: Users who use the __popcnt8() builtin in their application and call it many times within a loop may be affected by this issue. PROBLEM DESCRIPTION: The popcnt hardware ops are slow on BG/Q.
Problem conclusion
The compiler should not use the popcnt hardware ops on BG/Q. It has been changed to use a software emulation instead, which improves performance.
Temporary fix
Comments
APAR Information
APAR number
LI79792
Reported component name
XL C/C++ FOR BG
Reported component ID
5799AG100
Reported release
C10
Status
CLOSED PER
PE
NoPE
HIPER
NoHIPER
Special Attention
NoSpecatt / Xsystem
Submitted date
2017-12-21
Closed date
2017-12-21
Last modified date
2017-12-21
APAR is sysrouted FROM one or more of the following:
APAR is sysrouted TO one or more of the following:
Fix information
Fixed component name
XL C/C++ FOR BG
Fixed component ID
5799AG100
Applicable component levels
[{"Business Unit":{"code":"BU048","label":"IBM Software"},"Product":{"code":"SS2LWA","label":"XL C\/C++ for Blue Gene\/Q"},"Component":"","ARM Category":[],"Platform":[{"code":"PF025","label":"Platform Independent"}],"Version":"12.1","Edition":"","Line of Business":{"code":"LOB73","label":"Power TPS"}}]
Document Information
Modified date:
05 September 2024