Skip to content

Commit

Permalink
Fixed errors in OpenCL on CPU (openmm#4358)
Browse files Browse the repository at this point in the history
  • Loading branch information
peastman authored Dec 20, 2023
1 parent e854f8f commit abadc82
Show file tree
Hide file tree
Showing 4 changed files with 70 additions and 52 deletions.
2 changes: 1 addition & 1 deletion platforms/opencl/src/OpenCLNonbondedUtilities.cpp
Original file line number Diff line number Diff line change
Expand Up @@ -83,7 +83,7 @@ OpenCLNonbondedUtilities::OpenCLNonbondedUtilities(OpenCLContext& context) : con
// us from sorting atom blocks by size, which leads to a slightly less efficient neighbor
// list. We guess based on system size which will be faster.

useLargeBlocks = (context.getNumAtoms() > 100000);
useLargeBlocks = (!deviceIsCpu && context.getNumAtoms() > 100000);

std::string vendor = context.getDevice().getInfo<CL_DEVICE_VENDOR>();
isAMD = !deviceIsCpu && ((vendor.size() >= 3 && vendor.substr(0, 3) == "AMD") || (vendor.size() >= 28 && vendor.substr(0, 28) == "Advanced Micro Devices, Inc."));
Expand Down
1 change: 1 addition & 0 deletions platforms/opencl/src/kernels/findInteractingBlocks.cl
Original file line number Diff line number Diff line change
Expand Up @@ -66,6 +66,7 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
if (get_global_id(0) == 0)
rebuildNeighborList[0] = 0;
}

__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
// Find the total range of sizes recorded by all blocks.

Expand Down
77 changes: 68 additions & 9 deletions platforms/opencl/src/kernels/findInteractingBlocks_cpu.cl
Original file line number Diff line number Diff line change
Expand Up @@ -7,9 +7,10 @@
*/
__kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global const real4* restrict posq, __global real4* restrict blockCenter, __global real4* restrict blockBoundingBox, __global int* restrict rebuildNeighborList,
__global real2* restrict sortedBlocks) {
__global real2* restrict blockSizeRange) {
int index = get_global_id(0);
int base = index*TILE_SIZE;
real minSize = 1e38, maxSize = 0;
while (base < numAtoms) {
real4 pos = posq[base];
#ifdef USE_PERIODIC
Expand All @@ -28,20 +29,79 @@ __kernel void findBlockBounds(int numAtoms, real4 periodicBoxSize, real4 invPeri
maxPos = max(maxPos, pos);
}
real4 blockSize = 0.5f*(maxPos-minPos);
real4 center = 0.5f*(maxPos+minPos);
center.w = 0;
for (int i = base; i < last; i++) {
pos = posq[i];
real4 delta = posq[i]-center;
#ifdef USE_PERIODIC
APPLY_PERIODIC_TO_DELTA(delta)
#endif
center.w = max(center.w, delta.x*delta.x+delta.y*delta.y+delta.z*delta.z);
}
center.w = sqrt(center.w);
blockBoundingBox[index] = blockSize;
blockCenter[index] = 0.5f*(maxPos+minPos);
sortedBlocks[index] = (real2) (blockSize.x+blockSize.y+blockSize.z, index);
blockCenter[index] = center;
real totalSize = blockSize.x+blockSize.y+blockSize.z;
minSize = min(minSize, totalSize);
maxSize = max(maxSize, totalSize);
index += get_global_size(0);
base = index*TILE_SIZE;
}

// Record the range of sizes seen by threads in this block.

__local real minBuffer[64], maxBuffer[64];
minBuffer[get_local_id(0)] = minSize;
maxBuffer[get_local_id(0)] = maxSize;
barrier(CLK_LOCAL_MEM_FENCE);
for (int step = 1; step < 64; step *= 2) {
if (get_local_id(0)+step < 64 && get_local_id(0)%(2*step) == 0) {
minBuffer[get_local_id(0)] = min(minBuffer[get_local_id(0)], minBuffer[get_local_id(0)+step]);
maxBuffer[get_local_id(0)] = max(maxBuffer[get_local_id(0)], maxBuffer[get_local_id(0)+step]);
}
barrier(CLK_LOCAL_MEM_FENCE);
}
if (get_local_id(0) == 0)
blockSizeRange[get_group_id(0)] = make_real2(minBuffer[0], maxBuffer[0]);
if (get_global_id(0) == 0)
rebuildNeighborList[0] = 0;
}

__kernel void computeSortKeys(__global const real4* restrict blockBoundingBox, __global unsigned int* restrict sortedBlocks, __global real2* restrict blockSizeRange, int numSizes) {
// Find the total range of sizes recorded by all blocks.

__local real2 sizeRange;
if (get_local_id(0) == 0) {
sizeRange = blockSizeRange[0];
for (int i = 1; i < numSizes; i++) {
real2 size = blockSizeRange[i];
sizeRange.x = min(sizeRange.x, size.x);
sizeRange.y = max(sizeRange.y, size.y);
}
sizeRange.x = LOG(sizeRange.x);
sizeRange.y = LOG(sizeRange.y);
}
barrier(CLK_LOCAL_MEM_FENCE);

// Sort keys store the bin in the high order part and the block in the low
// order part.

int numSizeBins = 20;
real scale = numSizeBins/(sizeRange.y-sizeRange.x);
for (unsigned int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
real4 box = blockBoundingBox[i];
real size = LOG(box.x+box.y+box.z);
int bin = (size-sizeRange.x)*scale;
bin = max(0, min(bin, numSizeBins-1));
sortedBlocks[i] = (((unsigned int) bin)<<BIN_SHIFT) + i;
}
}

/**
* Sort the data about bounding boxes so it can be accessed more efficiently in the next kernel.
*/
__kernel void sortBoxData(__global const real2* restrict sortedBlock, __global const real4* restrict blockCenter,
__kernel void sortBoxData(__global const unsigned int* restrict sortedBlocks, __global const real4* restrict blockCenter,
__global const real4* restrict blockBoundingBox, __global real4* restrict sortedBlockCenter,
__global real4* restrict sortedBlockBoundingBox, __global const real4* restrict posq, __global const real4* restrict oldPositions,
__global unsigned int* restrict interactionCount, __global int* restrict rebuildNeighborList, int forceRebuild
Expand All @@ -51,7 +111,7 @@ __kernel void sortBoxData(__global const real2* restrict sortedBlock, __global c
#endif
) {
for (int i = get_global_id(0); i < NUM_BLOCKS; i += get_global_size(0)) {
int index = (int) sortedBlock[i].y;
unsigned int index = sortedBlocks[i] & BLOCK_INDEX_MASK;
sortedBlockCenter[i] = blockCenter[index];
sortedBlockBoundingBox[i] = blockBoundingBox[index];
}
Expand Down Expand Up @@ -166,7 +226,7 @@ void storeInteractionData(int x, int* buffer, int* atoms, int* numAtoms, int num
*/
__kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodicBoxSize, real4 periodicBoxVecX, real4 periodicBoxVecY, real4 periodicBoxVecZ,
__global unsigned int* restrict interactionCount, __global int* restrict interactingTiles, __global unsigned int* restrict interactingAtoms,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global real2* restrict sortedBlocks,
__global const real4* restrict posq, unsigned int maxTiles, unsigned int startBlockIndex, unsigned int numBlocks, __global unsigned int* restrict sortedBlocks,
__global const real4* restrict sortedBlockCenter, __global const real4* restrict sortedBlockBoundingBox,
__global const unsigned int* restrict exclusionIndices, __global const unsigned int* restrict exclusionRowIndices, __global real4* restrict oldPositions,
__global const int* restrict rebuildNeighborList
Expand All @@ -187,8 +247,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi
for (int i = startBlockIndex+get_group_id(0); i < startBlockIndex+numBlocks; i += get_num_groups(0)) {
valuesInBuffer = 0;
numAtoms = 0;
real2 sortedKey = sortedBlocks[i];
int x = (int) sortedKey.y;
int x = sortedBlocks[i] & BLOCK_INDEX_MASK;
real4 blockCenterX = sortedBlockCenter[i];
real4 blockSizeX = sortedBlockBoundingBox[i];

Expand All @@ -204,7 +263,7 @@ __kernel void findBlocksWithInteractions(real4 periodicBoxSize, real4 invPeriodi

for (int j = i+1; j < NUM_BLOCKS; j++) {
real2 sortedKey2 = sortedBlocks[j];
int y = (int) sortedKey2.y;
int y = sortedBlocks[j] & BLOCK_INDEX_MASK;
bool hasExclusions = false;
for (int k = 0; k < numExclusions; k++)
hasExclusions |= (exclusionsForX[k] == y);
Expand Down
42 changes: 0 additions & 42 deletions platforms/opencl/src/kernels/nonbonded_cpu.cl
Original file line number Diff line number Diff line change
Expand Up @@ -10,11 +10,7 @@ typedef struct {
*/

__kernel void computeNonbonded(
#ifdef SUPPORTS_64_BIT_ATOMICS
__global long* restrict forceBuffers,
#else
__global real4* restrict forceBuffers,
#endif
__global mixed* restrict energyBuffer, __global const real4* restrict posq, __global const unsigned int* restrict exclusions,
__global const int2* restrict exclusionTiles, unsigned int startTileIndex, unsigned long numTileIndices
#ifdef USE_CUTOFF
Expand Down Expand Up @@ -102,14 +98,9 @@ __kernel void computeNonbonded(

// Write results.

#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
}
}
else {
Expand Down Expand Up @@ -178,32 +169,18 @@ __kernel void computeNonbonded(

// Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
}

// Write results.

for (int tgx = 0; tgx < TILE_SIZE; tgx++) {
#ifdef SUPPORTS_64_BIT_ATOMICS
unsigned int offset = y*TILE_SIZE + tgx;
ATOMIC_ADD(&forceBuffers[offset], (mm_ulong) realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[offset+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[offset+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else
unsigned int offset = y*TILE_SIZE+tgx + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
#endif
}
}
}
Expand Down Expand Up @@ -337,14 +314,9 @@ __kernel void computeNonbonded(

// Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
}
}
else
Expand Down Expand Up @@ -404,14 +376,9 @@ __kernel void computeNonbonded(

// Write results for atom1.

#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom1], (mm_ulong) realToFixedPoint(force.x));
ATOMIC_ADD(&forceBuffers[atom1+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.y));
ATOMIC_ADD(&forceBuffers[atom1+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(force.z));
#else
unsigned int offset = atom1 + get_group_id(0)*PADDED_NUM_ATOMS;
forceBuffers[offset].xyz = forceBuffers[offset].xyz+force.xyz;
#endif
}
}

Expand All @@ -424,18 +391,9 @@ __kernel void computeNonbonded(
unsigned int atom2 = y*TILE_SIZE + tgx;
#endif
if (atom2 < PADDED_NUM_ATOMS) {
#ifdef SUPPORTS_64_BIT_ATOMICS
ATOMIC_ADD(&forceBuffers[atom2], (mm_ulong) realToFixedPoint(localData[tgx].fx));
ATOMIC_ADD(&forceBuffers[atom2+PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fy));
ATOMIC_ADD(&forceBuffers[atom2+2*PADDED_NUM_ATOMS], (mm_ulong) realToFixedPoint(localData[tgx].fz));
#else
unsigned int offset = atom2 + get_group_id(0)*PADDED_NUM_ATOMS;
real4 f = forceBuffers[offset];
f.x += localData[tgx].fx;
f.y += localData[tgx].fy;
f.z += localData[tgx].fz;
forceBuffers[offset] = f;
#endif
}
}
}
Expand Down

0 comments on commit abadc82

Please sign in to comment.