[FIX]: Fix releasing device set on host thread during multigpu call (#501)

## Describe the changes

This PR fixes an issue when `RunOnDevice` is called for multi-gpu while
other goroutines calling device operations are run outside of
`RunOnDevice`. The issue comes from setting a device other than the
default device (device 0) on a host thread within `RunOnDevice` and not
unsetting that host threads device when `RunOnDevice` finishes.

When `RunOnDevice` locks a host thread to ensure that all other calls in
the go routine are on the same device, it never unsets that thread’s
device. Once the thread is unlocked, other go routines can get scheduled
to it but it still has the device set to whatever it was before while it
was locked so its possible that the following sequence happens:

1. NTT domain is initialized on thread 2 via a goroutine on device 0
2. MSM multiGPU test runs and is locked on thread 3 setting its device
to 1
3. Other tests run concurrently on threads other than 3 (since it is
locked)
4. MSM multiGPU test finishes and release thread 3 back to the pool but
its device is still 1
5. NTT test runs and is assigned to thread 3 --> this will fail because
the thread’s device wasn’t released back

We really only want to set a thread's device while the thread is locked.
But once we unlock a thread, it’s device should return to whatever it
was set at originally. In theory, it should always be 0 if `SetDevice`
is never used outside of `RunOnDevice` - which it shouldn’t be in most 
situations
This commit is contained in:
Jeremy Felder
2024-05-08 14:07:29 +03:00
committed by GitHub
parent a56435d2e8
commit 14997566ff
12 changed files with 33 additions and 88 deletions

View File

@@ -34,7 +34,7 @@ jobs:
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
build-curves-linux:
name: Build curves on Linux
name: Build and test curves on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -60,19 +60,18 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} # builds a single curve with G2 and ECNTT enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single curve with the curve's specified build args
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }}
- name: Test
working-directory: ./wrappers/golang/curves
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.curve.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_curve_${{ matrix.curve.name }}.a
icicle/build/lib/libingo_field_${{ matrix.curve.name }}.a
retention-days: 1
run: |
CURVE=$(echo ${{ matrix.curve.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./$CURVE/tests -count=1 -failfast -p 2 -timeout 60m -v
build-fields-linux:
name: Build fields on Linux
name: Build and test fields on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -90,18 +89,18 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} # builds a single field with field-ext enabled
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single field with the fields specified build args
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }}
- name: Test
working-directory: ./wrappers/golang/fields
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.field.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_field_${{ matrix.field.name }}.a
retention-days: 1
run: |
FIELD=$(echo ${{ matrix.field.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./$FIELD/tests -count=1 -failfast -p 2 -timeout 60m -v
build-hashes-linux:
name: Build hashes on Linux
name: Build and test hashes on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, check-format]
strategy:
@@ -119,41 +118,15 @@ jobs:
- name: Build
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }} # builds a single hash algorithm
- name: Upload ICICLE lib artifacts
uses: actions/upload-artifact@v4
# builds a single hash algorithm with the hash's specified build args
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }}
- name: Test
working-directory: ./wrappers/golang/hash
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
name: icicle-builds-${{ matrix.hash.name }}-${{ github.workflow }}-${{ github.sha }}
path: |
icicle/build/lib/libingo_hash.a
retention-days: 1
test-linux:
name: Test on Linux
runs-on: [self-hosted, Linux, X64, icicle]
needs: [check-changed-files, build-curves-linux, build-fields-linux, build-hashes-linux]
steps:
- name: Checkout Repo
uses: actions/checkout@v4
- name: Setup go
uses: actions/setup-go@v5
with:
go-version: '1.20.0'
- name: Download ICICLE lib artifacts
uses: actions/download-artifact@v4
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
with:
path: ./icicle/build/lib
merge-multiple: true
- name: Run Tests
working-directory: ./wrappers/golang
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
# -count ensures the test results are not cached
# -p controls the number of programs that can be run in parallel
run: |
HASH=$(echo ${{ matrix.hash.name }} | sed -e 's/_//g')
export CPATH=$CPATH:/usr/local/cuda/include
go test ./... -count=1 -failfast -p 2 -timeout 60m
go test ./$HASH/tests -count=1 -failfast -p 2 -timeout 60m -v
# TODO: bw6 on windows requires more memory than the standard runner has
# Add a large runner and then enable this job