mirror of
https://github.com/pseXperiments/icicle.git
synced 2026-01-08 20:48:06 -05:00
[FIX]: Fix releasing device set on host thread during multigpu call (#501)
## Describe the changes This PR fixes an issue when `RunOnDevice` is called for multi-gpu while other goroutines calling device operations are run outside of `RunOnDevice`. The issue comes from setting a device other than the default device (device 0) on a host thread within `RunOnDevice` and not unsetting that host threads device when `RunOnDevice` finishes. When `RunOnDevice` locks a host thread to ensure that all other calls in the go routine are on the same device, it never unsets that thread’s device. Once the thread is unlocked, other go routines can get scheduled to it but it still has the device set to whatever it was before while it was locked so its possible that the following sequence happens: 1. NTT domain is initialized on thread 2 via a goroutine on device 0 2. MSM multiGPU test runs and is locked on thread 3 setting its device to 1 3. Other tests run concurrently on threads other than 3 (since it is locked) 4. MSM multiGPU test finishes and release thread 3 back to the pool but its device is still 1 5. NTT test runs and is assigned to thread 3 --> this will fail because the thread’s device wasn’t released back We really only want to set a thread's device while the thread is locked. But once we unlock a thread, it’s device should return to whatever it was set at originally. In theory, it should always be 0 if `SetDevice` is never used outside of `RunOnDevice` - which it shouldn’t be in most situations
This commit is contained in:
77
.github/workflows/golang.yml
vendored
77
.github/workflows/golang.yml
vendored
@@ -34,7 +34,7 @@ jobs:
|
||||
run: if [[ $(go list ./... | xargs go fmt) ]]; then echo "Please run go fmt"; exit 1; fi
|
||||
|
||||
build-curves-linux:
|
||||
name: Build curves on Linux
|
||||
name: Build and test curves on Linux
|
||||
runs-on: [self-hosted, Linux, X64, icicle]
|
||||
needs: [check-changed-files, check-format]
|
||||
strategy:
|
||||
@@ -60,19 +60,18 @@ jobs:
|
||||
- name: Build
|
||||
working-directory: ./wrappers/golang
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }} # builds a single curve with G2 and ECNTT enabled
|
||||
- name: Upload ICICLE lib artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
# builds a single curve with the curve's specified build args
|
||||
run: ./build.sh -curve=${{ matrix.curve.name }} ${{ matrix.curve.build_args }}
|
||||
- name: Test
|
||||
working-directory: ./wrappers/golang/curves
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
with:
|
||||
name: icicle-builds-${{ matrix.curve.name }}-${{ github.workflow }}-${{ github.sha }}
|
||||
path: |
|
||||
icicle/build/lib/libingo_curve_${{ matrix.curve.name }}.a
|
||||
icicle/build/lib/libingo_field_${{ matrix.curve.name }}.a
|
||||
retention-days: 1
|
||||
run: |
|
||||
CURVE=$(echo ${{ matrix.curve.name }} | sed -e 's/_//g')
|
||||
export CPATH=$CPATH:/usr/local/cuda/include
|
||||
go test ./$CURVE/tests -count=1 -failfast -p 2 -timeout 60m -v
|
||||
|
||||
build-fields-linux:
|
||||
name: Build fields on Linux
|
||||
name: Build and test fields on Linux
|
||||
runs-on: [self-hosted, Linux, X64, icicle]
|
||||
needs: [check-changed-files, check-format]
|
||||
strategy:
|
||||
@@ -90,18 +89,18 @@ jobs:
|
||||
- name: Build
|
||||
working-directory: ./wrappers/golang
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }} # builds a single field with field-ext enabled
|
||||
- name: Upload ICICLE lib artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
# builds a single field with the fields specified build args
|
||||
run: ./build.sh -field=${{ matrix.field.name }} ${{ matrix.field.build_args }}
|
||||
- name: Test
|
||||
working-directory: ./wrappers/golang/fields
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
with:
|
||||
name: icicle-builds-${{ matrix.field.name }}-${{ github.workflow }}-${{ github.sha }}
|
||||
path: |
|
||||
icicle/build/lib/libingo_field_${{ matrix.field.name }}.a
|
||||
retention-days: 1
|
||||
run: |
|
||||
FIELD=$(echo ${{ matrix.field.name }} | sed -e 's/_//g')
|
||||
export CPATH=$CPATH:/usr/local/cuda/include
|
||||
go test ./$FIELD/tests -count=1 -failfast -p 2 -timeout 60m -v
|
||||
|
||||
build-hashes-linux:
|
||||
name: Build hashes on Linux
|
||||
name: Build and test hashes on Linux
|
||||
runs-on: [self-hosted, Linux, X64, icicle]
|
||||
needs: [check-changed-files, check-format]
|
||||
strategy:
|
||||
@@ -119,41 +118,15 @@ jobs:
|
||||
- name: Build
|
||||
working-directory: ./wrappers/golang
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }} # builds a single hash algorithm
|
||||
- name: Upload ICICLE lib artifacts
|
||||
uses: actions/upload-artifact@v4
|
||||
# builds a single hash algorithm with the hash's specified build args
|
||||
run: ./build.sh -hash=${{ matrix.hash.name }} ${{ matrix.hash.build_args }}
|
||||
- name: Test
|
||||
working-directory: ./wrappers/golang/hash
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
with:
|
||||
name: icicle-builds-${{ matrix.hash.name }}-${{ github.workflow }}-${{ github.sha }}
|
||||
path: |
|
||||
icicle/build/lib/libingo_hash.a
|
||||
retention-days: 1
|
||||
|
||||
test-linux:
|
||||
name: Test on Linux
|
||||
runs-on: [self-hosted, Linux, X64, icicle]
|
||||
needs: [check-changed-files, build-curves-linux, build-fields-linux, build-hashes-linux]
|
||||
steps:
|
||||
- name: Checkout Repo
|
||||
uses: actions/checkout@v4
|
||||
- name: Setup go
|
||||
uses: actions/setup-go@v5
|
||||
with:
|
||||
go-version: '1.20.0'
|
||||
- name: Download ICICLE lib artifacts
|
||||
uses: actions/download-artifact@v4
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
with:
|
||||
path: ./icicle/build/lib
|
||||
merge-multiple: true
|
||||
- name: Run Tests
|
||||
working-directory: ./wrappers/golang
|
||||
if: needs.check-changed-files.outputs.golang == 'true' || needs.check-changed-files.outputs.cpp_cuda == 'true'
|
||||
# -count ensures the test results are not cached
|
||||
# -p controls the number of programs that can be run in parallel
|
||||
run: |
|
||||
HASH=$(echo ${{ matrix.hash.name }} | sed -e 's/_//g')
|
||||
export CPATH=$CPATH:/usr/local/cuda/include
|
||||
go test ./... -count=1 -failfast -p 2 -timeout 60m
|
||||
go test ./$HASH/tests -count=1 -failfast -p 2 -timeout 60m -v
|
||||
|
||||
# TODO: bw6 on windows requires more memory than the standard runner has
|
||||
# Add a large runner and then enable this job
|
||||
|
||||
Reference in New Issue
Block a user