From daa70549f574f0f5c05b7aaab890e5f8f2e15917 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Fri, 8 May 2026 19:51:28 +0000 Subject: [PATCH 1/4] prover: GPU compression path + plumbed (gated) GPU aggregation * Compression (data-availability-v2) auto-enables the gpu/plonk2 prover whenever a CUDA device is reachable. Wall-clock on the reference host drops from ~4:40 (CPU) to ~2:10 per proof. * Aggregation GPU plumbing (gpu/plonk2 PI/BW6/BN254 + gpu/vortex PI MiMC and ring-SIS + gpu/quotient) is wired but disabled by default behind $LINEA_PROVER_GPU_AGGREGATION; leave the flag off in production for now. * cmd/controller refuses execution / aggregation / invalidity jobs when a GPU is detected; only compression is accepted on a GPU host. See prover/reference-benchmarks/README.md for the host class, build command, runtime flags and 3-proof compression reference (avg 2:10.19 on AWS g7e.8xlarge with an RTX PRO 6000 Blackwell). --- prover/Makefile | 7 +- prover/backend/aggregation/prove.go | 162 +- prover/backend/dataavailability/prove.go | 31 +- prover/circuits/aggregation/prover.go | 9 +- prover/circuits/emulation/circuit.go | 10 +- .../crypto/state-management/smt/smt_test.go | 34 + .../crypto/state-management/smt/tree.go | 127 + .../keccak/prover/crypto/vortex/commitment.go | 38 +- .../prover/crypto/vortex/gpu_mimc_cuda.go | 738 ++++ .../prover/crypto/vortex/gpu_mimc_stub.go | 28 + .../prover/crypto/vortex/gpu_mimc_test.go | 309 ++ .../protocol/compiler/globalcs/quotient.go | 205 +- .../compiler/globalcs/quotient_gpu_cuda.go | 179 + .../globalcs/quotient_gpu_cuda_test.go | 50 + .../compiler/globalcs/quotient_gpu_stub.go | 14 + prover/circuits/prove.go | 113 +- prover/circuits/setup.go | 64 +- prover/circuits/srs_store.go | 65 +- .../cmd/controller/controller/fs_watcher.go | 17 +- prover/cmd/prover/cmd/prove.go | 28 + prover/config/config-mainnet-limitless.toml | 6 +- prover/go.mod | 99 +- prover/go.sum | 216 +- prover/gpu/.gitignore | 8 + prover/gpu/cuda/CMakeLists.txt | 46 + prover/gpu/cuda/CMakePresets.json | 34 + prover/gpu/cuda/include/gnark_gpu.h | 1022 +++++ prover/gpu/cuda/include/gnark_gpu_kb.h | 320 ++ prover/gpu/cuda/src/plonk/api.cu | 3381 +++++++++++++++++ prover/gpu/cuda/src/plonk/ec.cuh | 539 +++ prover/gpu/cuda/src/plonk/field.cuh | 283 ++ prover/gpu/cuda/src/plonk/fp.cuh | 543 +++ prover/gpu/cuda/src/plonk/fr_arith.cuh | 225 ++ prover/gpu/cuda/src/plonk/fr_ops.cu | 1395 +++++++ prover/gpu/cuda/src/plonk/kernels.cu | 370 ++ prover/gpu/cuda/src/plonk/msm.cu | 1394 +++++++ prover/gpu/cuda/src/plonk/ntt.cu | 1212 ++++++ prover/gpu/cuda/src/plonk/plonk_eval.cu | 95 + prover/gpu/cuda/src/plonk/plonk_z.cu | 184 + prover/gpu/cuda/src/plonk2/ec.cuh | 338 ++ prover/gpu/cuda/src/plonk2/field.cuh | 844 ++++ prover/gpu/cuda/src/plonk2/g1.cu | 268 ++ prover/gpu/cuda/src/plonk2/kernels.cu | 2250 +++++++++++ prover/gpu/cuda/src/plonk2/mimc.cu | 893 +++++ prover/gpu/cuda/src/plonk2/msm.cu | 1117 ++++++ prover/gpu/cuda/src/vortex/kb.cu | 2424 ++++++++++++ prover/gpu/cuda/src/vortex/kb_field.cuh | 201 + prover/gpu/device.go | 191 + prover/gpu/device_stub.go | 28 + prover/gpu/enabled_cuda.go | 9 + prover/gpu/enabled_nocuda.go | 9 + prover/gpu/gpu.go | 102 + .../internal/generator/common/generator.go | 61 + .../gpu/internal/generator/config/bls12377.go | 20 + prover/gpu/internal/generator/config/bn254.go | 20 + .../gpu/internal/generator/config/bw6761.go | 20 + prover/gpu/internal/generator/config/curve.go | 26 + prover/gpu/internal/generator/main.go | 42 + .../gpu/internal/generator/plonk/generate.go | 47 + .../generator/plonk/template/cgo.go.tmpl | 44 + .../generator/plonk/template/doc.go.tmpl | 7 + .../generator/plonk/template/fft.go.tmpl | 211 + .../generator/plonk/template/fft_stub.go.tmpl | 37 + .../generator/plonk/template/fft_test.go.tmpl | 188 + .../generator/plonk/template/fr.go.tmpl | 270 ++ .../generator/plonk/template/fr_stub.go.tmpl | 37 + .../generator/plonk/template/fr_test.go.tmpl | 275 ++ .../generator/plonk/template/kernels.go.tmpl | 316 ++ .../plonk/template/kernels_stub.go.tmpl | 36 + .../generator/plonk/template/msm.go.tmpl | 407 ++ .../generator/plonk/template/msm_stub.go.tmpl | 34 + .../generator/plonk/template/msm_test.go.tmpl | 139 + .../plonk/template/pinned_fr.go.tmpl | 41 + .../plonk/template/plonk_test.go.tmpl | 210 + .../generator/plonk/template/prove.go.tmpl | 2618 +++++++++++++ .../plonk/template/prove_stub.go.tmpl | 34 + .../generator/plonk/template/templates.go | 54 + prover/gpu/plonk2/bls12377/cgo.go | 44 + prover/gpu/plonk2/bls12377/doc.go | 7 + prover/gpu/plonk2/bls12377/fft.go | 211 + prover/gpu/plonk2/bls12377/fft_stub.go | 37 + prover/gpu/plonk2/bls12377/fft_test.go | 188 + prover/gpu/plonk2/bls12377/fr.go | 270 ++ prover/gpu/plonk2/bls12377/fr_stub.go | 37 + prover/gpu/plonk2/bls12377/fr_test.go | 275 ++ prover/gpu/plonk2/bls12377/kernels.go | 316 ++ prover/gpu/plonk2/bls12377/kernels_stub.go | 36 + prover/gpu/plonk2/bls12377/msm.go | 390 ++ prover/gpu/plonk2/bls12377/msm_stub.go | 34 + prover/gpu/plonk2/bls12377/msm_test.go | 139 + prover/gpu/plonk2/bls12377/pinned_fr.go | 41 + prover/gpu/plonk2/bls12377/plonk_test.go | 205 + prover/gpu/plonk2/bls12377/prove.go | 2618 +++++++++++++ prover/gpu/plonk2/bls12377/prove_stub.go | 34 + prover/gpu/plonk2/bn254/cgo.go | 44 + prover/gpu/plonk2/bn254/doc.go | 7 + prover/gpu/plonk2/bn254/fft.go | 211 + prover/gpu/plonk2/bn254/fft_stub.go | 37 + prover/gpu/plonk2/bn254/fft_test.go | 188 + prover/gpu/plonk2/bn254/fr.go | 270 ++ prover/gpu/plonk2/bn254/fr_stub.go | 37 + prover/gpu/plonk2/bn254/fr_test.go | 275 ++ prover/gpu/plonk2/bn254/kernels.go | 316 ++ prover/gpu/plonk2/bn254/kernels_stub.go | 36 + prover/gpu/plonk2/bn254/msm.go | 390 ++ prover/gpu/plonk2/bn254/msm_stub.go | 34 + prover/gpu/plonk2/bn254/msm_test.go | 139 + prover/gpu/plonk2/bn254/pinned_fr.go | 41 + prover/gpu/plonk2/bn254/plonk_test.go | 169 + prover/gpu/plonk2/bn254/prove.go | 2618 +++++++++++++ prover/gpu/plonk2/bn254/prove_stub.go | 34 + prover/gpu/plonk2/bw6761/cgo.go | 44 + prover/gpu/plonk2/bw6761/doc.go | 7 + prover/gpu/plonk2/bw6761/fft.go | 211 + prover/gpu/plonk2/bw6761/fft_stub.go | 37 + prover/gpu/plonk2/bw6761/fft_test.go | 188 + prover/gpu/plonk2/bw6761/fr.go | 270 ++ prover/gpu/plonk2/bw6761/fr_stub.go | 37 + prover/gpu/plonk2/bw6761/fr_test.go | 275 ++ prover/gpu/plonk2/bw6761/kernels.go | 316 ++ prover/gpu/plonk2/bw6761/kernels_stub.go | 36 + prover/gpu/plonk2/bw6761/msm.go | 388 ++ prover/gpu/plonk2/bw6761/msm_stub.go | 34 + prover/gpu/plonk2/bw6761/msm_test.go | 139 + prover/gpu/plonk2/bw6761/pinned_fr.go | 41 + prover/gpu/plonk2/bw6761/plonk_test.go | 169 + prover/gpu/plonk2/bw6761/prove.go | 2618 +++++++++++++ prover/gpu/plonk2/bw6761/prove_stub.go | 34 + prover/gpu/plonk2/doc.go | 40 + prover/gpu/plonk2/options.go | 42 + prover/gpu/plonk2/prove.go | 197 + prover/gpu/plonk2/prove_test.go | 53 + prover/gpu/plonk2/stub.go | 3 + prover/gpu/quotient/quotient.go | 626 +++ prover/gpu/quotient/quotient_test.go | 81 + prover/gpu/quotient/stub.go | 25 + prover/gpu/singleton.go | 212 ++ prover/gpu/singleton_cuda_test.go | 33 + prover/gpu/singleton_test.go | 43 + prover/gpu/symbolic/adapter.go | 71 + prover/gpu/symbolic/compile.go | 167 + prover/gpu/symbolic/gpu.go | 206 + prover/gpu/symbolic/stub.go | 44 + prover/gpu/symbolic/symbolic_test.go | 347 ++ prover/gpu/threadlocal_linux.go | 68 + prover/gpu/threadlocal_other.go | 20 + prover/gpu/trace.go | 118 + prover/gpu/vortex/commit_cpu.go | 114 + prover/gpu/vortex/commit_merkle.go | 470 +++ prover/gpu/vortex/commit_merkle_stub.go | 69 + prover/gpu/vortex/commit_merkle_test.go | 582 +++ prover/gpu/vortex/gpu.go | 1453 +++++++ prover/gpu/vortex/gpu_test.go | 531 +++ prover/gpu/vortex/pinned_cache.go | 98 + prover/gpu/vortex/stub.go | 67 + prover/gpu/vortex/vortex.go | 217 ++ prover/gpu/vortex/vortex_test.go | 514 +++ .../circuit-testing/aggregation/main.go | 4 +- prover/protocol/compiler/globalcs/quotient.go | 39 +- prover/protocol/compiler/recursion/actions.go | 12 +- prover/protocol/compiler/vortex/committed.go | 162 + prover/protocol/compiler/vortex/prover.go | 415 +- prover/reference-benchmarks/README.md | 132 + ...innet-limitless-7.1.0-provertestdata2.toml | 138 + .../30388561-30389025-response.json | 1 + .../30389026-30389504-response.json | 1 + .../30389505-30390023-response.json | 1 + .../env.txt | 14 + .../logs/30388561-30389025.time.txt | 23 + .../logs/30389026-30389504.time.txt | 23 + .../logs/30389505-30390023.time.txt | 23 + .../selected_requests.txt | 3 + 172 files changed, 49159 insertions(+), 383 deletions(-) create mode 100644 prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_cuda.go create mode 100644 prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_stub.go create mode 100644 prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_test.go create mode 100644 prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda.go create mode 100644 prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda_test.go create mode 100644 prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_stub.go create mode 100644 prover/gpu/.gitignore create mode 100644 prover/gpu/cuda/CMakeLists.txt create mode 100644 prover/gpu/cuda/CMakePresets.json create mode 100644 prover/gpu/cuda/include/gnark_gpu.h create mode 100644 prover/gpu/cuda/include/gnark_gpu_kb.h create mode 100644 prover/gpu/cuda/src/plonk/api.cu create mode 100644 prover/gpu/cuda/src/plonk/ec.cuh create mode 100644 prover/gpu/cuda/src/plonk/field.cuh create mode 100644 prover/gpu/cuda/src/plonk/fp.cuh create mode 100644 prover/gpu/cuda/src/plonk/fr_arith.cuh create mode 100644 prover/gpu/cuda/src/plonk/fr_ops.cu create mode 100644 prover/gpu/cuda/src/plonk/kernels.cu create mode 100644 prover/gpu/cuda/src/plonk/msm.cu create mode 100644 prover/gpu/cuda/src/plonk/ntt.cu create mode 100644 prover/gpu/cuda/src/plonk/plonk_eval.cu create mode 100644 prover/gpu/cuda/src/plonk/plonk_z.cu create mode 100644 prover/gpu/cuda/src/plonk2/ec.cuh create mode 100644 prover/gpu/cuda/src/plonk2/field.cuh create mode 100644 prover/gpu/cuda/src/plonk2/g1.cu create mode 100644 prover/gpu/cuda/src/plonk2/kernels.cu create mode 100644 prover/gpu/cuda/src/plonk2/mimc.cu create mode 100644 prover/gpu/cuda/src/plonk2/msm.cu create mode 100644 prover/gpu/cuda/src/vortex/kb.cu create mode 100644 prover/gpu/cuda/src/vortex/kb_field.cuh create mode 100644 prover/gpu/device.go create mode 100644 prover/gpu/device_stub.go create mode 100644 prover/gpu/enabled_cuda.go create mode 100644 prover/gpu/enabled_nocuda.go create mode 100644 prover/gpu/gpu.go create mode 100644 prover/gpu/internal/generator/common/generator.go create mode 100644 prover/gpu/internal/generator/config/bls12377.go create mode 100644 prover/gpu/internal/generator/config/bn254.go create mode 100644 prover/gpu/internal/generator/config/bw6761.go create mode 100644 prover/gpu/internal/generator/config/curve.go create mode 100644 prover/gpu/internal/generator/main.go create mode 100644 prover/gpu/internal/generator/plonk/generate.go create mode 100644 prover/gpu/internal/generator/plonk/template/cgo.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/doc.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/fft.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/fft_stub.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/fft_test.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/fr.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/fr_stub.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/fr_test.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/kernels.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/kernels_stub.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/msm.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/msm_stub.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/msm_test.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/pinned_fr.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/plonk_test.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/prove.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/prove_stub.go.tmpl create mode 100644 prover/gpu/internal/generator/plonk/template/templates.go create mode 100644 prover/gpu/plonk2/bls12377/cgo.go create mode 100644 prover/gpu/plonk2/bls12377/doc.go create mode 100644 prover/gpu/plonk2/bls12377/fft.go create mode 100644 prover/gpu/plonk2/bls12377/fft_stub.go create mode 100644 prover/gpu/plonk2/bls12377/fft_test.go create mode 100644 prover/gpu/plonk2/bls12377/fr.go create mode 100644 prover/gpu/plonk2/bls12377/fr_stub.go create mode 100644 prover/gpu/plonk2/bls12377/fr_test.go create mode 100644 prover/gpu/plonk2/bls12377/kernels.go create mode 100644 prover/gpu/plonk2/bls12377/kernels_stub.go create mode 100644 prover/gpu/plonk2/bls12377/msm.go create mode 100644 prover/gpu/plonk2/bls12377/msm_stub.go create mode 100644 prover/gpu/plonk2/bls12377/msm_test.go create mode 100644 prover/gpu/plonk2/bls12377/pinned_fr.go create mode 100644 prover/gpu/plonk2/bls12377/plonk_test.go create mode 100644 prover/gpu/plonk2/bls12377/prove.go create mode 100644 prover/gpu/plonk2/bls12377/prove_stub.go create mode 100644 prover/gpu/plonk2/bn254/cgo.go create mode 100644 prover/gpu/plonk2/bn254/doc.go create mode 100644 prover/gpu/plonk2/bn254/fft.go create mode 100644 prover/gpu/plonk2/bn254/fft_stub.go create mode 100644 prover/gpu/plonk2/bn254/fft_test.go create mode 100644 prover/gpu/plonk2/bn254/fr.go create mode 100644 prover/gpu/plonk2/bn254/fr_stub.go create mode 100644 prover/gpu/plonk2/bn254/fr_test.go create mode 100644 prover/gpu/plonk2/bn254/kernels.go create mode 100644 prover/gpu/plonk2/bn254/kernels_stub.go create mode 100644 prover/gpu/plonk2/bn254/msm.go create mode 100644 prover/gpu/plonk2/bn254/msm_stub.go create mode 100644 prover/gpu/plonk2/bn254/msm_test.go create mode 100644 prover/gpu/plonk2/bn254/pinned_fr.go create mode 100644 prover/gpu/plonk2/bn254/plonk_test.go create mode 100644 prover/gpu/plonk2/bn254/prove.go create mode 100644 prover/gpu/plonk2/bn254/prove_stub.go create mode 100644 prover/gpu/plonk2/bw6761/cgo.go create mode 100644 prover/gpu/plonk2/bw6761/doc.go create mode 100644 prover/gpu/plonk2/bw6761/fft.go create mode 100644 prover/gpu/plonk2/bw6761/fft_stub.go create mode 100644 prover/gpu/plonk2/bw6761/fft_test.go create mode 100644 prover/gpu/plonk2/bw6761/fr.go create mode 100644 prover/gpu/plonk2/bw6761/fr_stub.go create mode 100644 prover/gpu/plonk2/bw6761/fr_test.go create mode 100644 prover/gpu/plonk2/bw6761/kernels.go create mode 100644 prover/gpu/plonk2/bw6761/kernels_stub.go create mode 100644 prover/gpu/plonk2/bw6761/msm.go create mode 100644 prover/gpu/plonk2/bw6761/msm_stub.go create mode 100644 prover/gpu/plonk2/bw6761/msm_test.go create mode 100644 prover/gpu/plonk2/bw6761/pinned_fr.go create mode 100644 prover/gpu/plonk2/bw6761/plonk_test.go create mode 100644 prover/gpu/plonk2/bw6761/prove.go create mode 100644 prover/gpu/plonk2/bw6761/prove_stub.go create mode 100644 prover/gpu/plonk2/doc.go create mode 100644 prover/gpu/plonk2/options.go create mode 100644 prover/gpu/plonk2/prove.go create mode 100644 prover/gpu/plonk2/prove_test.go create mode 100644 prover/gpu/plonk2/stub.go create mode 100644 prover/gpu/quotient/quotient.go create mode 100644 prover/gpu/quotient/quotient_test.go create mode 100644 prover/gpu/quotient/stub.go create mode 100644 prover/gpu/singleton.go create mode 100644 prover/gpu/singleton_cuda_test.go create mode 100644 prover/gpu/singleton_test.go create mode 100644 prover/gpu/symbolic/adapter.go create mode 100644 prover/gpu/symbolic/compile.go create mode 100644 prover/gpu/symbolic/gpu.go create mode 100644 prover/gpu/symbolic/stub.go create mode 100644 prover/gpu/symbolic/symbolic_test.go create mode 100644 prover/gpu/threadlocal_linux.go create mode 100644 prover/gpu/threadlocal_other.go create mode 100644 prover/gpu/trace.go create mode 100644 prover/gpu/vortex/commit_cpu.go create mode 100644 prover/gpu/vortex/commit_merkle.go create mode 100644 prover/gpu/vortex/commit_merkle_stub.go create mode 100644 prover/gpu/vortex/commit_merkle_test.go create mode 100644 prover/gpu/vortex/gpu.go create mode 100644 prover/gpu/vortex/gpu_test.go create mode 100644 prover/gpu/vortex/pinned_cache.go create mode 100644 prover/gpu/vortex/stub.go create mode 100644 prover/gpu/vortex/vortex.go create mode 100644 prover/gpu/vortex/vortex_test.go create mode 100644 prover/protocol/compiler/vortex/committed.go create mode 100644 prover/reference-benchmarks/README.md create mode 100644 prover/reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30388561-30389025-response.json create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389026-30389504-response.json create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389505-30390023-response.json create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/env.txt create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30388561-30389025.time.txt create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389026-30389504.time.txt create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389505-30390023.time.txt create mode 100644 prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/selected_requests.txt diff --git a/prover/Makefile b/prover/Makefile index 940bf719b56..3796f35a08f 100644 --- a/prover/Makefile +++ b/prover/Makefile @@ -2,6 +2,7 @@ SHELL := /usr/bin/env bash VERSION ?= $(shell git rev-parse --short=7 HEAD) +GO_BUILD_TAGS ?= debug CONSTRAINTS_COMMIT_HASH := c94b73d3aef361b5138986d38f27f21675aef275 GO_CORSET_VERSION := v1.2.7 @@ -33,6 +34,7 @@ DARWIN_ARM64_FLAGS := CGO_ENABLED=1 GOOS="linux" GOARCH="arm64" lib/compressor-and-shnarf-calculator-local \ docker \ bin/prover \ + bin/prover-cuda \ bin/checker \ go-corset \ testdata \ @@ -85,7 +87,10 @@ bin/controller: bin/prover: mkdir -p bin rm -f $@ - go build -tags debug -o $@ ./cmd/prover + go build -tags "$(GO_BUILD_TAGS)" -o $@ ./cmd/prover + +bin/prover-cuda: + $(MAKE) GO_BUILD_TAGS=debug,cuda bin/prover ## ## Compiles the state-manager inspector diff --git a/prover/backend/aggregation/prove.go b/prover/backend/aggregation/prove.go index c5714a93cb6..a18e20abc54 100644 --- a/prover/backend/aggregation/prove.go +++ b/prover/backend/aggregation/prove.go @@ -19,6 +19,7 @@ import ( "github.com/consensys/linea-monorepo/prover/circuits/dummy" "github.com/consensys/linea-monorepo/prover/circuits/emulation" "github.com/consensys/linea-monorepo/prover/config" + "github.com/consensys/linea-monorepo/prover/gpu" "github.com/consensys/linea-monorepo/prover/utils" "github.com/consensys/linea-monorepo/prover/utils/types" "github.com/ethereum/go-ethereum/common/hexutil" @@ -30,6 +31,18 @@ import ( frBn254 "github.com/consensys/gnark-crypto/ecc/bn254/fr" ) +type setupLoadResult struct { + setup circuits.Setup + err error +} + +type bw6SetupSelection struct { + circuitID circuits.CircuitID + setupPos int + bestSize int + bestAllowedVkForAggregation []string +} + func Prove(cfg *config.Config, req *Request) (*Response, error) { cf, err := collectFields(cfg, req) if err != nil { @@ -52,17 +65,36 @@ func makeProof( return makeDummyProof(cfg, publicInput, circuits.MockCircuitIDEmulation), nil } + useGPU := gpu.IsAggregationEnabled() + + bw6Selection, err := selectBw6Setup(cfg, cf) + if err != nil { + return "", fmt.Errorf("could not select the BW6 setup: %w", err) + } + + bw6Setup := prefetchSetup(cfg, bw6Selection.circuitID, "BW6", useGPU) + bn254Setup := prefetchSetup(cfg, circuits.EmulationCircuitID, "BN254", useGPU) + piProof, piPublicWitness, err := makePiProof(cfg, cf) if err != nil { return "", fmt.Errorf("could not create the public input proof: %w", err) } - proofBW6, setupPos, err := makeBw6Proof(cfg, cf, piProof, piPublicWitness, publicInput) + proofBW6, setupPos, err := makeBw6ProofWithSetup( + cfg, + cf, + piProof, + piPublicWitness, + publicInput, + &bw6Selection, + bw6Setup, + useGPU, + ) if err != nil { return "", fmt.Errorf("error when running the BW6 proof: %w", err) } - proofBn254, err := makeBn254Proof(cfg, setupPos, proofBW6, publicInput) + proofBn254, err := makeBn254Proof(cfg, setupPos, proofBW6, publicInput, bn254Setup, useGPU) if err != nil { return "", fmt.Errorf("error when running the Bn254 proof (aggregation setupPos=%v): %w", setupPos, err) } @@ -97,14 +129,26 @@ func (cf CollectedFields) AggregationPublicInput(cfg *config.Config) public_inpu } } -func makePiProof(cfg *config.Config, cf *CollectedFields) (plonk.Proof, witness.Witness, error) { +func makePiProof( + cfg *config.Config, + cf *CollectedFields, +) (plonk.Proof, witness.Witness, error) { + + // Aggregation phases (PI, BW6, BN254) only use the GPU prover when both a + // device is reachable and the operator opted in via $LINEA_PROVER_GPU_AGGREGATION. + // Compression is the only path GPU-on-by-default in this branch. + useGPU := gpu.IsAggregationEnabled() var setup circuits.Setup setupErr := make(chan error, 1) go func() { var err error - setup, err = circuits.LoadSetup(cfg, circuits.PublicInputInterconnectionCircuitID) + var setupOpts []circuits.LoadSetupOption + if useGPU { + setupOpts = append(setupOpts, circuits.WithoutLagrangeSRS()) + } + setup, err = circuits.LoadSetup(cfg, circuits.PublicInputInterconnectionCircuitID, setupOpts...) setupErr <- err close(setupErr) }() @@ -136,7 +180,7 @@ func makePiProof(cfg *config.Config, cf *CollectedFields) (plonk.Proof, witness. proverOpts := emPlonk.GetNativeProverOptions(ecc.BW6_761.ScalarField(), setup.Circuit.Field()) verifierOpts := emPlonk.GetNativeVerifierOptions(ecc.BW6_761.ScalarField(), setup.Circuit.Field()) - proof, err := circuits.ProveCheck(&setup, &assignment, proverOpts, verifierOpts) + proof, err := circuits.ProveCheck(&setup, &assignment, proverOpts, verifierOpts, circuits.WithGPU(useGPU)) return proof, w, err } @@ -168,20 +212,12 @@ func makeDummyProof(cfg *config.Config, input string, circID circuits.MockCircui return dummy.MakeProof(&setup, x, circID) } -func makeBw6Proof( - cfg *config.Config, - cf *CollectedFields, - piProof plonk.Proof, - piPublicWitness witness.Witness, - publicInput string, -) (proof plonk.Proof, setupPos int, err error) { - - // This determines which is the best circuit to use for aggregation, we - // take the smallest circuit that has enough capacity. +func selectBw6Setup(cfg *config.Config, cf *CollectedFields) (bw6SetupSelection, error) { var ( numProofClaims = len(cf.ProofClaims) biggestAvailable = 0 + setupPos = 0 bestSize = math.MaxInt bestAllowedVkForAggregation []string errs []error @@ -217,11 +253,11 @@ func makeBw6Proof( setupPath := cfg.PathForSetup(string(circuitIDStr)) manifest, err := circuits.ReadSetupManifest(filepath.Join(setupPath, config.ManifestFileName)) if err != nil { - return nil, 0, fmt.Errorf("could not read the manifest for circuit %v: %w", circuitIDStr, err) + return bw6SetupSelection{}, fmt.Errorf("could not read the manifest for circuit %v: %w", circuitIDStr, err) } allowedVkForAggregation, err := manifest.GetStringArray("allowedVkForAggregationDigests") if err != nil { - return nil, 0, fmt.Errorf("could not read the allowedVkForAggregationDigests: %w", err) + return bw6SetupSelection{}, fmt.Errorf("could not read the allowedVkForAggregationDigests: %w", err) } // Try to read circuit names (may not exist in older manifests) allowedVkCircuitNames, _ := manifest.GetStringArray("allowedVkForAggregationCircuitNames") @@ -255,14 +291,40 @@ func makeBw6Proof( ) errs = append(errs, err) - return nil, 0, errors.Join(errs...) + return bw6SetupSelection{}, errors.Join(errs...) } - logrus.Infof("reading the BW6 setup for %v proofs", bestSize) c := circuits.CircuitID(fmt.Sprintf("%s-%d", string(circuits.AggregationCircuitID), bestSize)) - setup, err := circuits.LoadSetup(cfg, c) + return bw6SetupSelection{ + circuitID: c, + setupPos: setupPos, + bestSize: bestSize, + bestAllowedVkForAggregation: bestAllowedVkForAggregation, + }, nil +} + +func makeBw6ProofWithSetup( + cfg *config.Config, + cf *CollectedFields, + piProof plonk.Proof, + piPublicWitness witness.Witness, + publicInput string, + selection *bw6SetupSelection, + setupResult <-chan setupLoadResult, + useGPU bool, +) (proof plonk.Proof, setupPos int, err error) { + if selection == nil { + s, err := selectBw6Setup(cfg, cf) + if err != nil { + return nil, 0, err + } + selection = &s + } + + logrus.Infof("reading the BW6 setup for %v proofs", selection.bestSize) + setup, err := loadOrAwaitSetup(cfg, selection.circuitID, setupResult, "BW6") if err != nil { - return nil, 0, fmt.Errorf("could not load the setup for circuit %v: %w", c, err) + return nil, 0, err } // Now, that we have selected "the best" setup to use to aggregate all the @@ -270,13 +332,13 @@ func makeBw6Proof( // not do it before because the "ordering" of the verifying keys can be // circuit dependent. So, we needed to pick the circuit first. - AssignCircuitIDToProofClaims(bestAllowedVkForAggregation, cf.ProofClaims) + AssignCircuitIDToProofClaims(selection.bestAllowedVkForAggregation, cf.ProofClaims) // Pre-flight check: validate that all assigned circuit IDs are allowed by // the IsAllowedCircuitID bitmask BEFORE running the expensive BW6 prover. // Without this check, disallowed circuits would only be caught inside the // circuit constraints, producing a cryptic "assertIsEqual 0==1" error. - if err := validateCircuitIDsAllowed(cfg.Aggregation.IsAllowedCircuitID, cf.ProofClaims, cf.ProofClaimSources, bestAllowedVkForAggregation); err != nil { + if err := validateCircuitIDsAllowed(cfg.Aggregation.IsAllowedCircuitID, cf.ProofClaims, cf.ProofClaimSources, selection.bestAllowedVkForAggregation); err != nil { return nil, 0, err } @@ -298,12 +360,12 @@ func makeBw6Proof( ActualIndexes: pi_interconnection.InnerCircuitTypesToIndexes(&cfg.PublicInputInterconnection, cf.InnerCircuitTypes), } - logrus.Infof("running the BW6 prover with aggregation setupPos=%v (aggregation-%v)", setupPos, bestSize) - proofBW6, err := aggregation.MakeProof(&setup, bestSize, cf.ProofClaims, piInfo, piBW6) + logrus.Infof("running the BW6 prover with aggregation setupPos=%v (aggregation-%v)", selection.setupPos, selection.bestSize) + proofBW6, err := aggregation.MakeProof(&setup, selection.bestSize, cf.ProofClaims, piInfo, piBW6, useGPU) if err != nil { return nil, 0, fmt.Errorf("could not create BW6 proof: %w", err) } - return proofBW6, setupPos, nil + return proofBW6, selection.setupPos, nil } func makeBn254Proof( @@ -311,13 +373,13 @@ func makeBn254Proof( setupPos int, proofBw6 plonk.Proof, publicInput string, + setupResult <-chan setupLoadResult, + useGPU bool, ) (proof plonk.Proof, err error) { - logrus.Infof("reading the BN254 setup from disk...") - - setup, err := circuits.LoadSetup(cfg, circuits.EmulationCircuitID) + setup, err := loadOrAwaitSetup(cfg, circuits.EmulationCircuitID, setupResult, "BN254") if err != nil { - return nil, fmt.Errorf("could not read the BN254 setup: %w", err) + return nil, err } logrus.Infof("running the prover for the BN254 circuit...") @@ -330,7 +392,7 @@ func makeBn254Proof( logrus.Infof("running the BN254 emulation prover with aggregation setupPos=%v", setupPos) - proofBn254, err := emulation.MakeProof(&setup, setupPos, proofBw6, piBn254) + proofBn254, err := emulation.MakeProof(&setup, setupPos, proofBw6, piBn254, useGPU) if err != nil { return nil, fmt.Errorf("(for Bn254) gnark's plonk Prover failed with error: %w", err) } @@ -338,6 +400,44 @@ func makeBn254Proof( } +func prefetchSetup(cfg *config.Config, circuitID circuits.CircuitID, label string, useGPU bool) <-chan setupLoadResult { + setupResult := make(chan setupLoadResult, 1) + go func() { + logrus.Infof("prefetching the %s setup from disk...", label) + var setupOpts []circuits.LoadSetupOption + if useGPU { + setupOpts = append(setupOpts, circuits.WithoutLagrangeSRS()) + } + setup, err := circuits.LoadSetup(cfg, circuitID, setupOpts...) + setupResult <- setupLoadResult{setup: setup, err: err} + close(setupResult) + }() + return setupResult +} + +func loadOrAwaitSetup( + cfg *config.Config, + circuitID circuits.CircuitID, + setupResult <-chan setupLoadResult, + label string, +) (circuits.Setup, error) { + if setupResult == nil { + logrus.Infof("reading the %s setup from disk...", label) + setup, err := circuits.LoadSetup(cfg, circuitID) + if err != nil { + return circuits.Setup{}, fmt.Errorf("could not read the %s setup: %w", label, err) + } + return setup, nil + } + + logrus.Infof("waiting for prefetched %s setup...", label) + result := <-setupResult + if result.err != nil { + return circuits.Setup{}, fmt.Errorf("could not read the prefetched %s setup: %w", label, result.err) + } + return result.setup, nil +} + // logAllowedVKs logs the allowed VKs for a given aggregation setup, with // circuit names if available (from the manifest's allowedVkForAggregationCircuitNames). func logAllowedVKs(maxNbProofs int, vks []string, circuitNames []string) { diff --git a/prover/backend/dataavailability/prove.go b/prover/backend/dataavailability/prove.go index cebcd7fc102..e464405e069 100644 --- a/prover/backend/dataavailability/prove.go +++ b/prover/backend/dataavailability/prove.go @@ -15,6 +15,7 @@ import ( "github.com/consensys/linea-monorepo/prover/circuits" "github.com/consensys/linea-monorepo/prover/circuits/dummy" "github.com/consensys/linea-monorepo/prover/config" + "github.com/consensys/linea-monorepo/prover/gpu" "github.com/consensys/linea-monorepo/prover/lib/compressor/blob" "github.com/consensys/linea-monorepo/prover/utils" @@ -63,6 +64,28 @@ func Prove(cfg *config.Config, req *Request) (*Response, error) { return nil, fmt.Errorf("unsupported blob version: %v", version) } + // Compression always uses the GPU prover when one is detected. The GPU + // path only needs the canonical SRS, so we skip the Lagrange SRS read + // (saves ~17s on the 126M-constraint compression circuit). + useGPU := gpu.HasDevice() + + type setupResult struct { + setup circuits.Setup + err error + } + var setupCh chan setupResult + if cfg.DataAvailability.ProverMode != config.ProverModeDev { + setupCh = make(chan setupResult, 1) + go func() { + var setupOpts []circuits.LoadSetupOption + if useGPU { + setupOpts = append(setupOpts, circuits.WithoutLagrangeSRS()) + } + setup, err := circuits.LoadSetup(cfg, circuitID, setupOpts...) + setupCh <- setupResult{setup: setup, err: err} + }() + } + logrus.Info("reading dictionaries") dictStore := cfg.BlobDecompressionDictStore(string(circuitID)) @@ -112,9 +135,12 @@ func Prove(cfg *config.Config, req *Request) (*Response, error) { proofSerialized = dummy.MakeProof(&setup, pubInput, circuits.MockCircuitIDDecompression) } else { - if setup, err = circuits.LoadSetup(cfg, circuitID); err != nil { + setupRes := <-setupCh + if setupRes.err != nil { + err = setupRes.err return nil, fmt.Errorf("could not load the setup: %w", err) } + setup = setupRes.setup maxUsableBytes, err := setup.Manifest.GetInt("maxUsableBytes") if err != nil { @@ -140,11 +166,12 @@ func Prove(cfg *config.Config, req *Request) (*Response, error) { opts := []any{ emPlonk.GetNativeProverOptions(ecc.BW6_761.ScalarField(), ecc.BLS12_377.ScalarField()), emPlonk.GetNativeVerifierOptions(ecc.BW6_761.ScalarField(), ecc.BLS12_377.ScalarField()), + circuits.WithGPU(useGPU), } // This actually runs the compression prover - logrus.Infof("running the decompression prover") + logrus.Infof("running the decompression prover (gpu=%t)", useGPU) proof, err := circuits.ProveCheck( &setup, diff --git a/prover/circuits/aggregation/prover.go b/prover/circuits/aggregation/prover.go index d4987346b45..1a18521533f 100644 --- a/prover/circuits/aggregation/prover.go +++ b/prover/circuits/aggregation/prover.go @@ -14,12 +14,18 @@ import ( // Make proof runs the prover of the aggregation circuit and returns the // corresponding proof. +// +// useGPU forwards the GPU choice from the aggregation backend to the +// underlying ProveCheck. The aggregation BW6 circuit is one of the GPU +// targets gated behind LINEA_PROVER_GPU_AGGREGATION; the backend computes +// that boolean once and threads it through here. func MakeProof( setup *circuits.Setup, maxNbProof int, proofClaims []ProofClaimAssignment, piInfo PiInfo, publicInput fr.Element, + useGPU bool, ) ( plonk.Proof, error, @@ -37,12 +43,13 @@ func MakeProof( return nil, fmt.Errorf("while generating the aggregation circuit assignment: %w", err) } - logrus.Infof("Running the prove-check") + logrus.Infof("Running the prove-check (gpu=%t)", useGPU) return circuits.ProveCheck( setup, assignment, emPlonk.GetNativeProverOptions(ecc.BN254.ScalarField(), setup.Circuit.Field()), emPlonk.GetNativeVerifierOptions(ecc.BN254.ScalarField(), setup.Circuit.Field()), + circuits.WithGPU(useGPU), ) } diff --git a/prover/circuits/emulation/circuit.go b/prover/circuits/emulation/circuit.go index e5282cf737a..86a02c7d24c 100644 --- a/prover/circuits/emulation/circuit.go +++ b/prover/circuits/emulation/circuit.go @@ -84,12 +84,18 @@ func (c *CircuitEmulation) Define(api frontend.API) error { return nil } -// Produces a proof for the outer-proof outside on the BN field +// Produces a proof for the outer-proof outside on the BN field. +// +// useGPU forwards the GPU choice from the aggregation backend to the +// underlying ProveCheck. The BN254 emulation circuit is one of the GPU +// targets gated behind LINEA_PROVER_GPU_AGGREGATION; the backend computes +// that boolean once and threads it through here. func MakeProof( setup *circuits.Setup, circuitID int, innerProof plonk.Proof, publicInput fr.Element, + useGPU bool, ) ( proof plonk.Proof, err error, @@ -105,7 +111,7 @@ func MakeProof( return nil, fmt.Errorf("while generating the aggregation circuit assignment: %w", err) } - return circuits.ProveCheck(setup, assignment) + return circuits.ProveCheck(setup, assignment, circuits.WithGPU(useGPU)) } // Allocates a new outer-circuit that can be passed to `frontend.Compile`. The diff --git a/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/smt_test.go b/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/smt_test.go index a906089bd85..c575ffa0167 100644 --- a/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/smt_test.go +++ b/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/smt_test.go @@ -57,6 +57,26 @@ func TestMerkleProofNative(t *testing.T) { } } +func TestBuildCompleteMiMCEquivalent(t *testing.T) { + const depth = 12 + config := &smt.Config{ + HashFunc: hashtypes.MiMC, + Depth: depth, + } + + leavesFr := vector.Rand(1 << config.Depth) + leaves := make([]Bytes32, len(leavesFr)) + for i := range leaves { + leaves[i] = Bytes32(leavesFr[i].Bytes()) + } + + generic := smt.BuildComplete(leaves, config.HashFunc) + specialized := smt.BuildCompleteMiMC(leaves) + + require.Equal(t, generic.Root, specialized.Root, "specialized MiMC tree root should match generic builder") + require.Equal(t, generic.OccupiedNodes, specialized.OccupiedNodes, "specialized MiMC tree nodes should match generic builder") +} + func BenchmarkBuildComplete(b *testing.B) { config := &smt.Config{ HashFunc: hashtypes.MiMC, @@ -74,3 +94,17 @@ func BenchmarkBuildComplete(b *testing.B) { _ = smt.BuildComplete(leaves, config.HashFunc) } } + +func BenchmarkBuildCompleteMiMC(b *testing.B) { + const depth = 20 + + leavesFr := vector.Rand(1 << depth) + leaves := make([]Bytes32, len(leavesFr)) + for i := range leaves { + leaves[i] = Bytes32(leavesFr[i].Bytes()) + } + + for b.Loop() { + _ = smt.BuildCompleteMiMC(leaves) + } +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/tree.go b/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/tree.go index 791a3738c95..d4df26a1d21 100644 --- a/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/tree.go +++ b/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt/tree.go @@ -5,7 +5,9 @@ import ( "runtime" "sync" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/mimc" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/hashtypes" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/types" ) @@ -367,3 +369,128 @@ func BuildComplete(leaves []types.Bytes32, hashFunc func() hashtypes.Hasher) *Tr tree.Root = hashLR(hasher, currLevels[0], currLevels[1]) return tree } + +// BuildCompleteMiMC builds a complete Merkle tree using the BLS12-377 MiMC +// field hasher. It is equivalent to BuildComplete(leaves, hashtypes.MiMC), but +// avoids the generic hash.Hash byte interface on the Vortex prover hot path. +func BuildCompleteMiMC(leaves []types.Bytes32) *Tree { + numLeaves := len(leaves) + + if !utils.IsPowerOfTwo(numLeaves) || numLeaves == 0 { + utils.Panic("expected power of two number of leaves, got %v", numLeaves) + } + + depth := utils.Log2Ceil(numLeaves) + config := &Config{HashFunc: hashtypes.MiMC, Depth: depth} + tree := newEmptyTreeMiMC(config) + tree.OccupiedLeaves = leaves + + nbTotalNodes := 0 + for d := 1; d < depth; d++ { + nbTotalNodes += 1 << (depth - d) + } + arena := make([]types.Bytes32, nbTotalNodes) + offset := 0 + + type workerTask struct { + curr []types.Bytes32 + next []types.Bytes32 + start int + stop int + wg *sync.WaitGroup + } + + workerCount := runtime.GOMAXPROCS(0) + if workerCount < 1 { + workerCount = 1 + } + + tasks := make(chan workerTask) + var workersWG sync.WaitGroup + workersWG.Add(workerCount) + for i := 0; i < workerCount; i++ { + go func() { + hasher := mimc.NewFieldHasher() + for task := range tasks { + for k := task.start; k < task.stop; k++ { + task.next[k] = hashLRMiMC(hasher, task.curr[2*k], task.curr[2*k+1]) + } + task.wg.Done() + } + workersWG.Done() + }() + } + + currLevels := leaves + for d := 1; d < depth; d++ { + levelSize := 1 << (depth - d) + nextLevel := arena[offset : offset+levelSize] + + activeWorkers := workerCount + if levelSize < activeWorkers { + activeWorkers = levelSize + } + + var levelWG sync.WaitGroup + levelWG.Add(activeWorkers) + for i := 0; i < activeWorkers; i++ { + start := i * levelSize / activeWorkers + stop := (i + 1) * levelSize / activeWorkers + tasks <- workerTask{ + curr: currLevels, + next: nextLevel, + start: start, + stop: stop, + wg: &levelWG, + } + } + levelWG.Wait() + + tree.OccupiedNodes[d-1] = nextLevel + currLevels = nextLevel + offset += levelSize + } + + close(tasks) + workersWG.Wait() + + if len(currLevels) != 2 { + utils.Panic("broken invariant : len(currLevels) != 2, =%v", len(currLevels)) + } + + hasher := mimc.NewFieldHasher() + tree.Root = hashLRMiMC(hasher, currLevels[0], currLevels[1]) + return tree +} + +func newEmptyTreeMiMC(conf *Config) *Tree { + emptyNodes := make([]types.Bytes32, conf.Depth-1) + prevNode := EmptyLeaf() + hasher := mimc.NewFieldHasher() + + for i := range emptyNodes { + newNode := hashLRMiMC(hasher, prevNode, prevNode) + emptyNodes[i] = newNode + prevNode = newNode + } + + root := hashLRMiMC(hasher, prevNode, prevNode) + + return &Tree{ + Config: conf, + Root: root, + OccupiedLeaves: make([]types.Bytes32, 0), + OccupiedNodes: make([][]types.Bytes32, conf.Depth-1), + EmptyNodes: emptyNodes, + } +} + +func hashLRMiMC(hasher mimc.FieldHasher, nodeL, nodeR types.Bytes32) types.Bytes32 { + var elems [2]field.Element + elems[0].SetBytes(nodeL[:]) + elems[1].SetBytes(nodeR[:]) + + hasher.Reset() + digest := hasher.SumElements(elems[:]) + return digest.Bytes() +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/commitment.go b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/commitment.go index d6da182affd..5baff4f4c76 100644 --- a/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/commitment.go +++ b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/commitment.go @@ -5,7 +5,6 @@ import ( "runtime" "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/mimc" - "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/hashtypes" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/common/smartvectors" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" @@ -40,21 +39,31 @@ func (p *Params) CommitMerkleWithSIS(ps []smartvectors.SmartVector) (encodedMatr timeEncoding := profiling.TimeIt(func() { encodedMatrix = p.encodeRows(ps) }) + sisDoneOnGPU := false timeSisHashing := profiling.TimeIt(func() { + if gpuTree, gpuColHashes, ok := tryCommitSISGPU(encodedMatrix, p.Key); ok { + tree = gpuTree + colHashes = gpuColHashes + sisDoneOnGPU = true + return + } + // colHashes stores concatenation of SIS hashes of the columns colHashes = p.Key.TransversalHash(encodedMatrix) }) timeTree := profiling.TimeIt(func() { + if sisDoneOnGPU { + return + } + if gpuTree, ok := tryBuildSISMiMCTreeGPU(colHashes, p.Key.OutputSize()); ok { + tree = gpuTree + return + } + // Hash the SIS digests to obtain the leaves of the Merkle tree. leaves := p.hashSisHash(colHashes) - - tree = smt.BuildComplete( - leaves, - func() hashtypes.Hasher { - return hashtypes.Hasher{Hash: mimc.NewMiMC()} - }, - ) + tree = smt.BuildCompleteMiMC(leaves) }) logrus.Infof( @@ -86,6 +95,12 @@ func (p *Params) CommitMerkleWithoutSIS(ps []smartvectors.SmartVector) (encodedM }) timeTree := profiling.TimeIt(func() { + if gpuTree, gpuColHashes, ok := tryCommitNoSISMiMCGPU(encodedMatrix); ok { + tree = gpuTree + colHashes = gpuColHashes + return + } + // colHashes stores the MiMC hashes // of the columns. colHashes = p.noSisTransversalHash(encodedMatrix) @@ -94,12 +109,7 @@ func (p *Params) CommitMerkleWithoutSIS(ps []smartvectors.SmartVector) (encodedM leaves[i] = colHashes[i].Bytes() } - tree = smt.BuildComplete( - leaves, - func() hashtypes.Hasher { - return hashtypes.Hasher{Hash: mimc.NewMiMC()} - }, - ) + tree = smt.BuildCompleteMiMC(leaves) }) logrus.Infof( diff --git a/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_cuda.go b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_cuda.go new file mode 100644 index 00000000000..9fff45f898c --- /dev/null +++ b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_cuda.go @@ -0,0 +1,738 @@ +//go:build cuda + +package vortex + +/* +#cgo LDFLAGS: -L${SRCDIR}/../../../../../../gpu/cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../../../../../../gpu/cuda/include + +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "fmt" + "os" + "runtime" + "strconv" + "sync" + "unsafe" + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/mimc" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/ringsis" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/hashtypes" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/parallel" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/types" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/sirupsen/logrus" +) + +const ( + gpuSISDegree = 64 + gpuSISLogTwoBound = 16 + gpuSISLimbsPerField = 16 + gpuSISMinRows = 512 + gpuSISSplitMinRows = 256 + + gpuSISRowRegular = 0 + gpuSISRowConstant = 1 +) + +type gpuSISStaticData struct { + twiddles []field.Element + twiddlesInv []field.Element + coset []field.Element + cosetInv []field.Element + cardinalityInv []field.Element + mimcConstants []field.Element +} + +type gpuSISKeyCacheKey struct { + key *ringsis.Key + numPolys int +} + +var ( + gpuSISStaticOnce sync.Once + gpuSISStatic gpuSISStaticData + gpuSISStaticErr error + + gpuSISKeyCacheMu sync.Mutex + gpuSISKeyCache = map[gpuSISKeyCacheKey][]field.Element{} +) + +func tryCommitSISGPU(v EncodedMatrix, key *ringsis.Key) (*smt.Tree, []field.Element, bool) { + if !usePIGPUSIS() { + return nil, nil, false + } + if len(v) < piGPUSISMinRows() && !shouldAttemptSplitSISGPU(v) { + return nil, nil, false + } + tree, colHashes, err := buildSISMiMCTreeGPUSplitFromRows(v, key) + if err != nil { + logrus.WithError(err).Warn("PI Vortex GPU SIS failed; falling back to CPU") + return nil, nil, false + } + return tree, colHashes, true +} + +func tryBuildSISMiMCTreeGPU(colHashes []field.Element, chunkSize int) (*smt.Tree, bool) { + if !usePIGPUMiMC() { + return nil, false + } + tree, err := buildSISMiMCTreeGPU(colHashes, chunkSize) + if err != nil { + logrus.WithError(err).Warn("PI Vortex GPU MiMC tree failed; falling back to CPU") + return nil, false + } + return tree, true +} + +func tryCommitNoSISMiMCGPU(v []smartvectors.SmartVector) (*smt.Tree, []field.Element, bool) { + if !usePIGPUMiMC() { + return nil, nil, false + } + tree, colHashes, err := buildNoSISMiMCTreeGPU(v) + if err != nil { + logrus.WithError(err).Warn("PI Vortex GPU no-SIS MiMC failed; falling back to CPU") + return nil, nil, false + } + return tree, colHashes, true +} + +func usePIGPUMiMC() bool { + return os.Getenv("LINEA_PROVER_GPU_PI_VORTEX") == "1" || + os.Getenv("LINEA_PROVER_GPU_PI_MIMC") == "1" +} + +func usePIGPUSIS() bool { + return os.Getenv("LINEA_PROVER_GPU_PI_VORTEX") == "1" || + os.Getenv("LINEA_PROVER_GPU_PI_SIS") == "1" +} + +func piGPUSISMinRows() int { + raw := os.Getenv("LINEA_PROVER_GPU_PI_SIS_MIN_ROWS") + if raw == "" { + return gpuSISMinRows + } + v, err := strconv.Atoi(raw) + if err != nil || v < 0 { + logrus.WithField("value", raw).Warn("invalid LINEA_PROVER_GPU_PI_SIS_MIN_ROWS; using default") + return gpuSISMinRows + } + return v +} + +func piGPUSISSplitMinRows() int { + raw := os.Getenv("LINEA_PROVER_GPU_PI_SIS_SPLIT_MIN_ROWS") + if raw == "" { + return gpuSISSplitMinRows + } + v, err := strconv.Atoi(raw) + if err != nil || v < 0 { + logrus.WithField("value", raw).Warn("invalid LINEA_PROVER_GPU_PI_SIS_SPLIT_MIN_ROWS; using default") + return gpuSISSplitMinRows + } + return v +} + +func shouldAttemptSplitSISGPU(v EncodedMatrix) bool { + if len(v) < piGPUSISSplitMinRows() { + return false + } + if os.Getenv("LINEA_PROVER_GPU_PI_DISABLE_SECONDARY_DEVICE") != "" { + return false + } + if len(v) == 0 || v[0].Len() < 2 || !utils.IsPowerOfTwo(v[0].Len()) { + return false + } + _, primaryID, err := piPrimaryGPUDevice() + if err != nil { + logrus.WithError(err).Warn("PI Vortex GPU primary device unavailable for SIS split threshold") + return false + } + _, ok, err := piSecondaryGPUDeviceID(primaryID) + if err != nil { + logrus.WithError(err).Warn("PI Vortex GPU secondary device unavailable for SIS split threshold") + return false + } + return ok +} + +func buildSISMiMCTreeGPU(colHashes []field.Element, chunkSize int) (*smt.Tree, error) { + tree, _, err := buildMiMCTreeGPUFromChunks(colHashes, chunkSize) + return tree, err +} + +func buildSISMiMCTreeGPUFromRows(v EncodedMatrix, key *ringsis.Key) (*smt.Tree, []field.Element, error) { + dev, _, err := piPrimaryGPUDevice() + if err != nil { + return nil, nil, err + } + return buildSISMiMCTreeGPUFromRowsOnDevice(dev, v, key) +} + +func buildSISMiMCTreeGPUSplitFromRows(v EncodedMatrix, key *ringsis.Key) (*smt.Tree, []field.Element, error) { + if os.Getenv("LINEA_PROVER_GPU_PI_DISABLE_SECONDARY_DEVICE") != "" { + return buildSISMiMCTreeGPUFromRows(v, key) + } + primary, primaryID, err := piPrimaryGPUDevice() + if err != nil { + return nil, nil, err + } + secondaryID, ok, err := piSecondaryGPUDeviceID(primaryID) + if err != nil { + return nil, nil, err + } + if !ok { + return buildSISMiMCTreeGPUFromRowsOnDevice(primary, v, key) + } + if len(v) == 0 || v[0].Len() < 2 { + return buildSISMiMCTreeGPUFromRowsOnDevice(primary, v, key) + } + + numCols := v[0].Len() + if !utils.IsPowerOfTwo(numCols) { + return buildSISMiMCTreeGPUFromRowsOnDevice(primary, v, key) + } + split := numCols / 2 + leftRows, rightRows, err := splitSISRows(v, split) + if err != nil { + return nil, nil, err + } + + secondary := gpu.GetDeviceN(secondaryID) + if secondary == nil { + return nil, nil, fmt.Errorf("secondary GPU device %d is unavailable", secondaryID) + } + logrus.Infof( + "PI Vortex GPU SIS split across devices: primary=%d secondary=%d rows=%d cols=%d split=%d", + primaryID, secondaryID, len(v), numCols, split, + ) + + type result struct { + tree *smt.Tree + colHashes []field.Element + err error + } + leftCh := make(chan result, 1) + rightCh := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + tree, colHashes, err := buildSISMiMCTreeGPUFromRowsOnDevice(primary, leftRows, key) + leftCh <- result{tree: tree, colHashes: colHashes, err: err} + }() + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + tree, colHashes, err := buildSISMiMCTreeGPUFromRowsOnDevice(secondary, rightRows, key) + rightCh <- result{tree: tree, colHashes: colHashes, err: err} + }() + + left := <-leftCh + right := <-rightCh + if left.err != nil { + return nil, nil, fmt.Errorf("primary split SIS commitment: %w", left.err) + } + if right.err != nil { + return nil, nil, fmt.Errorf("secondary split SIS commitment: %w", right.err) + } + + tree, err := mergeSplitSISMiMCTrees(left.tree, right.tree) + if err != nil { + return nil, nil, err + } + colHashes := make([]field.Element, 0, len(left.colHashes)+len(right.colHashes)) + colHashes = append(colHashes, left.colHashes...) + colHashes = append(colHashes, right.colHashes...) + return tree, colHashes, nil +} + +func buildSISMiMCTreeGPUFromRowsOnDevice(dev *gpu.Device, v EncodedMatrix, key *ringsis.Key) (*smt.Tree, []field.Element, error) { + if len(v) == 0 { + return nil, nil, fmt.Errorf("empty matrix") + } + if key.LogTwoDegree != utils.Log2Ceil(gpuSISDegree) || key.LogTwoBound != gpuSISLogTwoBound { + return nil, nil, fmt.Errorf( + "unsupported SIS params: degree=%d logTwoBound=%d", + key.OutputSize(), + key.LogTwoBound, + ) + } + + numRows := len(v) + numCols := v[0].Len() + if numCols == 0 { + return nil, nil, fmt.Errorf("matrix has zero columns") + } + for i := range v { + if v[i].Len() != numCols { + return nil, nil, fmt.Errorf("row %d has length %d, expected %d", i, v[i].Len(), numCols) + } + } + if !utils.IsPowerOfTwo(numCols) { + return nil, nil, fmt.Errorf("numCols=%d is not a power of two", numCols) + } + if numRows > key.MaxNumFieldToHash { + return nil, nil, fmt.Errorf("numRows=%d exceeds SIS key capacity %d", numRows, key.MaxNumFieldToHash) + } + + rowPtrs := make([]uintptr, numRows) + rowKinds := make([]uint8, numRows) + rowConstants := make([]field.Element, numRows) + for i := range v { + switch vi := v[i].(type) { + case *smartvectors.Regular: + rowKinds[i] = gpuSISRowRegular + rowPtrs[i] = uintptr(unsafe.Pointer(&(*vi)[0])) + case *smartvectors.Constant: + rowKinds[i] = gpuSISRowConstant + rowConstants[i] = vi.Value + default: + return nil, nil, fmt.Errorf("unsupported smart vector row %d of type %T", i, v[i]) + } + } + + numPolys := utils.DivCeil(numRows*gpuSISLimbsPerField, gpuSISDegree) + ag, err := cachedFlattenSISKey(key, numPolys) + if err != nil { + return nil, nil, err + } + static, err := getGPUSISStaticData() + if err != nil { + return nil, nil, err + } + + if dev == nil { + return nil, nil, fmt.Errorf("GPU device is unavailable") + } + if err := dev.Bind(); err != nil { + return nil, nil, fmt.Errorf("bind GPU device %d: %w", dev.DeviceID(), err) + } + + colHashes := make([]field.Element, numCols*gpuSISDegree) + nodes := make([]field.Element, 2*numCols-1) + + errCode := C.gnark_gpu_bls12377_sis_mimc_tree( + C.gnark_gpu_context_t(dev.Handle()), + (*C.uintptr_t)(unsafe.Pointer(&rowPtrs[0])), + (*C.uint8_t)(unsafe.Pointer(&rowKinds[0])), + (*C.uint64_t)(unsafe.Pointer(&rowConstants[0])), + C.size_t(numRows), + C.size_t(numCols), + (*C.uint64_t)(unsafe.Pointer(&ag[0])), + C.size_t(numPolys), + (*C.uint64_t)(unsafe.Pointer(&static.twiddles[0])), + (*C.uint64_t)(unsafe.Pointer(&static.twiddlesInv[0])), + (*C.uint64_t)(unsafe.Pointer(&static.coset[0])), + (*C.uint64_t)(unsafe.Pointer(&static.cosetInv[0])), + (*C.uint64_t)(unsafe.Pointer(&static.cardinalityInv[0])), + (*C.uint64_t)(unsafe.Pointer(&static.mimcConstants[0])), + (*C.uint64_t)(unsafe.Pointer(&colHashes[0])), + (*C.uint64_t)(unsafe.Pointer(&nodes[0])), + ) + runtime.KeepAlive(v) + runtime.KeepAlive(rowPtrs) + runtime.KeepAlive(rowKinds) + runtime.KeepAlive(rowConstants) + runtime.KeepAlive(ag) + runtime.KeepAlive(static) + if errCode != C.GNARK_GPU_SUCCESS { + return nil, nil, fmt.Errorf("gnark_gpu_bls12377_sis_mimc_tree: %s", gpuErrorString(errCode)) + } + + return bottomUpMiMCTreeFromField(nodes, numCols), colHashes, nil +} + +func piPrimaryGPUDevice() (*gpu.Device, int, error) { + dev, deviceID, err := gpu.DeviceFromEnvOrCurrent() + if err != nil { + return nil, 0, err + } + if dev == nil { + dev = gpu.GetDevice() + if dev == nil { + return nil, 0, fmt.Errorf("GPU device is unavailable") + } + deviceID = dev.DeviceID() + if err := dev.Bind(); err != nil { + return nil, 0, fmt.Errorf("bind GPU device %d: %w", deviceID, err) + } + } + return dev, deviceID, nil +} + +func piSecondaryGPUDeviceID(primaryID int) (int, bool, error) { + raw := os.Getenv("LINEA_PROVER_GPU_PI_SECONDARY_DEVICE_ID") + if raw != "" { + id, err := strconv.Atoi(raw) + if err != nil { + return 0, false, fmt.Errorf("invalid LINEA_PROVER_GPU_PI_SECONDARY_DEVICE_ID %q: %w", raw, err) + } + if id < 0 { + return 0, false, fmt.Errorf("LINEA_PROVER_GPU_PI_SECONDARY_DEVICE_ID must be non-negative, got %d", id) + } + if id == primaryID { + return 0, false, fmt.Errorf("PI secondary device matches primary device %d", primaryID) + } + return id, true, nil + } + + n := gpu.PhysicalDeviceCount() + if n < 2 { + return 0, false, nil + } + return (primaryID + 1) % n, true, nil +} + +func splitSISRows(v EncodedMatrix, split int) (EncodedMatrix, EncodedMatrix, error) { + left := make(EncodedMatrix, len(v)) + right := make(EncodedMatrix, len(v)) + for i := range v { + if v[i].Len() <= split || split <= 0 { + return nil, nil, fmt.Errorf("invalid split %d for row %d length %d", split, i, v[i].Len()) + } + switch row := v[i].(type) { + case *smartvectors.Regular: + left[i] = smartvectors.NewRegular((*row)[:split]) + right[i] = smartvectors.NewRegular((*row)[split:]) + case *smartvectors.Constant: + left[i] = smartvectors.NewConstant(row.Value, split) + right[i] = smartvectors.NewConstant(row.Value, row.Len()-split) + default: + return nil, nil, fmt.Errorf("unsupported smart vector row %d of type %T", i, v[i]) + } + } + return left, right, nil +} + +func mergeSplitSISMiMCTrees(left, right *smt.Tree) (*smt.Tree, error) { + if left == nil || right == nil { + return nil, fmt.Errorf("cannot merge nil split SIS tree") + } + if left.Config == nil || right.Config == nil { + return nil, fmt.Errorf("cannot merge split SIS tree without config") + } + if left.Config.Depth != right.Config.Depth { + return nil, fmt.Errorf("split SIS tree depth mismatch: %d != %d", left.Config.Depth, right.Config.Depth) + } + if len(left.OccupiedLeaves) != len(right.OccupiedLeaves) { + return nil, fmt.Errorf("split SIS leaf count mismatch: %d != %d", len(left.OccupiedLeaves), len(right.OccupiedLeaves)) + } + if left.Config.Depth == 0 { + return nil, fmt.Errorf("split SIS tree depth is too small to merge") + } + + depth := left.Config.Depth + 1 + tree := smt.NewEmptyTree(&smt.Config{HashFunc: hashtypes.MiMC, Depth: depth}) + tree.OccupiedLeaves = make([]types.Bytes32, 0, len(left.OccupiedLeaves)+len(right.OccupiedLeaves)) + tree.OccupiedLeaves = append(tree.OccupiedLeaves, left.OccupiedLeaves...) + tree.OccupiedLeaves = append(tree.OccupiedLeaves, right.OccupiedLeaves...) + + tree.OccupiedNodes = make([][]types.Bytes32, depth-1) + for level := 0; level < left.Config.Depth-1; level++ { + tree.OccupiedNodes[level] = make([]types.Bytes32, 0, len(left.OccupiedNodes[level])+len(right.OccupiedNodes[level])) + tree.OccupiedNodes[level] = append(tree.OccupiedNodes[level], left.OccupiedNodes[level]...) + tree.OccupiedNodes[level] = append(tree.OccupiedNodes[level], right.OccupiedNodes[level]...) + } + tree.OccupiedNodes[depth-2] = []types.Bytes32{left.Root, right.Root} + + hasher := mimc.NewMiMC() + left.Root.WriteTo(hasher) + right.Root.WriteTo(hasher) + tree.Root = types.AsBytes32(hasher.Sum(nil)) + return tree, nil +} + +func buildNoSISMiMCTreeGPU(v []smartvectors.SmartVector) (*smt.Tree, []field.Element, error) { + if len(v) == 0 { + return nil, nil, fmt.Errorf("empty matrix") + } + + numRows := len(v) + numCols := v[0].Len() + if numCols == 0 { + return nil, nil, fmt.Errorf("matrix has zero columns") + } + for i := range v { + if v[i].Len() != numCols { + return nil, nil, fmt.Errorf("row %d has length %d, expected %d", i, v[i].Len(), numCols) + } + } + if !utils.IsPowerOfTwo(numCols) { + return nil, nil, fmt.Errorf("numCols=%d is not a power of two", numCols) + } + + columnChunks := make([]field.Element, numCols*numRows) + parallel.Execute(numCols, func(start, stop int) { + for col := start; col < stop; col++ { + dst := columnChunks[col*numRows : (col+1)*numRows] + for row := range v { + switch vi := v[row].(type) { + case *smartvectors.Constant: + dst[row] = vi.Value + case *smartvectors.Regular: + dst[row] = (*vi)[col] + default: + dst[row] = v[row].Get(col) + } + } + } + }) + + return buildMiMCTreeGPUFromChunks(columnChunks, numRows) +} + +func buildMiMCTreeGPUFromChunks(chunks []field.Element, chunkSize int) (*smt.Tree, []field.Element, error) { + if chunkSize <= 0 { + return nil, nil, fmt.Errorf("invalid chunk size %d", chunkSize) + } + if len(chunks) == 0 || len(chunks)%chunkSize != 0 { + return nil, nil, fmt.Errorf("input length %d is not a multiple of chunk size %d", len(chunks), chunkSize) + } + numLeaves := len(chunks) / chunkSize + if !utils.IsPowerOfTwo(numLeaves) { + return nil, nil, fmt.Errorf("numLeaves=%d is not a power of two", numLeaves) + } + + dev := gpu.GetDevice() + if dev == nil { + return nil, nil, fmt.Errorf("GPU device is unavailable") + } + if err := dev.Bind(); err != nil { + return nil, nil, fmt.Errorf("bind GPU device: %w", err) + } + + static, err := getGPUSISStaticData() + if err != nil { + return nil, nil, err + } + + totalNodes := 2*numLeaves - 1 + nodes := make([]field.Element, totalNodes) + + errCode := C.gnark_gpu_bls12377_mimc_sis_tree( + C.gnark_gpu_context_t(dev.Handle()), + (*C.uint64_t)(unsafe.Pointer(&chunks[0])), + C.size_t(numLeaves), + C.size_t(chunkSize), + (*C.uint64_t)(unsafe.Pointer(&static.mimcConstants[0])), + (*C.uint64_t)(unsafe.Pointer(&nodes[0])), + ) + runtime.KeepAlive(chunks) + runtime.KeepAlive(static) + if errCode != C.GNARK_GPU_SUCCESS { + return nil, nil, fmt.Errorf("gnark_gpu_bls12377_mimc_sis_tree: %s", gpuErrorString(errCode)) + } + + leaves := append([]field.Element(nil), nodes[:numLeaves]...) + return bottomUpMiMCTreeFromField(nodes, numLeaves), leaves, nil +} + +func cachedFlattenSISKey(key *ringsis.Key, numPolys int) ([]field.Element, error) { + cacheKey := gpuSISKeyCacheKey{key: key, numPolys: numPolys} + + gpuSISKeyCacheMu.Lock() + if ag, ok := gpuSISKeyCache[cacheKey]; ok { + gpuSISKeyCacheMu.Unlock() + return ag, nil + } + gpuSISKeyCacheMu.Unlock() + + ag, err := flattenSISKey(key, numPolys) + if err != nil { + return nil, err + } + + gpuSISKeyCacheMu.Lock() + if cached, ok := gpuSISKeyCache[cacheKey]; ok { + gpuSISKeyCacheMu.Unlock() + return cached, nil + } + gpuSISKeyCache[cacheKey] = ag + gpuSISKeyCacheMu.Unlock() + return ag, nil +} + +func flattenSISKey(key *ringsis.Key, numPolys int) ([]field.Element, error) { + agByPoly := key.Ag() + if numPolys > len(agByPoly) { + return nil, fmt.Errorf("numPolys=%d exceeds SIS key length %d", numPolys, len(agByPoly)) + } + ag := make([]field.Element, numPolys*gpuSISDegree) + for i := 0; i < numPolys; i++ { + if len(agByPoly[i]) != gpuSISDegree { + return nil, fmt.Errorf("SIS key polynomial %d has length %d", i, len(agByPoly[i])) + } + copy(ag[i*gpuSISDegree:(i+1)*gpuSISDegree], agByPoly[i]) + } + return ag, nil +} + +func getGPUSISStaticData() (gpuSISStaticData, error) { + gpuSISStaticOnce.Do(func() { + twiddles, twiddlesInv, coset, cosetInv, cardinalityInv, err := sisFFTTables() + if err != nil { + gpuSISStaticErr = err + return + } + constants, err := mimcConstants() + if err != nil { + gpuSISStaticErr = err + return + } + gpuSISStatic = gpuSISStaticData{ + twiddles: twiddles, + twiddlesInv: twiddlesInv, + coset: coset, + cosetInv: cosetInv, + cardinalityInv: cardinalityInv, + mimcConstants: constants, + } + }) + return gpuSISStatic, gpuSISStaticErr +} + +func sisFFTTables() ( + twiddles []field.Element, + twiddlesInv []field.Element, + coset []field.Element, + cosetInv []field.Element, + cardinalityInv []field.Element, + err error, +) { + shift, err := fr.Generator(2 * gpuSISDegree) + if err != nil { + return nil, nil, nil, nil, nil, err + } + domain := fft.NewDomain(gpuSISDegree, fft.WithShift(shift)) + + twiddlesByStage, err := domain.Twiddles() + if err != nil { + return nil, nil, nil, nil, nil, err + } + twiddlesInvByStage, err := domain.TwiddlesInv() + if err != nil { + return nil, nil, nil, nil, nil, err + } + twiddles, err = flattenSISTwiddles(twiddlesByStage) + if err != nil { + return nil, nil, nil, nil, nil, err + } + twiddlesInv, err = flattenSISTwiddles(twiddlesInvByStage) + if err != nil { + return nil, nil, nil, nil, nil, err + } + coset, err = domain.CosetTable() + if err != nil { + return nil, nil, nil, nil, nil, err + } + cosetInv, err = domain.CosetTableInv() + if err != nil { + return nil, nil, nil, nil, nil, err + } + cardinalityInv = []field.Element{domain.CardinalityInv} + return twiddles, twiddlesInv, coset, cosetInv, cardinalityInv, nil +} + +func flattenSISTwiddles(twiddlesByStage [][]field.Element) ([]field.Element, error) { + const numStages = 6 + if len(twiddlesByStage) != numStages { + return nil, fmt.Errorf("unexpected SIS twiddle stage count %d", len(twiddlesByStage)) + } + res := make([]field.Element, 0, 69) + for stage := 0; stage < numStages; stage++ { + expectedLen := 1 + (gpuSISDegree >> (stage + 1)) + if len(twiddlesByStage[stage]) != expectedLen { + return nil, fmt.Errorf( + "unexpected SIS twiddle stage %d length %d, expected %d", + stage, + len(twiddlesByStage[stage]), + expectedLen, + ) + } + res = append(res, twiddlesByStage[stage]...) + } + return res, nil +} + +func mimcConstants() ([]field.Element, error) { + bigConstants := mimc.GetConstants() + if len(bigConstants) != 62 { + return nil, fmt.Errorf("unexpected MiMC constant count: %d", len(bigConstants)) + } + constants := make([]field.Element, len(bigConstants)) + for i := range bigConstants { + constants[i].SetBigInt(&bigConstants[i]) + } + return constants, nil +} + +func bottomUpMiMCTreeFromField(nodes []field.Element, numLeaves int) *smt.Tree { + depth := utils.Log2Ceil(numLeaves) + tree := smt.NewEmptyTree(&smt.Config{HashFunc: hashtypes.MiMC, Depth: depth}) + + tree.OccupiedLeaves = make([]types.Bytes32, numLeaves) + copyFieldElementsAsBytes(tree.OccupiedLeaves, nodes[:numLeaves]) + + offset := numLeaves + if depth == 0 { + tree.Root = nodes[0].Bytes() + return tree + } + + tree.OccupiedNodes = make([][]types.Bytes32, depth-1) + for level := 1; level < depth; level++ { + levelSize := numLeaves >> level + tree.OccupiedNodes[level-1] = make([]types.Bytes32, levelSize) + copyFieldElementsAsBytes( + tree.OccupiedNodes[level-1], + nodes[offset:offset+levelSize], + ) + offset += levelSize + } + + tree.Root = nodes[len(nodes)-1].Bytes() + return tree +} + +func copyFieldElementsAsBytes(dst []types.Bytes32, src []field.Element) { + const parallelThreshold = 4096 + if len(dst) != len(src) { + utils.Panic("mismatched byte conversion lengths: dst=%d src=%d", len(dst), len(src)) + } + if len(src) < parallelThreshold { + for i := range src { + dst[i] = src[i].Bytes() + } + return + } + parallel.Execute(len(src), func(start, stop int) { + for i := start; i < stop; i++ { + dst[i] = src[i].Bytes() + } + }) +} + +func gpuErrorString(code C.gnark_gpu_error_t) string { + switch code { + case C.GNARK_GPU_ERROR_CUDA: + return "CUDA error" + case C.GNARK_GPU_ERROR_INVALID_ARG: + return "invalid argument" + case C.GNARK_GPU_ERROR_OUT_OF_MEMORY: + return "out of GPU memory" + case C.GNARK_GPU_ERROR_SIZE_MISMATCH: + return "size mismatch" + default: + return fmt.Sprintf("unknown error code %d", int(code)) + } +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_stub.go b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_stub.go new file mode 100644 index 00000000000..b3c4ee517a4 --- /dev/null +++ b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_stub.go @@ -0,0 +1,28 @@ +//go:build !cuda + +package vortex + +import ( + "fmt" + + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/ringsis" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" +) + +func tryCommitSISGPU(_ EncodedMatrix, _ *ringsis.Key) (*smt.Tree, []field.Element, bool) { + return nil, nil, false +} + +func tryBuildSISMiMCTreeGPU(_ []field.Element, _ int) (*smt.Tree, bool) { + return nil, false +} + +func tryCommitNoSISMiMCGPU(_ []smartvectors.SmartVector) (*smt.Tree, []field.Element, bool) { + return nil, nil, false +} + +func buildSISMiMCTreeGPU(_ []field.Element, _ int) (*smt.Tree, error) { + return nil, fmt.Errorf("cuda build tag required") +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_test.go b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_test.go new file mode 100644 index 00000000000..5bcf5bdaf26 --- /dev/null +++ b/prover/circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_test.go @@ -0,0 +1,309 @@ +//go:build cuda + +package vortex + +import ( + "testing" + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/mimc" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/ringsis" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/crypto/state-management/smt" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/types" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/stretchr/testify/require" +) + +func TestBuildSISMiMCTreeGPUvsCPU(t *testing.T) { + const ( + numLeaves = 1024 + chunkSize = 64 + ) + + colHashes := make([]field.Element, numLeaves*chunkSize) + for i := range colHashes { + colHashes[i].SetUint64(uint64(i*17 + 3)) + } + + gpuTree, err := buildSISMiMCTreeGPU(colHashes, chunkSize) + require.NoError(t, err) + + leaves := hashSisHashForTest(colHashes, chunkSize) + cpuTree := smt.BuildCompleteMiMC(leaves) + + require.Equal(t, cpuTree.Root, gpuTree.Root, "GPU MiMC tree root should match CPU") + require.Equal(t, cpuTree.OccupiedLeaves, gpuTree.OccupiedLeaves, "GPU MiMC leaves should match CPU") + require.Equal(t, cpuTree.OccupiedNodes, gpuTree.OccupiedNodes, "GPU MiMC internal nodes should match CPU") +} + +func TestBuildNoSISMiMCTreeGPUvsCPU(t *testing.T) { + const ( + numRows = 9 + numCols = 1024 + ) + + rows := make([]smartvectors.SmartVector, numRows) + for i := range rows { + if i%4 == 0 { + rows[i] = smartvectors.NewConstant(field.NewElement(uint64(i+11)), numCols) + continue + } + rows[i] = smartvectors.Rand(numCols) + } + + params := NewParams(1, numCols, numRows, ringsis.StdParams) + gpuTree, gpuColHashes, err := buildNoSISMiMCTreeGPU(rows) + require.NoError(t, err) + + cpuColHashes := params.noSisTransversalHash(rows) + leaves := make([]types.Bytes32, len(cpuColHashes)) + for i := range leaves { + leaves[i] = cpuColHashes[i].Bytes() + } + cpuTree := smt.BuildCompleteMiMC(leaves) + + require.Equal(t, cpuColHashes, gpuColHashes, "GPU no-SIS column hashes should match CPU") + require.Equal(t, cpuTree.Root, gpuTree.Root, "GPU no-SIS MiMC tree root should match CPU") + require.Equal(t, cpuTree.OccupiedLeaves, gpuTree.OccupiedLeaves, "GPU no-SIS MiMC leaves should match CPU") + require.Equal(t, cpuTree.OccupiedNodes, gpuTree.OccupiedNodes, "GPU no-SIS MiMC internal nodes should match CPU") +} + +func TestBuildSISMiMCTreeGPUFromRowsVsCPU(t *testing.T) { + const ( + numRows = 19 + numCols = 1024 + ) + + rows := make([]smartvectors.SmartVector, numRows) + for i := range rows { + if i%5 == 0 { + rows[i] = smartvectors.NewConstant(field.NewElement(uint64(i+7)), numCols) + continue + } + rows[i] = smartvectors.Rand(numCols) + } + + params := NewParams(1, numCols, numRows, ringsis.StdParams) + gpuTree, gpuColHashes, err := buildSISMiMCTreeGPUFromRows(EncodedMatrix(rows), params.Key) + require.NoError(t, err) + + cpuColHashes := params.Key.TransversalHash(rows) + cpuTree := smt.BuildCompleteMiMC(params.hashSisHash(cpuColHashes)) + + require.Equal(t, cpuColHashes, gpuColHashes, "GPU SIS column hashes should match CPU") + require.Equal(t, cpuTree.Root, gpuTree.Root, "GPU SIS MiMC tree root should match CPU") + require.Equal(t, cpuTree.OccupiedLeaves, gpuTree.OccupiedLeaves, "GPU SIS MiMC leaves should match CPU") + require.Equal(t, cpuTree.OccupiedNodes, gpuTree.OccupiedNodes, "GPU SIS MiMC internal nodes should match CPU") +} + +func TestMergeSplitSISMiMCTreesMatchesFullTree(t *testing.T) { + const ( + numLeaves = 1024 + chunkSize = 64 + ) + + colHashes := make([]field.Element, numLeaves*chunkSize) + for i := range colHashes { + colHashes[i].SetUint64(uint64(i*19 + 5)) + } + + fullTree, err := buildSISMiMCTreeGPU(colHashes, chunkSize) + require.NoError(t, err) + + split := len(colHashes) / 2 + leftTree, err := buildSISMiMCTreeGPU(colHashes[:split], chunkSize) + require.NoError(t, err) + rightTree, err := buildSISMiMCTreeGPU(colHashes[split:], chunkSize) + require.NoError(t, err) + + merged, err := mergeSplitSISMiMCTrees(leftTree, rightTree) + require.NoError(t, err) + require.Equal(t, fullTree.Root, merged.Root, "merged split tree root should match the full tree") + require.Equal(t, fullTree.OccupiedLeaves, merged.OccupiedLeaves, "merged leaves should match the full tree") + require.Equal(t, fullTree.OccupiedNodes, merged.OccupiedNodes, "merged internal nodes should match the full tree") +} + +func TestBuildSISMiMCTreeGPUSplitFromRowsVsCPU(t *testing.T) { + if gpu.PhysicalDeviceCount() < 2 { + t.Skip("requires two visible CUDA devices") + } + + const ( + numRows = 19 + numCols = 2048 + ) + + rows := make([]smartvectors.SmartVector, numRows) + for i := range rows { + if i%5 == 0 { + rows[i] = smartvectors.NewConstant(field.NewElement(uint64(i+7)), numCols) + continue + } + rows[i] = smartvectors.Rand(numCols) + } + + params := NewParams(1, numCols, numRows, ringsis.StdParams) + gpuTree, gpuColHashes, err := buildSISMiMCTreeGPUSplitFromRows( + EncodedMatrix(rows), + params.Key, + ) + require.NoError(t, err) + + cpuColHashes := params.Key.TransversalHash(rows) + cpuTree := smt.BuildCompleteMiMC(params.hashSisHash(cpuColHashes)) + + require.Equal(t, cpuColHashes, gpuColHashes, "split GPU SIS column hashes should match CPU") + require.Equal(t, cpuTree.Root, gpuTree.Root, "split GPU SIS MiMC tree root should match CPU") + require.Equal(t, cpuTree.OccupiedLeaves, gpuTree.OccupiedLeaves, "split GPU SIS MiMC leaves should match CPU") + require.Equal(t, cpuTree.OccupiedNodes, gpuTree.OccupiedNodes, "split GPU SIS MiMC internal nodes should match CPU") +} + +func hashSisHashForTest(colHashes []field.Element, chunkSize int) []types.Bytes32 { + numChunks := len(colHashes) / chunkSize + leaves := make([]types.Bytes32, numChunks) + for chunkID := 0; chunkID < numChunks; chunkID++ { + startChunk := chunkID * chunkSize + hasher := mimc.NewFieldHasher() + digest := hasher.SumElements(colHashes[startChunk : startChunk+chunkSize]) + leaves[chunkID] = digest.Bytes() + } + return leaves +} + +func BenchmarkSISMiMCTreeProductionCPU(b *testing.B) { + benchmarkSISMiMCTree(b, false) +} + +func BenchmarkSISMiMCTreeProductionGPU(b *testing.B) { + benchmarkSISMiMCTree(b, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPU_812Rows(b *testing.B) { + benchmarkSISMiMCTreeFromRows(b, 812, false) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPU_812RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRows(b, 812, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPU_288RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRows(b, 288, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPU_108RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRows(b, 108, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPU_1880RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRows(b, 1880, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPUSplit_1880RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRowsSplit(b, 1880, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPUSplit_812RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRowsSplit(b, 812, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPUSplit_288RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRowsSplit(b, 288, true) +} + +func BenchmarkSISMiMCTreeFromRowsProductionGPUSplit_108RegularRows(b *testing.B) { + benchmarkSISMiMCTreeFromRowsSplit(b, 108, true) +} + +func benchmarkSISMiMCTree(b *testing.B, useGPU bool) { + const ( + blowUpFactor = 2 + numColumns = 1 << 18 + maxRows = 1880 + ) + + params := NewParams(blowUpFactor, numColumns, maxRows, ringsis.StdParams) + numLeaves := params.NumEncodedCols() + chunkSize := params.Key.OutputSize() + colHashes := make([]field.Element, numLeaves*chunkSize) + for i := range colHashes { + colHashes[i].SetUint64(uint64(i*17 + 3)) + } + + b.ResetTimer() + for i := 0; i < b.N; i++ { + if useGPU { + tree, err := buildSISMiMCTreeGPU(colHashes, chunkSize) + require.NoError(b, err) + require.NotEqual(b, types.Bytes32{}, tree.Root) + continue + } + leaves := params.hashSisHash(colHashes) + tree := smt.BuildCompleteMiMC(leaves) + require.NotEqual(b, types.Bytes32{}, tree.Root) + } +} + +func benchmarkSISMiMCTreeFromRows(b *testing.B, numRows int, regularRows bool) { + const numCols = 1 << 19 + + rows := make([]smartvectors.SmartVector, numRows) + for i := range rows { + if regularRows { + row := make([]field.Element, numCols) + for j := range row { + row[j].SetUint64(uint64(i + j + 1)) + } + rows[i] = smartvectors.NewRegular(row) + continue + } + rows[i] = smartvectors.NewConstant(field.NewElement(uint64(i+1)), numCols) + } + + params := NewParams(1, numCols, numRows, ringsis.StdParams) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + tree, colHashes, err := buildSISMiMCTreeGPUFromRows( + EncodedMatrix(rows), + params.Key, + ) + require.NoError(b, err) + require.NotEqual(b, types.Bytes32{}, tree.Root) + require.Len(b, colHashes, numCols*params.Key.OutputSize()) + } +} + +func benchmarkSISMiMCTreeFromRowsSplit(b *testing.B, numRows int, regularRows bool) { + if gpu.PhysicalDeviceCount() < 2 { + b.Skip("requires two visible CUDA devices") + } + + const numCols = 1 << 19 + + rows := make([]smartvectors.SmartVector, numRows) + for i := range rows { + if regularRows { + row := make([]field.Element, numCols) + for j := range row { + row[j].SetUint64(uint64(i + j + 1)) + } + rows[i] = smartvectors.NewRegular(row) + continue + } + rows[i] = smartvectors.NewConstant(field.NewElement(uint64(i+1)), numCols) + } + + params := NewParams(1, numCols, numRows, ringsis.StdParams) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + tree, colHashes, err := buildSISMiMCTreeGPUSplitFromRows( + EncodedMatrix(rows), + params.Key, + ) + require.NoError(b, err) + require.NotEqual(b, types.Bytes32{}, tree.Root) + require.Len(b, colHashes, numCols*params.Key.OutputSize()) + } +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient.go b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient.go index 83d2833e48a..69b0d04e458 100644 --- a/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient.go +++ b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient.go @@ -5,6 +5,7 @@ import ( "runtime" "sort" "sync" + "time" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/protocol/coin" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/protocol/variables" @@ -23,8 +24,8 @@ import ( "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/arena" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/collection" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/parallel" - ppool "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/parallel/pool" "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/utils/profiling" + "github.com/sirupsen/logrus" ) const ( @@ -171,6 +172,15 @@ type computeQuotientCtx struct { maxRatio int } +// quotientCoeffEntry caches the coefficient form of a root column witness. +// Quotient evaluation reuses root columns across cosets, so computing this once +// avoids repeating the inverse FFT for every quotient share. +type quotientCoeffEntry struct { + isConst bool + constVal field.Element + coeffs []field.Element +} + // refineContext analyzes the context and the prover runtime to build a refined // context that is more efficient to use during the actual quotient computation. // In particular, it tries to simplify the expressions by doing constant @@ -313,6 +323,31 @@ func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { vArena = arena.NewVectorArena[field.Element](cctx.maxNbAllocs * ctx.DomainSize) vArenaEvaluate = arena.NewVectorArena[field.Element]((cctx.maxExprNodes * symbolic.ChunkSize()) * runtime.GOMAXPROCS(0)) ) + globalRootsMap, globalRoots := collectQuotientRoots(cctx.rootsForRatio) + tCoeffCache := time.Now() + globalCoeffCache := ctx.computeQuotientCoeffCache(run, domain, globalRoots) + timeCoeffCache := time.Since(tCoeffCache) + numNonConstRoots := 0 + for i := range globalCoeffCache { + if !globalCoeffCache[i].isConst { + numNonConstRoots++ + } + } + + tDomainCosets := time.Now() + domainCosets := make([]*fft.Domain, cctx.maxRatio) + domainCosetShifts := make([]field.Element, cctx.maxRatio) + for i := range domainCosets { + shift := computeShift(uint64(ctx.DomainSize), cctx.maxRatio, i) + domainCosetShifts[i] = shift + domainCosets[i] = fft.NewDomain(uint64(ctx.DomainSize), fft.WithCache(), fft.WithShift(shift)) + } + timeDomainCosets := time.Since(tDomainCosets) + + metadatasByRatio := make([][]symbolic.Metadata, len(cctx.aggregateExpressionsBoard)) + for j := range cctx.aggregateExpressionsBoard { + metadatasByRatio[j] = cctx.aggregateExpressionsBoard[j].ListVariableMetadata() + } // Precompute annulator inverses for all cosets chAnnulator := make(chan struct{}, 1) @@ -323,11 +358,10 @@ func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { close(chAnnulator) }() - var computedReeval sync.Map - var wgAssignments sync.WaitGroup + var totalReeval, totalInput, totalEval time.Duration for i := 0; i < cctx.maxRatio; i++ { - computedReeval.Clear() + computedReeval := make(map[ifaces.ColID]sv.SmartVector, len(globalRoots)) vArena.Reset(0) for j, ratio := range ctx.Ratios { @@ -338,33 +372,64 @@ func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { share := i * ratio / cctx.maxRatio roots := cctx.rootsForRatio[j] board := cctx.aggregateExpressionsBoard[j] - metadatas := board.ListVariableMetadata() + metadatas := metadatasByRatio[j] - shift := computeShift(uint64(ctx.DomainSize), ratio, share) - domainCoset := fft.NewDomain(uint64(ctx.DomainSize), fft.WithCache(), fft.WithShift(shift)) + domainCoset := domainCosets[i] // Reevaluate roots on coset in parallel - ppool.ExecutePoolChunky(len(roots), func(k int) { - root := roots[k] - name := root.GetColID() - - _v, found := computedReeval.Load(name) - if found && _v != nil { - return + tReeval := time.Now() + missingRoots := make([]ifaces.Column, 0, len(roots)) + for _, root := range roots { + if _, found := computedReeval[root.GetColID()]; !found { + missingRoots = append(missingRoots, root) } - // Mark as in-progress, this should be useless since we use "unique roots for ratio" - // there shouldn't be any collisions - computedReeval.Store(name, nil) + } + missingResults := make([]sv.SmartVector, len(missingRoots)) + + gpuInputs := make([][]field.Element, 0, len(missingRoots)) + gpuOutputs := make([][]field.Element, 0, len(missingRoots)) + gpuResultIndexes := make([]int, 0, len(missingRoots)) + for k, root := range missingRoots { + entry := &globalCoeffCache[globalRootsMap[root.GetColID()]] + if entry.isConst { + missingResults[k] = sv.NewConstant(entry.constVal, ctx.DomainSize) + continue + } + res := arena.Get[field.Element](vArena, ctx.DomainSize) + gpuInputs = append(gpuInputs, entry.coeffs) + gpuOutputs = append(gpuOutputs, res) + gpuResultIndexes = append(gpuResultIndexes, k) + } - v, isNatural := run.TryGetColumn(name) - if !isNatural { - v = root.GetColAssignment(run) + usedGPU := tryGPUQuotientReevalCoset(ctx.DomainSize, domainCosetShifts[i], gpuInputs, gpuOutputs) + if usedGPU { + for idx, resultIndex := range gpuResultIndexes { + missingResults[resultIndex] = sv.NewRegular(gpuOutputs[idx]) } - reevaledRoot := reevalOnCoset(v, vArena, domain, domainCoset) - computedReeval.Store(name, reevaledRoot) - }) + } else { + parallel.Execute(len(missingRoots), func(start, stop int) { + for k := start; k < stop; k++ { + entry := &globalCoeffCache[globalRootsMap[missingRoots[k].GetColID()]] + var reevaledRoot sv.SmartVector + if entry.isConst { + reevaledRoot = sv.NewConstant(entry.constVal, ctx.DomainSize) + } else { + res := arena.Get[field.Element](vArena, ctx.DomainSize) + copy(res, entry.coeffs) + domainCoset.FFT(res, fft.DIT, fft.OnCoset(), fft.WithNbTasks(2)) + reevaledRoot = sv.NewRegular(res) + } + missingResults[k] = reevaledRoot + } + }) + } + for k, root := range missingRoots { + computedReeval[root.GetColID()] = missingResults[k] + } + totalReeval += time.Since(tReeval) // Prepare evaluation inputs for the constraint expression + tInput := time.Now() var wg sync.WaitGroup evalInputs := make([]sv.SmartVector, len(metadatas)) for k := 0; k < len(metadatas); k++ { @@ -372,8 +437,7 @@ func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { case ifaces.Column: root := column.RootParents(metadata) rootName := root.GetColID() - _reevaledRoot, _ := computedReeval.Load(rootName) - reevaledRoot := _reevaledRoot.(sv.SmartVector) + reevaledRoot := computedReeval[rootName] if !metadata.IsComposite() { evalInputs[k] = reevaledRoot continue @@ -405,23 +469,30 @@ func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { } } wg.Wait() + totalInput += time.Since(tInput) // Evaluate and assign quotient share + tEval := time.Now() vArenaEvaluate.Reset(0) quotientShare := board.Evaluate(evalInputs, vArenaEvaluate) <-chAnnulator quotientShare = sv.ScalarMul(quotientShare, annulatorInv[i]) run.AssignColumn(ctx.QuotientShares[j][share].GetColID(), quotientShare) + totalEval += time.Since(tEval) } } vArena = nil vArenaEvaluate = nil - computedReeval.Clear() + globalCoeffCache = nil - wgAssignments.Wait() + logrus.Infof( + "[pi-quotient] domain=%d maxRatio=%d roots=%d nonConstRoots=%d coeffCache=%v domainCosets=%v reeval=%v inputPrep=%v evalScaleAssign=%v", + ctx.DomainSize, cctx.maxRatio, len(globalRoots), numNonConstRoots, + timeCoeffCache, timeDomainCosets, totalReeval, totalInput, totalEval, + ) if ctx.DomainSize >= GC_DOMAIN_SIZE { runtime.GC() @@ -429,34 +500,66 @@ func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { } -// reevalOnCoset takes a vector v in evaluation form on the base domain -// and returns the vector evaluated on the coset defined by (cosetRatio, cosetID) -func reevalOnCoset(v sv.SmartVector, vArena *arena.VectorArena, domain, domainCoset *fft.Domain) sv.SmartVector { - skipInverse := false - switch x := v.(type) { - case *sv.Constant: - return x - case *sv.PaddedCircularWindow: - interval := x.Interval() - if interval.IntervalLen == 1 && interval.Start() == 0 && x.PaddingVal_.IsZero() { - // It's a multiple of the first Lagrange polynomial c * (1 + x + x^2 + x^3 + ...) - // The ifft is (c) = (c/N, c/N, c/N, ...) - constTerm := field.NewElement(uint64(x.Len())) - constTerm.Inverse(&constTerm) - constTerm.Mul(&constTerm, &x.Window_[0]) - v = sv.NewConstant(constTerm, x.Len()) - skipInverse = true +func collectQuotientRoots(rootsForRatio [][]ifaces.Column) (map[ifaces.ColID]int, []ifaces.Column) { + rootMap := make(map[ifaces.ColID]int) + var roots []ifaces.Column + for _, ratioRoots := range rootsForRatio { + for _, root := range ratioRoots { + name := root.GetColID() + if _, ok := rootMap[name]; ok { + continue + } + rootMap[name] = len(roots) + roots = append(roots, root) } } - res := arena.Get[field.Element](vArena, v.Len()) - v.WriteInSlice(res) + return rootMap, roots +} - if !skipInverse { - domain.FFTInverse(res, fft.DIF, fft.WithNbTasks(2)) - } +func (ctx *QuotientCtx) computeQuotientCoeffCache( + run *wizard.ProverRuntime, + domain *fft.Domain, + roots []ifaces.Column, +) []quotientCoeffEntry { + cache := make([]quotientCoeffEntry, len(roots)) + nbIFFTTasks := max(2, min(64, runtime.GOMAXPROCS(0)/max(1, len(roots)))) + + parallel.Execute(len(roots), func(start, stop int) { + for k := start; k < stop; k++ { + root := roots[k] + name := root.GetColID() + v, isNatural := run.TryGetColumn(name) + if !isNatural { + v = root.GetColAssignment(run) + } + + if c, ok := v.(*sv.Constant); ok { + cache[k] = quotientCoeffEntry{isConst: true, constVal: c.Value} + continue + } + + coeffs := make([]field.Element, ctx.DomainSize) + skipInverse := false + if x, ok := v.(*sv.PaddedCircularWindow); ok { + interval := x.Interval() + if interval.IntervalLen == 1 && interval.Start() == 0 && x.PaddingVal_.IsZero() { + constTerm := field.NewElement(uint64(x.Len())) + constTerm.Inverse(&constTerm) + constTerm.Mul(&constTerm, &x.Window_[0]) + v = sv.NewConstant(constTerm, x.Len()) + skipInverse = true + } + } + + v.WriteInSlice(coeffs) + if !skipInverse { + domain.FFTInverse(coeffs, fft.DIF, fft.WithNbTasks(nbIFFTTasks)) + } + cache[k] = quotientCoeffEntry{coeffs: coeffs} + } + }) - domainCoset.FFT(res, fft.DIT, fft.OnCoset(), fft.WithNbTasks(2)) - return sv.NewRegular(res) + return cache } func computeShift(n uint64, cosetRatio int, cosetID int) field.Element { diff --git a/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda.go b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda.go new file mode 100644 index 00000000000..8e153317a85 --- /dev/null +++ b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda.go @@ -0,0 +1,179 @@ +//go:build cuda + +package globalcs + +import ( + "fmt" + "os" + "runtime" + "strconv" + "sync" + + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" + "github.com/consensys/linea-monorepo/prover/gpu" + gpubls12377 "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bls12377" + "github.com/sirupsen/logrus" +) + +const ( + envPIQuotientGPUReeval = "LINEA_PROVER_GPU_PI_QUOTIENT_REEVAL" + envPIQuotientGPUSecondaryDeviceID = "LINEA_PROVER_GPU_PI_QUOTIENT_SECONDARY_DEVICE_ID" + envPIQuotientGPUDisableSecondaryDevice = "LINEA_PROVER_GPU_PI_QUOTIENT_DISABLE_SECONDARY_DEVICE" + + gpuQuotientReevalMinDomain = 1 << 18 +) + +func tryGPUQuotientReevalCoset( + domainSize int, + shift field.Element, + inputs [][]field.Element, + outputs [][]field.Element, +) bool { + if os.Getenv(envPIQuotientGPUReeval) != "1" { + return false + } + if domainSize < gpuQuotientReevalMinDomain { + return false + } + if len(inputs) == 0 { + return true + } + if len(inputs) != len(outputs) { + logrus.Warn("PI quotient GPU reeval input/output length mismatch; falling back to CPU") + return false + } + devices, err := quotientReevalDevices() + if err != nil { + logrus.WithError(err).Warn("PI quotient GPU reeval device selection failed; falling back to CPU") + return false + } + if len(devices) == 0 { + return false + } + + for i := range inputs { + if len(inputs[i]) != domainSize || len(outputs[i]) != domainSize { + logrus.Warnf("PI quotient GPU reeval vector size mismatch at %d; falling back to CPU", i) + return false + } + } + + chunk := (len(inputs) + len(devices) - 1) / len(devices) + var wg sync.WaitGroup + errCh := make(chan error, len(devices)) + for deviceIndex, dev := range devices { + start := deviceIndex * chunk + stop := min(start+chunk, len(inputs)) + if start >= stop { + continue + } + wg.Add(1) + go func(dev *gpu.Device, start, stop int) { + defer wg.Done() + if err := runGPUQuotientReevalCoset(dev, domainSize, shift, inputs[start:stop], outputs[start:stop]); err != nil { + errCh <- err + } + }(dev, start, stop) + } + wg.Wait() + close(errCh) + if err := <-errCh; err != nil { + logrus.WithError(err).Warn("PI quotient GPU reeval failed; falling back to CPU") + return false + } + + logrus.Infof( + "PI quotient GPU reeval completed roots=%d domain=%d devices=%d", + len(inputs), domainSize, len(devices), + ) + return true +} + +func runGPUQuotientReevalCoset( + dev *gpu.Device, + domainSize int, + shift field.Element, + inputs [][]field.Element, + outputs [][]field.Element, +) error { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + + if err := dev.Bind(); err != nil { + return fmt.Errorf("bind GPU device %d: %w", dev.DeviceID(), err) + } + domain, err := gpubls12377.NewFFTDomain(dev, domainSize) + if err != nil { + return fmt.Errorf("create GPU FFT domain on device %d: %w", dev.DeviceID(), err) + } + defer domain.Close() + + vec, err := gpubls12377.NewFrVector(dev, domainSize) + if err != nil { + return fmt.Errorf("allocate quotient reeval vector on device %d: %w", dev.DeviceID(), err) + } + defer vec.Free() + + for i := range inputs { + vec.CopyFromHost(field.Vector(inputs[i])) + domain.BitReverse(vec) + domain.CosetFFT(vec, shift) + vec.CopyToHost(field.Vector(outputs[i])) + } + if err := dev.Sync(); err != nil { + return fmt.Errorf("sync GPU device %d: %w", dev.DeviceID(), err) + } + return nil +} + +func quotientReevalDevices() ([]*gpu.Device, error) { + primary, primaryID, err := gpu.DeviceFromEnvOrCurrent() + if err != nil { + return nil, err + } + if primary == nil { + primary = gpu.GetDevice() + if primary == nil { + return nil, nil + } + primaryID = primary.DeviceID() + } + + devices := []*gpu.Device{primary} + if os.Getenv(envPIQuotientGPUDisableSecondaryDevice) != "" { + return devices, nil + } + + secondaryID, ok, err := quotientReevalSecondaryDeviceID(primaryID) + if err != nil || !ok { + return devices, err + } + secondary := gpu.GetDeviceN(secondaryID) + if secondary == nil { + return devices, fmt.Errorf("secondary GPU device %d is unavailable", secondaryID) + } + return append(devices, secondary), nil +} + +func quotientReevalSecondaryDeviceID(primaryID int) (int, bool, error) { + raw := os.Getenv(envPIQuotientGPUSecondaryDeviceID) + if raw != "" { + id, err := strconv.Atoi(raw) + if err != nil { + return 0, false, fmt.Errorf("invalid %s %q: %w", envPIQuotientGPUSecondaryDeviceID, raw, err) + } + if id < 0 { + return 0, false, fmt.Errorf("%s must be non-negative, got %d", envPIQuotientGPUSecondaryDeviceID, id) + } + if id == primaryID { + return 0, false, fmt.Errorf("PI quotient secondary device matches primary device %d", primaryID) + } + return id, true, nil + } + + n := gpu.PhysicalDeviceCount() + if n < 2 { + return 0, false, nil + } + return (primaryID + 1) % n, true, nil +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda_test.go b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda_test.go new file mode 100644 index 00000000000..cafac1e1047 --- /dev/null +++ b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_cuda_test.go @@ -0,0 +1,50 @@ +//go:build cuda + +package globalcs + +import ( + "testing" + + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft" + "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/stretchr/testify/require" +) + +func TestGPUQuotientReevalCosetMatchesCPU(t *testing.T) { + if gpu.PhysicalDeviceCount() == 0 { + t.Skip("requires a visible CUDA device") + } + t.Setenv(envPIQuotientGPUReeval, "1") + + const ( + domainSize = gpuQuotientReevalMinDomain + numRoots = 5 + ) + shift := computeShift(domainSize, 4, 1) + inputs := make([][]field.Element, numRoots) + outputs := make([][]field.Element, numRoots) + expected := make([][]field.Element, numRoots) + + for i := range inputs { + inputs[i] = make([]field.Element, domainSize) + outputs[i] = make([]field.Element, domainSize) + expected[i] = make([]field.Element, domainSize) + for j := range inputs[i] { + inputs[i][j].SetUint64(uint64(17 + i*domainSize + j*3)) + } + copy(expected[i], inputs[i]) + } + + require.True( + t, + tryGPUQuotientReevalCoset(domainSize, shift, inputs, outputs), + "GPU quotient reevaluation should run when enabled", + ) + + cpuDomain := fft.NewDomain(domainSize, fft.WithCache(), fft.WithShift(shift)) + for i := range expected { + cpuDomain.FFT(expected[i], fft.DIT, fft.OnCoset(), fft.WithNbTasks(2)) + require.Equal(t, expected[i], outputs[i], "GPU coset reevaluation should match CPU for root %d", i) + } +} diff --git a/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_stub.go b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_stub.go new file mode 100644 index 00000000000..b2bb8c31025 --- /dev/null +++ b/prover/circuits/pi-interconnection/keccak/prover/protocol/compiler/globalcs/quotient_gpu_stub.go @@ -0,0 +1,14 @@ +//go:build !cuda + +package globalcs + +import "github.com/consensys/linea-monorepo/prover/circuits/pi-interconnection/keccak/prover/maths/field" + +func tryGPUQuotientReevalCoset( + _ int, + _ field.Element, + _ [][]field.Element, + _ [][]field.Element, +) bool { + return false +} diff --git a/prover/circuits/prove.go b/prover/circuits/prove.go index 34a488d99e2..e46747da2ce 100644 --- a/prover/circuits/prove.go +++ b/prover/circuits/prove.go @@ -16,10 +16,13 @@ import ( "github.com/sirupsen/logrus" plonk_bn254 "github.com/consensys/gnark/backend/plonk/bn254" + "github.com/consensys/linea-monorepo/prover/gpu" + gpuplonk2 "github.com/consensys/linea-monorepo/prover/gpu/plonk2" ) type proveCheckSettings struct { cachedProofPath string + useGPU bool } type ProveCheckOption func(*proveCheckSettings) @@ -30,10 +33,47 @@ func WithCachedProof(path string) ProveCheckOption { } } +// WithGPU enables (or explicitly disables) the gpu/plonk2 prover for this +// ProveCheck call. When enabled but no GPU device is reachable (e.g. CPU build +// or device init failed), ProveCheck returns an error rather than silently +// falling back to CPU — callers gate this option on gpu.HasDevice() (for +// compression) or gpu.IsAggregationEnabled() (for aggregation phases) so the +// fallback decision is made once at the call site, not buried here. +func WithGPU(enabled bool) ProveCheckOption { + return func(s *proveCheckSettings) { + s.useGPU = enabled + } +} + // Generates a PlonkProof and sanity-checks it against the verifying key. Can // take a list of options which can of either backend.ProverOption of backend. // VerifierOption. func ProveCheck(setup *Setup, assignment frontend.Circuit, opts ...any) (plonk.Proof, error) { + proof, witness, verifierOpts, err := proveNoCheck(setup, assignment, opts...) + if err != nil { + return nil, err + } + logrus.Infof("Sanity-checking the proof") + // Sanity-check : the proof must pass + pubwitness, err := witness.Public() + if err != nil { + panic(err) + } + + err = plonk.Verify(proof, setup.VerifyingKey, pubwitness, verifierOpts...) + if err != nil { + panic(err) + } + // logrus.Infof("the proof passed with\nproof=%++v\nwit=%++v\nvkey=%++v\n", proof, pubwitness, pp.VK) + + return proof, nil +} + +func proveNoCheck( + setup *Setup, + assignment frontend.Circuit, + opts ...any, +) (plonk.Proof, witness.Witness, []backend.VerifierOption, error) { proverOpts := []backend.ProverOption{} verifierOpts := []backend.VerifierOption{} @@ -55,7 +95,7 @@ func ProveCheck(setup *Setup, assignment frontend.Circuit, opts ...any) (plonk.P case ProveCheckOption: o(&settings) default: - return nil, fmt.Errorf("unknown option type to prove-check: %++v", o) + return nil, nil, nil, fmt.Errorf("unknown option type to prove-check: %++v", o) } } @@ -64,21 +104,12 @@ func ProveCheck(setup *Setup, assignment frontend.Circuit, opts ...any) (plonk.P logrus.Infof("Creating the witness") witness, err := frontend.NewWitness(assignment, setup.Circuit.Field()) if err != nil { - return nil, fmt.Errorf("while generating the gnark witness: %w", err) + return nil, nil, nil, fmt.Errorf("while generating the gnark witness: %w", err) } logrus.Infof("Generating the proof") var proof plonk.Proof - - if settings.cachedProofPath != "" { - proof = tryReadCachedProof(*setup, settings.cachedProofPath, verifierOpts, witness) - if proof != nil { - return proof, nil - } - } - - proof, err = plonk.Prove(setup.Circuit, setup.ProvingKey, witness, proverOpts...) - if err != nil { + proveErr := func(err error) error { // The error returned by the Plonk prover is usually not helpful at // all. So, in order to get more details, we run the "test" Solver. logrus.Errorf("plonk.Prove returned an error, using the test.IsSolved to get more details: %s", err.Error()) @@ -89,29 +120,65 @@ func ProveCheck(setup *Setup, assignment frontend.Circuit, opts ...any) (plonk.P // this test engine prover option was no-op before and it was removed // test.WithBackendProverOptions(proverOpts...), ) - return nil, fmt.Errorf("while running the plonk prover: %w", errDetail) + return fmt.Errorf("while running the plonk prover: %w", errDetail) } - logrus.Infof("Sanity-checking the proof") - // Sanity-check : the proof must pass - { - pubwitness, err := witness.Public() - if err != nil { - panic(err) + if settings.cachedProofPath != "" { + proof = tryReadCachedProof(*setup, settings.cachedProofPath, verifierOpts, witness) + if proof != nil { + return proof, witness, verifierOpts, nil } + } - err = plonk.Verify(proof, setup.VerifyingKey, pubwitness, verifierOpts...) + if settings.useGPU { + if !gpu.Enabled { + return nil, nil, nil, errors.New("circuits.WithGPU: binary not built with the cuda tag") + } + dev, deviceID, err := gpu.DeviceFromEnvOrCurrent() if err != nil { - panic(err) + return nil, nil, nil, err + } + if dev == nil { + return nil, nil, nil, errors.New("circuits.WithGPU: no GPU device is available") + } + logrus.Infof( + "Generating the proof with gpu/plonk2 on GPU device %d: circuit=%T provingKey=%T verifyingKey=%T", + deviceID, + setup.Circuit, + setup.ProvingKey, + setup.VerifyingKey, + ) + var gpuProver *gpuplonk2.Prover + gpuProver, err = gpuplonk2.NewProver( + dev, + setup.Circuit, + setup.ProvingKey, + setup.VerifyingKey, + gpuplonk2.WithEnabled(true), + gpuplonk2.WithStrictMode(true), + ) + if err != nil { + return nil, nil, nil, fmt.Errorf("while creating the gpu/plonk2 prover: %w", err) + } + defer gpuProver.Close() + gpuProof, err := gpuProver.Prove(witness, proverOpts...) + if err != nil { + return nil, nil, nil, proveErr(err) + } + proof = gpuProof + } else { + proof, err = plonk.Prove(setup.Circuit, setup.ProvingKey, witness, proverOpts...) + if err != nil { + return nil, nil, nil, proveErr(err) } - // logrus.Infof("the proof passed with\nproof=%++v\nwit=%++v\nvkey=%++v\n", proof, pubwitness, pp.VK) } + logrus.Infof("Generated proof type %T", proof) if settings.cachedProofPath != "" { tryCacheProof(settings.cachedProofPath, proof) } - return proof, nil + return proof, witness, verifierOpts, nil } // Serializes the proof in an 0x prefixed hexstring diff --git a/prover/circuits/setup.go b/prover/circuits/setup.go index c5741e6436e..4f88cbb769a 100644 --- a/prover/circuits/setup.go +++ b/prover/circuits/setup.go @@ -12,9 +12,11 @@ import ( "os" "path/filepath" "runtime" + "time" "github.com/consensys/gnark" "github.com/consensys/gnark-crypto/ecc" + "github.com/consensys/gnark-crypto/kzg" "github.com/consensys/gnark/backend/plonk" plonk_bls12377 "github.com/consensys/gnark/backend/plonk/bls12-377" plonk_bn254 "github.com/consensys/gnark/backend/plonk/bn254" @@ -146,11 +148,35 @@ func (s *Setup) WriteTo(rootDir string) error { return nil } -func LoadSetup(cfg *config.Config, circuitID CircuitID) (Setup, error) { +// LoadSetupOption tweaks how LoadSetup reads the setup directory from disk. +type LoadSetupOption func(*loadSetupConfig) + +type loadSetupConfig struct { + canonicalSRSOnly bool +} + +// WithoutLagrangeSRS skips reading the Lagrange-form SRS from disk and only +// loads the canonical SRS. The gpu/plonk2 prover does not need the Lagrange +// SRS, so callers that have already chosen the GPU path can pass this option +// to shave the Lagrange-SRS read (~17s for the 126M-constraint compression +// circuit) off setup-load time. +func WithoutLagrangeSRS() LoadSetupOption { + return func(c *loadSetupConfig) { + c.canonicalSRSOnly = true + } +} + +func LoadSetup(cfg *config.Config, circuitID CircuitID, opts ...LoadSetupOption) (Setup, error) { + + var setupCfg loadSetupConfig + for _, o := range opts { + o(&setupCfg) + } gnarkutil.RegisterHintsAndGkrGates() runtime.GC() + start := time.Now() rootDir := cfg.PathForSetup(string(circuitID)) manifestPath := filepath.Join(rootDir, config.ManifestFileName) @@ -190,9 +216,19 @@ func LoadSetup(cfg *config.Config, circuitID CircuitID) (Setup, error) { if err != nil { return Setup{}, fmt.Errorf("creating SRS provider: %w", err) } - srsCanonical, srsLagrange, err := srsProvider.GetSRS(context.Background(), circuit) - if err != nil { - return Setup{}, fmt.Errorf("fetching SRS: %w", err) + var srsCanonical kzg.SRS + var srsLagrange kzg.SRS + if setupCfg.canonicalSRSOnly { + logrus.Infof("loading canonical SRS only (Lagrange SRS skipped — caller is using the GPU plonk2 prover)") + srsCanonical, err = srsProvider.GetCanonicalSRS(context.Background(), circuit) + if err != nil { + return Setup{}, fmt.Errorf("fetching canonical SRS: %w", err) + } + } else { + srsCanonical, srsLagrange, err = srsProvider.GetSRS(context.Background(), circuit) + if err != nil { + return Setup{}, fmt.Errorf("fetching SRS: %w", err) + } } pk := plonk.NewProvingKey(curveID) var kzgVkFromVk, kzgVkFromSrs io.WriterTo @@ -201,21 +237,27 @@ func LoadSetup(cfg *config.Config, circuitID CircuitID) (Setup, error) { pk.Vk = vk.(*plonk_bn254.VerifyingKey) srsC := srsCanonical.(*kzg254.SRS) pk.Kzg = srsC.Pk - pk.KzgLagrange = srsLagrange.(*kzg254.SRS).Pk + if srsLagrange != nil { + pk.KzgLagrange = srsLagrange.(*kzg254.SRS).Pk + } kzgVkFromVk = &pk.Vk.Kzg kzgVkFromSrs = &srsC.Vk case *plonk_bls12377.ProvingKey: pk.Vk = vk.(*plonk_bls12377.VerifyingKey) srsC := srsCanonical.(*kzg377.SRS) pk.Kzg = srsC.Pk - pk.KzgLagrange = srsLagrange.(*kzg377.SRS).Pk + if srsLagrange != nil { + pk.KzgLagrange = srsLagrange.(*kzg377.SRS).Pk + } kzgVkFromVk = &pk.Vk.Kzg kzgVkFromSrs = &srsC.Vk case *plonk_bw6761.ProvingKey: pk.Vk = vk.(*plonk_bw6761.VerifyingKey) srsC := srsCanonical.(*kzgbw6.SRS) pk.Kzg = srsC.Pk - pk.KzgLagrange = srsLagrange.(*kzgbw6.SRS).Pk + if srsLagrange != nil { + pk.KzgLagrange = srsLagrange.(*kzgbw6.SRS).Pk + } kzgVkFromVk = &pk.Vk.Kzg kzgVkFromSrs = &srsC.Vk default: @@ -226,6 +268,14 @@ func LoadSetup(cfg *config.Config, circuitID CircuitID) (Setup, error) { return Setup{}, fmt.Errorf("verifying key <> SRS mismatch: %w", err) } + logrus.Infof( + "loaded setup circuitID=%s curve=%s nbConstraints=%d duration=%s", + circuitID, + manifest.CurveID, + manifest.NbConstraints, + time.Since(start), + ) + return Setup{ Manifest: *manifest, Circuit: circuit, diff --git a/prover/circuits/srs_store.go b/prover/circuits/srs_store.go index 590d937f0c5..3fe642985a0 100644 --- a/prover/circuits/srs_store.go +++ b/prover/circuits/srs_store.go @@ -102,6 +102,40 @@ func (store *SRSStore) GetSRS(ctx context.Context, ccs constraint.ConstraintSyst sizeCanonical, sizeLagrange := plonk.SRSSize(ccs) curveID := fieldToCurve(ccs.Field()) + canonicalSRS, err := store.getCanonicalSRS(ctx, curveID, sizeCanonical) + if err != nil { + return nil, nil, err + } + + lagrangeSRS, err := store.getLagrangeSRS(ctx, curveID, sizeLagrange) + if err != nil { + return nil, nil, err + } + + if lagrangeSRS == nil { + // we can compute it from the canonical one. + if sizeCanonical < sizeLagrange { + panic("canonical SRS is smaller than lagrange SRS") + } + logrus.Debugf("computing lagrange SRS from canonical SRS %d -> %d\n", sizeCanonical, sizeLagrange) + lagrangeSRS, err = toLagrange(canonicalSRS, sizeLagrange) + if err != nil { + return nil, nil, err + } + } + + return canonicalSRS, lagrangeSRS, nil +} + +func (store *SRSStore) GetCanonicalSRS(ctx context.Context, ccs constraint.ConstraintSystem) (kzg.SRS, error) { + sizeCanonical, _ := plonk.SRSSize(ccs) + curveID := fieldToCurve(ccs.Field()) + return store.getCanonicalSRS(ctx, curveID, sizeCanonical) +} + +func (store *SRSStore) getCanonicalSRS(ctx context.Context, curveID ecc.ID, sizeCanonical int) (kzg.SRS, error) { + _ = ctx + // find the canonical srs var canonicalSRS kzg.SRS for _, entry := range store.entries[curveID] { @@ -109,19 +143,25 @@ func (store *SRSStore) GetSRS(ctx context.Context, ccs constraint.ConstraintSyst canonicalSRS = kzg.NewSRS(curveID) data, err := os.ReadFile(entry.path) if err != nil { - return nil, nil, err + return nil, err } if err := canonicalSRS.ReadDump(bytes.NewReader(data), sizeCanonical); err != nil { - return nil, nil, err + return nil, err } break } } if canonicalSRS == nil { - return nil, nil, fmt.Errorf("could not find canonical SRS for curve %s and size %d", curveID, sizeCanonical) + return nil, fmt.Errorf("could not find canonical SRS for curve %s and size %d", curveID, sizeCanonical) } + return canonicalSRS, nil +} + +func (store *SRSStore) getLagrangeSRS(ctx context.Context, curveID ecc.ID, sizeLagrange int) (kzg.SRS, error) { + _ = ctx + // find the lagrange srs var lagrangeSRS kzg.SRS for _, entry := range store.entries[curveID] { @@ -129,29 +169,16 @@ func (store *SRSStore) GetSRS(ctx context.Context, ccs constraint.ConstraintSyst lagrangeSRS = kzg.NewSRS(curveID) data, err := os.ReadFile(entry.path) if err != nil { - return nil, nil, err + return nil, err } if err := lagrangeSRS.ReadDump(bytes.NewReader(data)); err != nil { - return nil, nil, err + return nil, err } break } } - if lagrangeSRS == nil { - // we can compute it from the canonical one. - if sizeCanonical < sizeLagrange { - panic("canonical SRS is smaller than lagrange SRS") - } - logrus.Debugf("computing lagrange SRS from canonical SRS %d -> %d\n", sizeCanonical, sizeLagrange) - var err error - lagrangeSRS, err = toLagrange(canonicalSRS, sizeLagrange) - if err != nil { - return nil, nil, err - } - } - - return canonicalSRS, lagrangeSRS, nil + return lagrangeSRS, nil } func toLagrange(srs kzg.SRS, sizeLagrange int) (kzg.SRS, error) { diff --git a/prover/cmd/controller/controller/fs_watcher.go b/prover/cmd/controller/controller/fs_watcher.go index b3b6054c020..c31c6f0ff5c 100644 --- a/prover/cmd/controller/controller/fs_watcher.go +++ b/prover/cmd/controller/controller/fs_watcher.go @@ -11,6 +11,7 @@ import ( "github.com/consensys/linea-monorepo/prover/cmd/controller/controller/metrics" "github.com/consensys/linea-monorepo/prover/config" + "github.com/consensys/linea-monorepo/prover/gpu" "github.com/sirupsen/logrus" "golang.org/x/exp/slices" ) @@ -35,7 +36,17 @@ func NewFsWatcher(conf *config.Config) *FsWatcher { Logger: conf.Logger().WithField("component", "filesystem-watcher"), } - if conf.Controller.EnableExecution { + // On a GPU host the prover only has GPU acceleration for the compression + // proof; running execution / aggregation / invalidity here would fall + // back to CPU and be much slower than scheduling them onto the dedicated + // CPU pool. Force the controller to only accept compression jobs in that + // case, even if the operator left the other Enable* flags on. + gpuOnlyCompression := gpu.HasDevice() + if gpuOnlyCompression { + fs.Logger.Infof("GPU detected — restricting job types to compression (data-availability) only") + } + + if conf.Controller.EnableExecution && !gpuOnlyCompression { fs.JobToWatch = append(fs.JobToWatch, ExecutionDefinition(conf)) } @@ -43,11 +54,11 @@ func NewFsWatcher(conf *config.Config) *FsWatcher { fs.JobToWatch = append(fs.JobToWatch, CompressionDefinition(conf)) } - if conf.Controller.EnableAggregation { + if conf.Controller.EnableAggregation && !gpuOnlyCompression { fs.JobToWatch = append(fs.JobToWatch, AggregatedDefinition(conf)) } - if conf.Controller.EnableInvalidity { + if conf.Controller.EnableInvalidity && !gpuOnlyCompression { fs.JobToWatch = append(fs.JobToWatch, InvalidityDefinition(conf)) } diff --git a/prover/cmd/prover/cmd/prove.go b/prover/cmd/prover/cmd/prove.go index 5db31dd2010..5233056fcbd 100644 --- a/prover/cmd/prover/cmd/prove.go +++ b/prover/cmd/prover/cmd/prove.go @@ -16,7 +16,9 @@ import ( invalidityLimitless "github.com/consensys/linea-monorepo/prover/backend/invalidity/limitless" invalidityCir "github.com/consensys/linea-monorepo/prover/circuits/invalidity" "github.com/consensys/linea-monorepo/prover/config" + "github.com/consensys/linea-monorepo/prover/gpu" "github.com/consensys/linea-monorepo/prover/utils/signal" + "github.com/sirupsen/logrus" ) type ProverArgs struct { @@ -44,6 +46,32 @@ func Prove(args ProverArgs) error { return fmt.Errorf("%s failed to read config file at %v: %w", cmdName, args.ConfigFile, err) } + gpuDeviceID, gpuConfigured, cleanupGPU, err := gpu.PinConfiguredDevice() + if err != nil { + return err + } + defer cleanupGPU() + if gpuConfigured { + logrus.Infof("pinned prover process to GPU device %d via %s", gpuDeviceID, gpu.EnvDeviceID) + } + + // When the operator opts into the aggregation GPU path, propagate the + // master flag to the keccak-vendored sub-flags that the public-input + // wizard reads (PI Vortex GPU MiMC, ring-SIS, quotient reevaluation). + // Operators can still pin individual sub-flags to "0" before launching + // the binary to disable a sub-path for triage. + if gpu.IsAggregationEnabled() { + for _, name := range []string{ + "LINEA_PROVER_GPU_PI_MIMC", + "LINEA_PROVER_GPU_PI_SIS", + "LINEA_PROVER_GPU_PI_QUOTIENT_REEVAL", + } { + if os.Getenv(name) == "" { + _ = os.Setenv(name, "1") + } + } + } + // Determine job type from input file name var ( jobExecution = strings.Contains(args.Input, "getZkProof") diff --git a/prover/config/config-mainnet-limitless.toml b/prover/config/config-mainnet-limitless.toml index 91dd81b2dcd..6d4f6d1c4d6 100644 --- a/prover/config/config-mainnet-limitless.toml +++ b/prover/config/config-mainnet-limitless.toml @@ -1,5 +1,5 @@ environment = "mainnet" -version = "7.0.1" # TODO @gbotrel hunt all version definitions. +version = "7.1.0" # TODO @gbotrel hunt all version definitions. assets_dir = "./prover-assets" log_level = 4 # TODO @gbotrel will be refactored with new logger. @@ -10,8 +10,8 @@ termination_grace_period_seconds = 1800 [execution] prover_mode = "limitless" -conflated_traces_dir = "./" -requests_root_dir = "./" +conflated_traces_dir = "/home/ubuntu/testdata" +requests_root_dir = "/home/ubuntu/testdata/execution" limitless_with_debug = false ignore_compatibility_check = false keep_traces_until_block = 0 diff --git a/prover/go.mod b/prover/go.mod index 8118225f2b3..509537326f5 100644 --- a/prover/go.mod +++ b/prover/go.mod @@ -5,96 +5,93 @@ go 1.25.7 require ( github.com/bits-and-blooms/bitset v1.24.4 github.com/consensys/compress v0.3.0 - github.com/consensys/gnark v0.14.1-0.20260505192735-3460cedcac43 - github.com/consensys/gnark-crypto v0.20.2-0.20260402204920-39238e584b99 - github.com/consensys/go-corset v1.2.10 + github.com/consensys/gnark v0.14.1-0.20260508134514-a9bb4257c480 + github.com/consensys/gnark-crypto v0.20.2-0.20260504203407-0dce6009ca13 + github.com/consensys/go-corset v1.2.14 github.com/crate-crypto/go-kzg-4844 v1.1.0 - github.com/dlclark/regexp2 v1.11.2 - github.com/fxamacker/cbor/v2 v2.9.0 + github.com/dlclark/regexp2 v1.12.0 + github.com/fxamacker/cbor/v2 v2.9.2 github.com/go-playground/assert/v2 v2.2.0 - github.com/go-playground/validator/v10 v10.28.0 + github.com/go-playground/validator/v10 v10.30.2 github.com/google/uuid v1.6.0 github.com/holiman/uint256 v1.3.2 github.com/icza/bitio v1.1.0 github.com/leanovate/gopter v0.2.11 - github.com/pierrec/lz4/v4 v4.1.22 + github.com/pierrec/lz4/v4 v4.1.26 github.com/pkg/errors v0.9.1 - github.com/prometheus/client_golang v1.19.1 - github.com/rs/zerolog v1.34.0 + github.com/prometheus/client_golang v1.23.2 + github.com/rs/zerolog v1.35.1 github.com/shirou/gopsutil v3.21.11+incompatible - github.com/sirupsen/logrus v1.9.3 + github.com/sirupsen/logrus v1.9.4 github.com/spf13/cobra v1.10.2 - github.com/spf13/viper v1.19.0 + github.com/spf13/viper v1.21.0 github.com/stretchr/testify v1.11.1 - golang.org/x/crypto v0.48.0 - golang.org/x/net v0.49.0 + golang.org/x/crypto v0.50.0 + golang.org/x/net v0.53.0 golang.org/x/sync v0.20.0 - golang.org/x/time v0.9.0 + golang.org/x/time v0.15.0 ) require ( - github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6 // indirect - github.com/VictoriaMetrics/fastcache v1.13.0 // indirect + github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20260416073033-7c2071eaa8d4 // indirect + github.com/VictoriaMetrics/fastcache v1.13.3 // indirect github.com/beorn7/perks v1.0.1 // indirect github.com/blang/semver/v4 v4.0.0 // indirect github.com/cespare/xxhash/v2 v2.3.0 // indirect - github.com/crate-crypto/go-eth-kzg v1.4.0 // indirect - github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 // indirect - github.com/emicklei/dot v1.6.2 // indirect - github.com/ethereum/c-kzg-4844/v2 v2.1.5 // indirect + github.com/crate-crypto/go-eth-kzg v1.5.0 // indirect + github.com/decred/dcrd/dcrec/secp256k1/v4 v4.4.1 // indirect + github.com/emicklei/dot v1.11.0 // indirect + github.com/ethereum/c-kzg-4844/v2 v2.1.7 // indirect github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab // indirect - github.com/felixge/fgprof v0.9.4 // indirect - github.com/ferranbt/fastssz v0.1.4 // indirect - github.com/fsnotify/fsnotify v1.7.0 // indirect - github.com/gabriel-vasile/mimetype v1.4.10 // indirect + github.com/felixge/fgprof v0.9.5 // indirect + github.com/ferranbt/fastssz v1.0.0 // indirect + github.com/fsnotify/fsnotify v1.10.1 // indirect + github.com/gabriel-vasile/mimetype v1.4.13 // indirect github.com/go-ole/go-ole v1.3.0 // indirect github.com/go-playground/locales v0.14.1 // indirect github.com/go-playground/universal-translator v0.18.1 // indirect - github.com/gofrs/flock v0.12.1 // indirect + github.com/go-viper/mapstructure/v2 v2.5.0 // indirect + github.com/gofrs/flock v0.13.0 // indirect github.com/golang/snappy v1.0.0 // indirect - github.com/google/pprof v0.0.0-20260202012954-cb029daf43ef // indirect - github.com/hashicorp/hcl v1.0.0 // indirect + github.com/google/pprof v0.0.0-20260507013755-92041b743c96 // indirect github.com/holiman/bloomfilter/v2 v2.0.3 // indirect github.com/inconshreveable/mousetrap v1.1.0 // indirect - github.com/klauspost/cpuid/v2 v2.0.9 // indirect + github.com/klauspost/cpuid/v2 v2.3.0 // indirect github.com/leodido/go-urn v1.4.0 // indirect - github.com/magiconair/properties v1.8.7 // indirect github.com/mattn/go-colorable v0.1.14 // indirect - github.com/mattn/go-isatty v0.0.20 // indirect - github.com/minio/sha256-simd v1.0.0 // indirect + github.com/mattn/go-isatty v0.0.22 // indirect + github.com/minio/sha256-simd v1.0.1 // indirect github.com/mitchellh/mapstructure v1.5.0 // indirect github.com/munnerz/goautoneg v0.0.0-20191010083416-a7dc8b61c822 // indirect - github.com/pelletier/go-toml/v2 v2.2.2 // indirect - github.com/prometheus/client_model v0.6.1 // indirect - github.com/prometheus/common v0.55.0 // indirect - github.com/prometheus/procfs v0.15.1 // indirect + github.com/pelletier/go-toml/v2 v2.3.1 // indirect + github.com/prometheus/client_model v0.6.2 // indirect + github.com/prometheus/common v0.67.5 // indirect + github.com/prometheus/procfs v0.20.1 // indirect github.com/ronanh/intcomp v1.1.1 // indirect - github.com/sagikazarmark/locafero v0.6.0 // indirect - github.com/sagikazarmark/slog-shim v0.1.0 // indirect - github.com/sourcegraph/conc v0.3.0 // indirect - github.com/spf13/afero v1.11.0 // indirect - github.com/spf13/cast v1.6.0 // indirect - github.com/spf13/pflag v1.0.9 // indirect + github.com/sagikazarmark/locafero v0.12.0 // indirect + github.com/spf13/afero v1.15.0 // indirect + github.com/spf13/cast v1.10.0 // indirect + github.com/spf13/pflag v1.0.10 // indirect github.com/subosito/gotenv v1.6.0 // indirect - github.com/supranational/blst v0.3.16-0.20250831170142-f48500c1fdbe // indirect - github.com/tklauser/go-sysconf v0.3.14 // indirect - github.com/tklauser/numcpus v0.8.0 // indirect + github.com/supranational/blst v0.3.16 // indirect + github.com/tklauser/go-sysconf v0.3.16 // indirect + github.com/tklauser/numcpus v0.11.0 // indirect github.com/x448/float16 v0.8.4 // indirect github.com/yusufpapurcu/wmi v1.2.4 // indirect - go.uber.org/multierr v1.11.0 // indirect - golang.org/x/text v0.34.0 // indirect + go.yaml.in/yaml/v2 v2.4.4 // indirect + go.yaml.in/yaml/v3 v3.0.4 // indirect + golang.org/x/text v0.36.0 // indirect google.golang.org/protobuf v1.36.11 // indirect - gopkg.in/ini.v1 v1.67.0 // indirect gopkg.in/yaml.v2 v2.4.0 // indirect ) require ( github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc // indirect - github.com/ethereum/go-ethereum v1.17.0 - github.com/klauspost/compress v1.18.3 + github.com/ethereum/go-ethereum v1.17.2 + github.com/klauspost/compress v1.18.6 github.com/pkg/profile v1.7.0 github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 // indirect - golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 - golang.org/x/sys v0.42.0 // indirect + golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f + golang.org/x/sys v0.44.0 gopkg.in/yaml.v3 v3.0.1 // indirect ) diff --git a/prover/go.sum b/prover/go.sum index 9560afbf266..462fc6c63bb 100644 --- a/prover/go.sum +++ b/prover/go.sum @@ -39,10 +39,10 @@ cloud.google.com/go/storage v1.10.0/go.mod h1:FLPqc6j+Ki4BU591ie1oL6qBQGu2Bl/tZ9 dmitri.shuralyov.com/gpu/mtl v0.0.0-20190408044501-666a987793e9/go.mod h1:H6x//7gZCb22OMCxBHrMx7a5I7Hp++hsVxbQ4BYO7hU= github.com/BurntSushi/toml v0.3.1/go.mod h1:xHWCNGjB5oqiDr8zfno3MHue2Ht5sIBksp03qcyfWMU= github.com/BurntSushi/xgb v0.0.0-20160522181843-27f122750802/go.mod h1:IVnqGOEym/WlBOVXweHU+Q+/VP0lqqI8lqeDx9IjBqo= -github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6 h1:1zYrtlhrZ6/b6SAjLSfKzWtdgqK0U+HtH/VcBWh1BaU= -github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20251001021608-1fe7b43fc4d6/go.mod h1:ioLG6R+5bUSO1oeGSDxOV3FADARuMoytZCSX6MEMQkI= -github.com/VictoriaMetrics/fastcache v1.13.0 h1:AW4mheMR5Vd9FkAPUv+NH6Nhw+fmbTMGMsNAoA/+4G0= -github.com/VictoriaMetrics/fastcache v1.13.0/go.mod h1:hHXhl4DA2fTL2HTZDJFXWgW0LNjo6B+4aj2Wmng3TjU= +github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20260416073033-7c2071eaa8d4 h1:/97whAzwYxMNHXeTfhAtCRzNCpyblmxCtSYpsfzCszM= +github.com/ProjectZKM/Ziren/crates/go-runtime/zkvm_runtime v0.0.0-20260416073033-7c2071eaa8d4/go.mod h1:ioLG6R+5bUSO1oeGSDxOV3FADARuMoytZCSX6MEMQkI= +github.com/VictoriaMetrics/fastcache v1.13.3 h1:rBabE0iIxcqKEMCwUmwHZ9dgEqXerg8FRbRDUvC7OVc= +github.com/VictoriaMetrics/fastcache v1.13.3/go.mod h1:hHXhl4DA2fTL2HTZDJFXWgW0LNjo6B+4aj2Wmng3TjU= github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156 h1:eMwmnE/GDgah4HI848JfFxHt+iPb26b4zyfspmqY0/8= github.com/allegro/bigcache v1.2.1-0.20190218064605-e24eb225f156/go.mod h1:Cb/ax3seSYIx7SuZdm2G2xzfwmv3TPSk2ucNfQESPXM= github.com/antihax/optional v1.0.0/go.mod h1:uupD/76wgC+ih3iEmQUL+0Ugr19nfwCT1kdvxnR2qWY= @@ -75,35 +75,32 @@ github.com/cncf/udpa/go v0.0.0-20200629203442-efcf912fb354/go.mod h1:WmhPx2Nbnht github.com/cncf/udpa/go v0.0.0-20201120205902-5459f2c99403/go.mod h1:WmhPx2Nbnhtbo57+VJT5O0JRkEi1Wbu0z5j0R8u5Hbk= github.com/consensys/compress v0.3.0 h1:HRIcHvWkW9C9req0ZWg7mhYHzBarohXhcszIwHONVkM= github.com/consensys/compress v0.3.0/go.mod h1:pyM+ZXiNUh7/0+AUjUf9RKUM6vSH7T/fsn5LLS0j1Tk= -github.com/consensys/gnark v0.14.1-0.20260219004710-bbfb2f70a565 h1:NlOAmbLYsVb/hcuOBxza6CAA+233tB0eFiunGVEMyv4= -github.com/consensys/gnark v0.14.1-0.20260219004710-bbfb2f70a565/go.mod h1:EoWWbEboQRydCqJDSA7zrFxucIeoy/5R+MDx04oFpF4= -github.com/consensys/gnark v0.14.1-0.20260505192735-3460cedcac43 h1:RpbSPTqYtqLGas8Z65mw12uJBWL3BO2s6amtRmvx2SM= -github.com/consensys/gnark v0.14.1-0.20260505192735-3460cedcac43/go.mod h1:RIWXG9Gl+Ls2enSayeA/NdcM/FI3OOf6AqNdI2Jv8QU= -github.com/consensys/gnark-crypto v0.20.2-0.20260402204920-39238e584b99 h1:FREtFb4IoZWOj6MexddSuReVU7ViMChjuspD7SO8dGY= -github.com/consensys/gnark-crypto v0.20.2-0.20260402204920-39238e584b99/go.mod h1:NzeBHSZ49bIM7RtrNTYYR2kymTqwvI/A4eTgQlyQc+Q= -github.com/consensys/go-corset v1.2.10 h1:uKUICiHmERuMWzDRiRJr285fV2WncNGiCENSdNcQodY= -github.com/consensys/go-corset v1.2.10/go.mod h1:QKFoNJZHdCrDslg9XFjk+GoFMgrhKSVdBNnx4hq7WJA= +github.com/consensys/gnark v0.14.1-0.20260508134514-a9bb4257c480 h1:6SBhL7hzIui0yVyaih+63qFYRCU4PKSVNv9MIpGwPxk= +github.com/consensys/gnark v0.14.1-0.20260508134514-a9bb4257c480/go.mod h1:RIWXG9Gl+Ls2enSayeA/NdcM/FI3OOf6AqNdI2Jv8QU= +github.com/consensys/gnark-crypto v0.20.2-0.20260504203407-0dce6009ca13 h1:wj98yOn9dIiNKraw8mxVFS5kLRzxWqVUhLwTnUonbrE= +github.com/consensys/gnark-crypto v0.20.2-0.20260504203407-0dce6009ca13/go.mod h1:NzeBHSZ49bIM7RtrNTYYR2kymTqwvI/A4eTgQlyQc+Q= +github.com/consensys/go-corset v1.2.14 h1:xTEA4tlXGrhNEDpBiw5hy/E6o8YzS/TumB4MJVpqil4= +github.com/consensys/go-corset v1.2.14/go.mod h1:QKFoNJZHdCrDslg9XFjk+GoFMgrhKSVdBNnx4hq7WJA= github.com/coreos/go-semver v0.3.0/go.mod h1:nnelYz7RCh+5ahJtPPxZlU+153eP4D4r3EedlOD2RNk= github.com/coreos/go-systemd/v22 v22.3.2/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= -github.com/coreos/go-systemd/v22 v22.5.0/go.mod h1:Y58oyj3AT4RCenI/lSvhwexgC+NSVTIJ3seZv2GcEnc= github.com/cpuguy83/go-md2man/v2 v2.0.0/go.mod h1:maD7wRr/U5Z6m/iR4s+kqSMx2CaBsrgA7czyZG/E6dU= github.com/cpuguy83/go-md2man/v2 v2.0.6/go.mod h1:oOW0eioCTA6cOiMLiUPZOpcVxMig6NIQQ7OS05n1F4g= -github.com/crate-crypto/go-eth-kzg v1.4.0 h1:WzDGjHk4gFg6YzV0rJOAsTK4z3Qkz5jd4RE3DAvPFkg= -github.com/crate-crypto/go-eth-kzg v1.4.0/go.mod h1:J9/u5sWfznSObptgfa92Jq8rTswn6ahQWEuiLHOjCUI= +github.com/crate-crypto/go-eth-kzg v1.5.0 h1:FYRiJMJG2iv+2Dy3fi14SVGjcPteZ5HAAUe4YWlJygc= +github.com/crate-crypto/go-eth-kzg v1.5.0/go.mod h1:J9/u5sWfznSObptgfa92Jq8rTswn6ahQWEuiLHOjCUI= github.com/crate-crypto/go-kzg-4844 v1.1.0 h1:EN/u9k2TF6OWSHrCCDBBU6GLNMq88OspHHlMnHfoyU4= github.com/crate-crypto/go-kzg-4844 v1.1.0/go.mod h1:JolLjpSff1tCCJKaJx4psrlEdlXuJEC996PL3tTAFks= github.com/davecgh/go-spew v1.1.0/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc h1:U9qPSI2PIWSS1VwoXQT9A3Wy9MM3WgvqSxFWenqJduM= github.com/davecgh/go-spew v1.1.2-0.20180830191138-d8f796af33cc/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= -github.com/decred/dcrd/crypto/blake256 v1.0.1 h1:7PltbUIQB7u/FfZ39+DGa/ShuMyJ5ilcvdfma9wOH6Y= -github.com/decred/dcrd/crypto/blake256 v1.0.1/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo= -github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0 h1:rpfIENRNNilwHwZeG5+P150SMrnNEcHYvcCuK6dPZSg= -github.com/decred/dcrd/dcrec/secp256k1/v4 v4.3.0/go.mod h1:v57UDF4pDQJcEfFUCRop3lJL149eHGSe9Jvczhzjo/0= -github.com/dlclark/regexp2 v1.11.2 h1:/u628IuisSTwri5/UKloiIsH8+qF2Pu7xEQX+yIKg68= -github.com/dlclark/regexp2 v1.11.2/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= -github.com/emicklei/dot v1.6.2 h1:08GN+DD79cy/tzN6uLCT84+2Wk9u+wvqP+Hkx/dIR8A= -github.com/emicklei/dot v1.6.2/go.mod h1:DeV7GvQtIw4h2u73RKBkkFdvVAz0D9fzeJrgPW6gy/s= +github.com/decred/dcrd/crypto/blake256 v1.1.0 h1:zPMNGQCm0g4QTY27fOCorQW7EryeQ/U0x++OzVrdms8= +github.com/decred/dcrd/crypto/blake256 v1.1.0/go.mod h1:2OfgNZ5wDpcsFmHmCK5gZTPcCXqlm2ArzUIkw9czNJo= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.4.1 h1:5RVFMOWjMyRy8cARdy79nAmgYw3hK/4HUq48LQ6Wwqo= +github.com/decred/dcrd/dcrec/secp256k1/v4 v4.4.1/go.mod h1:ZXNYxsqcloTdSy/rNShjYzMhyjf0LaoftYK0p+A3h40= +github.com/dlclark/regexp2 v1.12.0 h1:0j4c5qQmnC6XOWNjP3PIXURXN2gWx76rd3KvgdPkCz8= +github.com/dlclark/regexp2 v1.12.0/go.mod h1:DHkYz0B9wPfa6wondMfaivmHpzrQ3v9q8cnmRbL6yW8= +github.com/emicklei/dot v1.11.0 h1:zsrhCuFHAJge/aZIC4N4LdHy5tqYu4tWEaUzIwdYj4Y= +github.com/emicklei/dot v1.11.0/go.mod h1:DeV7GvQtIw4h2u73RKBkkFdvVAz0D9fzeJrgPW6gy/s= github.com/envoyproxy/go-control-plane v0.9.0/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.1-0.20191026205805-5f8ba28d4473/go.mod h1:YTl/9mNaCwkRvm6d1a2C3ymFceY/DCBVvsKhRF0iEA4= github.com/envoyproxy/go-control-plane v0.9.4/go.mod h1:6rpuAdCZL397s3pYoYcLgu1mIlRU8Am5FuJP05cCM98= @@ -111,27 +108,27 @@ github.com/envoyproxy/go-control-plane v0.9.7/go.mod h1:cwu0lG7PUMfa9snN8LXBig5y github.com/envoyproxy/go-control-plane v0.9.9-0.20201210154907-fd9021fe5dad/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/go-control-plane v0.9.9-0.20210217033140-668b12f5399d/go.mod h1:cXg6YxExXjJnVBQHBLXeUAgxn2UodCpnH306RInaBQk= github.com/envoyproxy/protoc-gen-validate v0.1.0/go.mod h1:iSmxcyjqTsJpI2R4NaDN7+kN2VEUnK/pcBlmesArF7c= -github.com/ethereum/c-kzg-4844/v2 v2.1.5 h1:aVtoLK5xwJ6c5RiqO8g8ptJ5KU+2Hdquf6G3aXiHh5s= -github.com/ethereum/c-kzg-4844/v2 v2.1.5/go.mod h1:u59hRTTah4Co6i9fDWtiCjTrblJv0UwsqZKCc0GfgUs= +github.com/ethereum/c-kzg-4844/v2 v2.1.7 h1:aat3CuITdDbPC6pmEGRT0zJ5eOxzrZj8TJT5z7Xk//M= +github.com/ethereum/c-kzg-4844/v2 v2.1.7/go.mod h1:8HMkUZ5JRv4hpw/XUrYWSQNAUzhHMg2UDb/U+5m+XNw= github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab h1:rvv6MJhy07IMfEKuARQ9TKojGqLVNxQajaXEp/BoqSk= github.com/ethereum/go-bigmodexpfix v0.0.0-20250911101455-f9e208c548ab/go.mod h1:IuLm4IsPipXKF7CW5Lzf68PIbZ5yl7FFd74l/E0o9A8= -github.com/ethereum/go-ethereum v1.17.0 h1:2D+1Fe23CwZ5tQoAS5DfwKFNI1HGcTwi65/kRlAVxes= -github.com/ethereum/go-ethereum v1.17.0/go.mod h1:2W3msvdosS/MCWytpqTcqgFiRYbTH59FxDJzqah120o= +github.com/ethereum/go-ethereum v1.17.2 h1:ag6geu0kn8Hv5FLKTpH+Hm2DHD+iuFtuqKxEuwUsDOI= +github.com/ethereum/go-ethereum v1.17.2/go.mod h1:KHcRXfGOUfUmKg51IhQ0IowiqZ6PqZf08CMtk0g5K1o= github.com/fatih/color v1.7.0/go.mod h1:Zm6kSWBoL9eyXnKyktHP6abPY2pDugNf5KwzbycvMj4= github.com/felixge/fgprof v0.9.3/go.mod h1:RdbpDgzqYVh/T9fPELJyV7EYJuHB55UTEULNun8eiPw= -github.com/felixge/fgprof v0.9.4 h1:ocDNwMFlnA0NU0zSB3I52xkO4sFXk80VK9lXjLClu88= -github.com/felixge/fgprof v0.9.4/go.mod h1:yKl+ERSa++RYOs32d8K6WEXCB4uXdLls4ZaZPpayhMM= -github.com/ferranbt/fastssz v0.1.4 h1:OCDB+dYDEQDvAgtAGnTSidK1Pe2tW3nFV40XyMkTeDY= -github.com/ferranbt/fastssz v0.1.4/go.mod h1:Ea3+oeoRGGLGm5shYAeDgu6PGUlcvQhE2fILyD9+tGg= +github.com/felixge/fgprof v0.9.5 h1:8+vR6yu2vvSKn08urWyEuxx75NWPEvybbkBirEpsbVY= +github.com/felixge/fgprof v0.9.5/go.mod h1:yKl+ERSa++RYOs32d8K6WEXCB4uXdLls4ZaZPpayhMM= +github.com/ferranbt/fastssz v1.0.0 h1:9EXXYsracSqQRBQiHeaVsG/KQeYblPf40hsQPb9Dzk8= +github.com/ferranbt/fastssz v1.0.0/go.mod h1:Ea3+oeoRGGLGm5shYAeDgu6PGUlcvQhE2fILyD9+tGg= github.com/frankban/quicktest v1.14.6 h1:7Xjx+VpznH+oBnejlPUj8oUpdxnVs4f8XU8WnHkI4W8= github.com/frankban/quicktest v1.14.6/go.mod h1:4ptaffx2x8+WTWXmUCuVU6aPUX1/Mz7zb5vbUoiM6w0= github.com/fsnotify/fsnotify v1.4.9/go.mod h1:znqG4EE+3YCdAaPaxE2ZRY/06pZUdp0tY4IgpuI1SZQ= -github.com/fsnotify/fsnotify v1.7.0 h1:8JEhPFa5W2WU7YfeZzPNqzMP6Lwt7L2715Ggo0nosvA= -github.com/fsnotify/fsnotify v1.7.0/go.mod h1:40Bi/Hjc2AVfZrqy+aj+yEI+/bRxZnMJyTJwOpGvigM= -github.com/fxamacker/cbor/v2 v2.9.0 h1:NpKPmjDBgUfBms6tr6JZkTHtfFGcMKsw3eGcmD/sapM= -github.com/fxamacker/cbor/v2 v2.9.0/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= -github.com/gabriel-vasile/mimetype v1.4.10 h1:zyueNbySn/z8mJZHLt6IPw0KoZsiQNszIpU+bX4+ZK0= -github.com/gabriel-vasile/mimetype v1.4.10/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s= +github.com/fsnotify/fsnotify v1.10.1 h1:b0/UzAf9yR5rhf3RPm9gf3ehBPpf0oZKIjtpKrx59Ho= +github.com/fsnotify/fsnotify v1.10.1/go.mod h1:TLheqan6HD6GBK6PrDWyDPBaEV8LspOxvPSjC+bVfgo= +github.com/fxamacker/cbor/v2 v2.9.2 h1:X4Ksno9+x3cz0TZv69ec1hxP/+tymuR8PXQJyDwfh78= +github.com/fxamacker/cbor/v2 v2.9.2/go.mod h1:vM4b+DJCtHn+zz7h3FFp/hDAI9WNWCsZj23V5ytsSxQ= +github.com/gabriel-vasile/mimetype v1.4.13 h1:46nXokslUBsAJE/wMsp5gtO500a4F3Nkz9Ufpk2AcUM= +github.com/gabriel-vasile/mimetype v1.4.13/go.mod h1:d+9Oxyo1wTzWdyVUPMmXFvp4F9tea18J8ufA774AB3s= github.com/ghodss/yaml v1.0.0/go.mod h1:4dBDuWmgqj2HViK6kFavaiC9ZROes6MMH2rRYeMEF04= github.com/go-gl/glfw v0.0.0-20190409004039-e6da0acd62b1/go.mod h1:vR7hzQXu2zJy9AVAgeJqvqgH9Q5CA+iKCZ2gyEVpxRU= github.com/go-gl/glfw/v3.3/glfw v0.0.0-20191125211704-12ad95a8df72/go.mod h1:tQ2UAYgL5IevRw8kRxooKSPJfGvJ9fJQFa0TUsXzTg8= @@ -145,14 +142,16 @@ github.com/go-playground/locales v0.14.1 h1:EWaQ/wswjilfKLTECiXz7Rh+3BjFhfDFKv/o github.com/go-playground/locales v0.14.1/go.mod h1:hxrqLVvrK65+Rwrd5Fc6F2O76J/NuW9t0sjnWqG1slY= github.com/go-playground/universal-translator v0.18.1 h1:Bcnm0ZwsGyWbCzImXv+pAJnYK9S473LQFuzCbDbfSFY= github.com/go-playground/universal-translator v0.18.1/go.mod h1:xekY+UJKNuX9WP91TpwSH2VMlDf28Uj24BCp08ZFTUY= -github.com/go-playground/validator/v10 v10.28.0 h1:Q7ibns33JjyW48gHkuFT91qX48KG0ktULL6FgHdG688= -github.com/go-playground/validator/v10 v10.28.0/go.mod h1:GoI6I1SjPBh9p7ykNE/yj3fFYbyDOpwMn5KXd+m2hUU= +github.com/go-playground/validator/v10 v10.30.2 h1:JiFIMtSSHb2/XBUbWM4i/MpeQm9ZK2xqPNk8vgvu5JQ= +github.com/go-playground/validator/v10 v10.30.2/go.mod h1:mAf2pIOVXjTEBrwUMGKkCWKKPs9NheYGabeB04txQSc= +github.com/go-viper/mapstructure/v2 v2.5.0 h1:vM5IJoUAy3d7zRSVtIwQgBj7BiWtMPfmPEgAXnvj1Ro= +github.com/go-viper/mapstructure/v2 v2.5.0/go.mod h1:oJDH3BJKyqBA2TXFhDsKDGDTlndYOZ6rGS0BRZIxGhM= github.com/gobwas/httphead v0.1.0/go.mod h1:O/RXo79gxV8G+RqlR/otEwx4Q36zl9rqC5u12GKvMCM= github.com/gobwas/pool v0.2.1/go.mod h1:q8bcK0KcYlCgd9e7WYLm9LpyS+YeLd8JVDW6WezmKEw= github.com/gobwas/ws v1.2.1/go.mod h1:hRKAFb8wOxFROYNsT1bqfWnhX+b5MFeJM9r2ZSwg/KY= github.com/godbus/dbus/v5 v5.0.4/go.mod h1:xhWf0FNVPg57R7Z0UbKHbJfkEywrmjJnf7w5xrFpKfA= -github.com/gofrs/flock v0.12.1 h1:MTLVXXHf8ekldpJk3AKicLij9MdwOWkZ+a/jHHZby9E= -github.com/gofrs/flock v0.12.1/go.mod h1:9zxTsyu5xtJ9DK+1tFZyibEV7y3uwDxPPfbxeeHCoD0= +github.com/gofrs/flock v0.13.0 h1:95JolYOvGMqeH31+FC7D2+uULf6mG61mEZ/A8dRYMzw= +github.com/gofrs/flock v0.13.0/go.mod h1:jxeyy9R1auM5S6JYDBhDt+E2TCo7DkratH4Pgi8P+Z0= github.com/gogo/protobuf v1.3.2/go.mod h1:P1XiOD3dCwIKUDQYPy72D8LYyHL2YPYrpS2s69NZV8Q= github.com/golang/glog v0.0.0-20160126235308-23def4e6c14b/go.mod h1:SBH7ygxi8pfUlaOkMMuAQtPIUF8ecWP5IEl/CR7VP2Q= github.com/golang/groupcache v0.0.0-20190702054246-869f871628b6/go.mod h1:cIg4eruTrX1D+g88fzRXU5OdNfaM+9IcxsU14FzY7Hc= @@ -218,8 +217,8 @@ github.com/google/pprof v0.0.0-20210122040257-d980be63207e/go.mod h1:kpwsk12EmLe github.com/google/pprof v0.0.0-20210226084205-cbba55b83ad5/go.mod h1:kpwsk12EmLew5upagYY7GY0pfYCcupk39gWOCRROcvE= github.com/google/pprof v0.0.0-20211214055906-6f57359322fd/go.mod h1:KgnwoLYCZ8IQu3XUZ8Nc/bM9CCZFOyjUNOSygVozoDg= github.com/google/pprof v0.0.0-20240227163752-401108e1b7e7/go.mod h1:czg5+yv1E0ZGTi6S6vVK1mke0fV+FaUhNGcd6VRS9Ik= -github.com/google/pprof v0.0.0-20260202012954-cb029daf43ef h1:xpF9fUHpoIrrjX24DURVKiwHcFpw19ndIs+FwTSMbno= -github.com/google/pprof v0.0.0-20260202012954-cb029daf43ef/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= +github.com/google/pprof v0.0.0-20260507013755-92041b743c96 h1:YDDnaZ9afWajDboPMt9Vikqca/yWAX7KAxVzb4lJU1M= +github.com/google/pprof v0.0.0-20260507013755-92041b743c96/go.mod h1:MxpfABSjhmINe3F1It9d+8exIHFvUqtLIRCdOGNXqiI= github.com/google/renameio v0.1.0/go.mod h1:KWCgfxg9yswjAJkECMjeO8J8rahYeXnNhOm40UhjYkI= github.com/google/uuid v1.1.2/go.mod h1:TIyPZe4MgqvfeYDBFedMoGGpEw/LqOeaOT+nhxU+yHo= github.com/google/uuid v1.6.0 h1:NIvaJDMOsjHA8n1jAhLSgzrAzy1Hgr+hNrb57e+94F0= @@ -244,7 +243,6 @@ github.com/hashicorp/go-uuid v1.0.1/go.mod h1:6SBZvOh/SIDV7/2o3Jml5SYk/TvGqwFJ/b github.com/hashicorp/go.net v0.0.1/go.mod h1:hjKkEWcCURg++eb33jQU7oqQcI9XDCnUzHA0oac0k90= github.com/hashicorp/golang-lru v0.5.0/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= github.com/hashicorp/golang-lru v0.5.1/go.mod h1:/m3WP610KZHVQ1SGc6re/UDhFvYD7pJ4Ao+sR/qLZy8= -github.com/hashicorp/hcl v1.0.0 h1:0Anlzjpi4vEasTeNFn2mLJgTSwt0+6sfsiTG8qcWGx4= github.com/hashicorp/hcl v1.0.0/go.mod h1:E5yfLk+7swimpb2L/Alb/PJmXilQ/rhwaUYs4T20WEQ= github.com/hashicorp/logutils v1.0.0/go.mod h1:QIAnNjmIWmVIIkWDTG1z5v++HQmx9WQRO+LraFDTW64= github.com/hashicorp/mdns v1.0.0/go.mod h1:tL+uN++7HEJ6SQLQ2/p+z2pH24WQKWjBPkE0mNTz8vQ= @@ -272,11 +270,10 @@ github.com/jstemmer/go-junit-report v0.9.1/go.mod h1:Brl9GWCQeLvo8nXZwPNNblvFj/X github.com/jtolds/gls v4.20.0+incompatible/go.mod h1:QJZ7F/aHp+rZTRtaJ1ow/lLfFfVYBRgL+9YlvaHOwJU= github.com/kisielk/errcheck v1.5.0/go.mod h1:pFxgyoBC7bSaBwPgfKdkLd5X25qrDl4LWUI2bnpBCr8= github.com/kisielk/gotool v1.0.0/go.mod h1:XhKaO+MFFWcvkIS/tQcRk01m1F5IRFswLeQ+oQHNcck= -github.com/klauspost/compress v1.18.3 h1:9PJRvfbmTabkOX8moIpXPbMMbYN60bWImDDU7L+/6zw= -github.com/klauspost/compress v1.18.3/go.mod h1:R0h/fSBs8DE4ENlcrlib3PsXS61voFxhIs2DeRhCvJ4= -github.com/klauspost/cpuid/v2 v2.0.4/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= -github.com/klauspost/cpuid/v2 v2.0.9 h1:lgaqFMSdTdQYdZ04uHyN2d/eKdOMyi2YLSvlQIBFYa4= -github.com/klauspost/cpuid/v2 v2.0.9/go.mod h1:FInQzS24/EEf25PyTYn52gqo7WaD8xa0213Md/qVLRg= +github.com/klauspost/compress v1.18.6 h1:2jupLlAwFm95+YDR+NwD2MEfFO9d4z4Prjl1XXDjuao= +github.com/klauspost/compress v1.18.6/go.mod h1:cwPg85FWrGar70rWktvGQj8/hthj3wpl0PGDogxkrSQ= +github.com/klauspost/cpuid/v2 v2.3.0 h1:S4CRMLnYUhGeDFDqkGriYKdfoFlDnMtqTiI/sFzhA9Y= +github.com/klauspost/cpuid/v2 v2.3.0/go.mod h1:hqwkgyIinND0mEev00jJYCxPNVRVXFQeu1XKlok6oO0= github.com/kr/fs v0.1.0/go.mod h1:FFnZGqtBN9Gxj7eW1uZ42v5BccTP0vu6NEaFoC2HwRg= github.com/kr/pretty v0.1.0/go.mod h1:dAy3ld7l9f0ibDNOQOHHMYYIIbhfbHSm3C4ZsoJORNo= github.com/kr/pretty v0.3.1 h1:flRD4NNwYAUpkphVc1HcthR4KEIFJ65n8Mw5qdRn3LE= @@ -293,21 +290,16 @@ github.com/ledongthuc/pdf v0.0.0-20220302134840-0c2507a12d80/go.mod h1:imJHygn/1 github.com/leodido/go-urn v1.4.0 h1:WT9HwE9SGECu3lg4d/dIA+jxlljEa1/ffXKmRjqdmIQ= github.com/leodido/go-urn v1.4.0/go.mod h1:bvxc+MVxLKB4z00jd1z+Dvzr47oO32F/QSNjSBOlFxI= github.com/magiconair/properties v1.8.5/go.mod h1:y3VJvCyxH9uVvJTWEGAELF3aiYNyPKd5NZ3oSwXrF60= -github.com/magiconair/properties v1.8.7 h1:IeQXZAiQcpL9mgcAe1Nu6cX9LLw6ExEHKjN0VQdvPDY= -github.com/magiconair/properties v1.8.7/go.mod h1:Dhd985XPs7jluiymwWYZ0G4Z61jb3vdS329zhj2hYo0= github.com/mailru/easyjson v0.7.7/go.mod h1:xzfreul335JAWq5oZzymOObrkdz5UnU4kGfJJLY9Nlc= github.com/mattn/go-colorable v0.0.9/go.mod h1:9vuHe8Xs5qXnSaW/c/ABM9alt+Vo+STaOChaDxuIBZU= -github.com/mattn/go-colorable v0.1.13/go.mod h1:7S9/ev0klgBDR4GtXTXX8a3vIGJpMovkB8vQcUbaXHg= github.com/mattn/go-colorable v0.1.14 h1:9A9LHSqF/7dyVVX6g0U9cwm9pG3kP9gSzcuIPHPsaIE= github.com/mattn/go-colorable v0.1.14/go.mod h1:6LmQG8QLFO4G5z1gPvYEzlUgJ2wF+stgPZH1UqBm1s8= github.com/mattn/go-isatty v0.0.3/go.mod h1:M+lRXTBqGeGNdLjl/ufCoiOlB5xdOkqRJdNxMWT7Zi4= -github.com/mattn/go-isatty v0.0.16/go.mod h1:kYGgaQfpe5nmfYZH+SKPsOc2e4SrIfOl2e/yFXSvRLM= -github.com/mattn/go-isatty v0.0.19/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= -github.com/mattn/go-isatty v0.0.20 h1:xfD0iDuEKnDkl03q4limB+vH+GxLEtL/jb4xVJSWWEY= -github.com/mattn/go-isatty v0.0.20/go.mod h1:W+V8PltTTMOvKvAeJH7IuucS94S2C6jfK/D7dTCTo3Y= +github.com/mattn/go-isatty v0.0.22 h1:j8l17JJ9i6VGPUFUYoTUKPSgKe/83EYU2zBC7YNKMw4= +github.com/mattn/go-isatty v0.0.22/go.mod h1:ZXfXG4SQHsB/w3ZeOYbR0PrPwLy+n6xiMrJlRFqopa4= github.com/miekg/dns v1.0.14/go.mod h1:W1PPwlIAgtquWBMBEV9nkV9Cazfe8ScdGz/Lj7v3Nrg= -github.com/minio/sha256-simd v1.0.0 h1:v1ta+49hkWZyvaKwrQB8elexRqm6Y0aMLjCNsrYxo6g= -github.com/minio/sha256-simd v1.0.0/go.mod h1:OuYzVNI5vcoYIAmbIvHPl3N3jUzVedXbKy5RFepssQM= +github.com/minio/sha256-simd v1.0.1 h1:6kaan5IFmwTNynnKKpDHe6FWHohJOHhCPchzK49dzMM= +github.com/minio/sha256-simd v1.0.1/go.mod h1:Pz6AKMiUdngCLpeTL/RJY1M9rUuPMYujV5xJjtbRSN8= github.com/mitchellh/cli v1.0.0/go.mod h1:hNIlj7HEI86fIcpObd7a0FcrxTWetlwJDGcceTlRvqc= github.com/mitchellh/go-homedir v1.0.0/go.mod h1:SfyaCUpYCn1Vlf4IUYiD9fPX4A5wJrkLzIz1N1q0pr0= github.com/mitchellh/go-testing-interface v1.0.0/go.mod h1:kRemZodwjscx+RGhAo8eIhFbs2+BFgRtFPeD/KE+zxI= @@ -328,10 +320,10 @@ github.com/neelance/sourcemap v0.0.0-20200213170602-2833bce08e4c/go.mod h1:Qr6/a github.com/orisano/pixelmatch v0.0.0-20220722002657-fb0b55479cde/go.mod h1:nZgzbfBr3hhjoZnS66nKrHmduYNpc34ny7RK4z5/HM0= github.com/pascaldekloe/goe v0.0.0-20180627143212-57f6aae5913c/go.mod h1:lzWF7FIEvWOWxwDKqyGYQf6ZUaNfKdP144TG7ZOy1lc= github.com/pelletier/go-toml v1.9.3/go.mod h1:u1nR/EPcESfeI/szUZKdtJ0xRNbUoANCkoOuaOx1Y+c= -github.com/pelletier/go-toml/v2 v2.2.2 h1:aYUidT7k73Pcl9nb2gScu7NSrKCSHIDE89b3+6Wq+LM= -github.com/pelletier/go-toml/v2 v2.2.2/go.mod h1:1t835xjRzz80PqgE6HHgN2JOsmgYu/h4qDAS4n929Rs= -github.com/pierrec/lz4/v4 v4.1.22 h1:cKFw6uJDK+/gfw5BcDL0JL5aBsAFdsIT18eRtLj7VIU= -github.com/pierrec/lz4/v4 v4.1.22/go.mod h1:gZWDp/Ze/IJXGXf23ltt2EXimqmTUXEy0GFuRQyBid4= +github.com/pelletier/go-toml/v2 v2.3.1 h1:MYEvvGnQjeNkRF1qUuGolNtNExTDwct51yp7olPtrEc= +github.com/pelletier/go-toml/v2 v2.3.1/go.mod h1:2gIqNv+qfxSVS7cM2xJQKtLSTLUE9V8t9Stt+h56mCY= +github.com/pierrec/lz4/v4 v4.1.26 h1:GrpZw1gZttORinvzBdXPUXATeqlJjqUG/D87TKMnhjY= +github.com/pierrec/lz4/v4 v4.1.26/go.mod h1:EoQMVJgeeEOMsCqCzqFm2O0cJvljX2nGZjcRIPL34O4= github.com/pkg/errors v0.8.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= github.com/pkg/errors v0.9.1 h1:FEBLx1zS214owpjy7qsBeixbURkuhQAwrK5UwLGTwt4= github.com/pkg/errors v0.9.1/go.mod h1:bwawxfHBFNV+L2hUp1rHADufV3IMtnDRdf1r5NINEl0= @@ -342,15 +334,15 @@ github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZN github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2 h1:Jamvg5psRIccs7FGNTlIRMkT8wgtp5eCXdBlqhYGL6U= github.com/pmezard/go-difflib v1.0.1-0.20181226105442-5d4384ee4fb2/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/posener/complete v1.1.1/go.mod h1:em0nMJCgc9GFtwrmVmEMR/ZL6WyhyjMBndrE9hABlRI= -github.com/prometheus/client_golang v1.19.1 h1:wZWJDwK+NameRJuPGDhlnFgx8e8HN3XHQeLaYJFJBOE= -github.com/prometheus/client_golang v1.19.1/go.mod h1:mP78NwGzrVks5S2H6ab8+ZZGJLZUq1hoULYBAYBw1Ho= +github.com/prometheus/client_golang v1.23.2 h1:Je96obch5RDVy3FDMndoUsjAhG5Edi49h0RJWRi/o0o= +github.com/prometheus/client_golang v1.23.2/go.mod h1:Tb1a6LWHB3/SPIzCoaDXI4I8UHKeFTEQ1YCr+0Gyqmg= github.com/prometheus/client_model v0.0.0-20190812154241-14fe0d1b01d4/go.mod h1:xMI15A0UPsDsEKsMN9yxemIoYk6Tm2C1GtYGdfGttqA= -github.com/prometheus/client_model v0.6.1 h1:ZKSh/rekM+n3CeS952MLRAdFwIKqeY8b62p8ais2e9E= -github.com/prometheus/client_model v0.6.1/go.mod h1:OrxVMOVHjw3lKMa8+x6HeMGkHMQyHDk9E3jmP2AmGiY= -github.com/prometheus/common v0.55.0 h1:KEi6DK7lXW/m7Ig5i47x0vRzuBsHuvJdi5ee6Y3G1dc= -github.com/prometheus/common v0.55.0/go.mod h1:2SECS4xJG1kd8XF9IcM1gMX6510RAEL65zxzNImwdc8= -github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc= -github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk= +github.com/prometheus/client_model v0.6.2 h1:oBsgwpGs7iVziMvrGhE53c/GrLUsZdHnqNwqPLxwZyk= +github.com/prometheus/client_model v0.6.2/go.mod h1:y3m2F6Gdpfy6Ut/GBsUqTWZqCUvMVzSfMLjcu6wAwpE= +github.com/prometheus/common v0.67.5 h1:pIgK94WWlQt1WLwAC5j2ynLaBRDiinoAb86HZHTUGI4= +github.com/prometheus/common v0.67.5/go.mod h1:SjE/0MzDEEAyrdr5Gqc6G+sXI67maCxzaT3A2+HqjUw= +github.com/prometheus/procfs v0.20.1 h1:XwbrGOIplXW/AU3YhIhLODXMJYyC1isLFfYCsTEycfc= +github.com/prometheus/procfs v0.20.1/go.mod h1:o9EMBZGRyvDrSPH1RqdxhojkuXstoe4UlK79eF5TGGo= github.com/prysmaticlabs/gohashtree v0.0.4-beta h1:H/EbCuXPeTV3lpKeXGPpEV9gsUpkqOOVnWapUyeWro4= github.com/prysmaticlabs/gohashtree v0.0.4-beta/go.mod h1:BFdtALS+Ffhg3lGQIHv9HDWuHS8cTvHZzrHWxwOtGOs= github.com/rogpeppe/fastuuid v1.2.0/go.mod h1:jVj6XXZzXRy/MSR5jhDC/2q6DgLz+nrA6LYCDYWNEvQ= @@ -359,16 +351,13 @@ github.com/rogpeppe/go-internal v1.14.1 h1:UQB4HGPB6osV0SQTLymcB4TgvyWu6ZyliaW0t github.com/rogpeppe/go-internal v1.14.1/go.mod h1:MaRKkUm5W0goXpeCfT7UZI6fk/L7L7so1lCWt35ZSgc= github.com/ronanh/intcomp v1.1.1 h1:+1bGV/wEBiHI0FvzS7RHgzqOpfbBJzLIxkqMJ9e6yxY= github.com/ronanh/intcomp v1.1.1/go.mod h1:7FOLy3P3Zj3er/kVrU/pl+Ql7JFZj7bwliMGketo0IU= -github.com/rs/xid v1.6.0/go.mod h1:7XoLgs4eV+QndskICGsho+ADou8ySMSjJKDIan90Nz0= -github.com/rs/zerolog v1.34.0 h1:k43nTLIwcTVQAncfCw4KZ2VY6ukYoZaBPNOE8txlOeY= -github.com/rs/zerolog v1.34.0/go.mod h1:bJsvje4Z08ROH4Nhs5iH600c3IkWhwp44iRc54W6wYQ= +github.com/rs/zerolog v1.35.1 h1:m7xQeoiLIiV0BCEY4Hs+j2NG4Gp2o2KPKmhnnLiazKI= +github.com/rs/zerolog v1.35.1/go.mod h1:EjML9kdfa/RMA7h/6z6pYmq1ykOuA8/mjWaEvGI+jcw= github.com/russross/blackfriday/v2 v2.0.1/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/russross/blackfriday/v2 v2.1.0/go.mod h1:+Rmxgy9KzJVeS9/2gXHxylqXiyQDYRxCVz55jmeOWTM= github.com/ryanuber/columnize v0.0.0-20160712163229-9b3edd62028f/go.mod h1:sm1tb6uqfes/u+d4ooFouqFdy9/2g9QGwK3SQygK0Ts= -github.com/sagikazarmark/locafero v0.6.0 h1:ON7AQg37yzcRPU69mt7gwhFEBwxI6P9T4Qu3N51bwOk= -github.com/sagikazarmark/locafero v0.6.0/go.mod h1:77OmuIc6VTraTXKXIs/uvUxKGUXjE1GbemJYHqdNjX0= -github.com/sagikazarmark/slog-shim v0.1.0 h1:diDBnUNK9N/354PgrxMywXnAwEr1QZcOr6gto+ugjYE= -github.com/sagikazarmark/slog-shim v0.1.0/go.mod h1:SrcSrq8aKtyuqEI1uvTDTK1arOWRIczQRv+GVI1AkeQ= +github.com/sagikazarmark/locafero v0.12.0 h1:/NQhBAkUb4+fH1jivKHWusDYFjMOOKU88eegjfxfHb4= +github.com/sagikazarmark/locafero v0.12.0/go.mod h1:sZh36u/YSZ918v0Io+U9ogLYQJ9tLLBmM4eneO6WwsI= github.com/sean-/seed v0.0.0-20170313163322-e2103e2c3529/go.mod h1:DxrIzT+xaE7yg65j358z/aeFdxmN0P9QXhEzd20vsDc= github.com/shirou/gopsutil v3.21.11+incompatible h1:+1+c1VGhc88SSonWP6foOcLhvnKlUeu/erjjvaPEYiI= github.com/shirou/gopsutil v3.21.11+incompatible/go.mod h1:5b4v6he4MtMOwMlS0TUMTu2PcXUg8+E1lC7eC3UO/RA= @@ -377,34 +366,31 @@ github.com/shurcooL/httpfs v0.0.0-20190707220628-8d4bc4ba7749/go.mod h1:ZY1cvUeJ github.com/shurcooL/sanitized_anchor_name v1.0.0/go.mod h1:1NzhyTcUVG4SuEtjjoZeVRXNmyL/1OwPU0+IJeTBvfc= github.com/shurcooL/vfsgen v0.0.0-20200824052919-0d455de96546/go.mod h1:TrYk7fJVaAttu97ZZKrO9UbRa8izdowaMIZcxYMbVaw= github.com/sirupsen/logrus v1.8.1/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= -github.com/sirupsen/logrus v1.9.3 h1:dueUQJ1C2q9oE3F7wvmSGAaVtTmUizReu6fjN8uqzbQ= -github.com/sirupsen/logrus v1.9.3/go.mod h1:naHLuLoDiP4jHNo9R0sCBMtWGeIprob74mVsIT4qYEQ= +github.com/sirupsen/logrus v1.9.4 h1:TsZE7l11zFCLZnZ+teH4Umoq5BhEIfIzfRDZ1Uzql2w= +github.com/sirupsen/logrus v1.9.4/go.mod h1:ftWc9WdOfJ0a92nsE2jF5u5ZwH8Bv2zdeOC42RjbV2g= github.com/smarty/assertions v1.15.0/go.mod h1:yABtdzeQs6l1brC900WlRNwj6ZR55d7B+E8C6HtKdec= github.com/smartystreets/assertions v0.0.0-20180927180507-b2de0cb4f26d/go.mod h1:OnSkiWE9lh6wB0YB77sQom3nweQdgAjqCqsofrRNTgc= github.com/smartystreets/goconvey v1.6.4/go.mod h1:syvi0/a8iFYH4r/RixwvyeAJjdLS9QV7WQ/tjFTllLA= github.com/smartystreets/goconvey v1.8.1/go.mod h1:+/u4qLyY6x1jReYOp7GOM2FSt8aP9CzCZL03bI28W60= -github.com/sourcegraph/conc v0.3.0 h1:OQTbbt6P72L20UqAkXXuLOj79LfEanQ+YQFNpLA9ySo= -github.com/sourcegraph/conc v0.3.0/go.mod h1:Sdozi7LEKbFPqYX2/J+iBAM6HpqSLTASQIKqDmF7Mt0= github.com/spf13/afero v1.6.0/go.mod h1:Ai8FlHk4v/PARR026UzYexafAt9roJ7LcLMAmO6Z93I= -github.com/spf13/afero v1.11.0 h1:WJQKhtpdm3v2IzqG8VMqrr6Rf3UYpEF239Jy9wNepM8= -github.com/spf13/afero v1.11.0/go.mod h1:GH9Y3pIexgf1MTIWtNGyogA5MwRIDXGUr+hbWNoBjkY= +github.com/spf13/afero v1.15.0 h1:b/YBCLWAJdFWJTN9cLhiXXcD7mzKn9Dm86dNnfyQw1I= +github.com/spf13/afero v1.15.0/go.mod h1:NC2ByUVxtQs4b3sIUphxK0NioZnmxgyCrfzeuq8lxMg= github.com/spf13/cast v1.3.1/go.mod h1:Qx5cxh0v+4UWYiBimWS+eyWzqEqokIECu5etghLkUJE= -github.com/spf13/cast v1.6.0 h1:GEiTHELF+vaR5dhz3VqZfFSzZjYbgeKDpBxQVS4GYJ0= -github.com/spf13/cast v1.6.0/go.mod h1:ancEpBxwJDODSW/UG4rDrAqiKolqNNh2DX3mk86cAdo= +github.com/spf13/cast v1.10.0 h1:h2x0u2shc1QuLHfxi+cTJvs30+ZAHOGRic8uyGTDWxY= +github.com/spf13/cast v1.10.0/go.mod h1:jNfB8QC9IA6ZuY2ZjDp0KtFO2LZZlg4S/7bzP6qqeHo= github.com/spf13/cobra v1.2.1/go.mod h1:ExllRjgxM/piMAM+3tAZvg8fsklGAf3tPfi+i8t68Nk= github.com/spf13/cobra v1.10.2 h1:DMTTonx5m65Ic0GOoRY2c16WCbHxOOw6xxezuLaBpcU= github.com/spf13/cobra v1.10.2/go.mod h1:7C1pvHqHw5A4vrJfjNwvOdzYu0Gml16OCs2GRiTUUS4= github.com/spf13/jwalterweatherman v1.1.0/go.mod h1:aNWZUN0dPAAO/Ljvb5BEdw96iTZ0EXowPYD95IqWIGo= github.com/spf13/pflag v1.0.5/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= -github.com/spf13/pflag v1.0.9 h1:9exaQaMOCwffKiiiYk6/BndUBv+iRViNW+4lEMi0PvY= github.com/spf13/pflag v1.0.9/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= +github.com/spf13/pflag v1.0.10 h1:4EBh2KAYBwaONj6b2Ye1GiHfwjqyROoF4RwYO+vPwFk= +github.com/spf13/pflag v1.0.10/go.mod h1:McXfInJRrz4CZXVZOBLb0bTZqETkiAhM9Iw0y3An2Bg= github.com/spf13/viper v1.8.1/go.mod h1:o0Pch8wJ9BVSWGQMbra6iw0oQ5oktSIBaujf1rJH9Ns= -github.com/spf13/viper v1.19.0 h1:RWq5SEjt8o25SROyN3z2OrDB9l7RPd3lwTWU8EcEdcI= -github.com/spf13/viper v1.19.0/go.mod h1:GQUN9bilAbhU/jgc1bKs99f/suXKeUMct8Adx5+Ntkg= +github.com/spf13/viper v1.21.0 h1:x5S+0EU27Lbphp4UKm1C+1oQO+rKx36vfCoaVebLFSU= +github.com/spf13/viper v1.21.0/go.mod h1:P0lhsswPGWD/1lZJ9ny3fYnVqxiegrlNrEmgLjbTCAY= github.com/stretchr/objx v0.1.0/go.mod h1:HFkY916IF+rwdDfMAkV7OtwuqBVzrE8GR6GFx+wExME= github.com/stretchr/objx v0.4.0/go.mod h1:YvHI0jy2hoMjB+UWwv71VJQ9isScKT/TqJzVSSt89Yw= -github.com/stretchr/objx v0.5.0/go.mod h1:Yh+to48EsGEfYuaHDzXPcE3xhTkx73EhmCGUpEOglKo= -github.com/stretchr/objx v0.5.2/go.mod h1:FRsXN1f5AsAjCGJKqEizvkpNtU+EGNCLh3NxZ/8L+MA= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= github.com/stretchr/testify v1.3.0/go.mod h1:M5WIy9Dh21IEIfnGCwXGc5bZfKNJtfHm1UVUgZn+9EI= github.com/stretchr/testify v1.4.0/go.mod h1:j7eGeouHqKxXV5pUuKE4zz7dFj8WfuZ+81PSLYec5m4= @@ -413,19 +399,17 @@ github.com/stretchr/testify v1.6.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/ github.com/stretchr/testify v1.7.0/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.7.1/go.mod h1:6Fq8oRcR53rry900zMqJjRRixrwX3KX962/h/Wwjteg= github.com/stretchr/testify v1.8.0/go.mod h1:yNjHg4UonilssWZ8iaSj1OCr/vHnekPRkoO+kdMU+MU= -github.com/stretchr/testify v1.8.4/go.mod h1:sz/lmYIOXD/1dqDmKjjqLyZ2RngseejIcXlSw2iwfAo= -github.com/stretchr/testify v1.9.0/go.mod h1:r2ic/lqez/lEtzL7wO/rwa5dbSLXVDPFyf8C91i36aY= github.com/stretchr/testify v1.11.1 h1:7s2iGBzp5EwR7/aIZr8ao5+dra3wiQyKjjFuvgVKu7U= github.com/stretchr/testify v1.11.1/go.mod h1:wZwfW3scLgRK+23gO65QZefKpKQRnfz6sD981Nm4B6U= github.com/subosito/gotenv v1.2.0/go.mod h1:N0PQaV/YGNqwC0u51sEeR/aUtSLEXKX9iv69rRypqCw= github.com/subosito/gotenv v1.6.0 h1:9NlTDc1FTs4qu0DDq7AEtTPNw6SVm7uBMsUCUjABIf8= github.com/subosito/gotenv v1.6.0/go.mod h1:Dk4QP5c2W3ibzajGcXpNraDfq2IrhjMIvMSWPKKo0FU= -github.com/supranational/blst v0.3.16-0.20250831170142-f48500c1fdbe h1:nbdqkIGOGfUAD54q1s2YBcBz/WcsxCO9HUQ4aGV5hUw= -github.com/supranational/blst v0.3.16-0.20250831170142-f48500c1fdbe/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= -github.com/tklauser/go-sysconf v0.3.14 h1:g5vzr9iPFFz24v2KZXs/pvpvh8/V9Fw6vQK5ZZb78yU= -github.com/tklauser/go-sysconf v0.3.14/go.mod h1:1ym4lWMLUOhuBOPGtRcJm7tEGX4SCYNEEEtghGG/8uY= -github.com/tklauser/numcpus v0.8.0 h1:Mx4Wwe/FjZLeQsK/6kt2EOepwwSl7SmJrK5bV/dXYgY= -github.com/tklauser/numcpus v0.8.0/go.mod h1:ZJZlAY+dmR4eut8epnzf0u/VwodKmryxR8txiloSqBE= +github.com/supranational/blst v0.3.16 h1:bTDadT+3fK497EvLdWRQEjiGnUtzJ7jjIUMF0jqwYhE= +github.com/supranational/blst v0.3.16/go.mod h1:jZJtfjgudtNl4en1tzwPIV3KjUnQUvG3/j+w+fVonLw= +github.com/tklauser/go-sysconf v0.3.16 h1:frioLaCQSsF5Cy1jgRBrzr6t502KIIwQ0MArYICU0nA= +github.com/tklauser/go-sysconf v0.3.16/go.mod h1:/qNL9xxDhc7tx3HSRsLWNnuzbVfh3e7gh/BmM179nYI= +github.com/tklauser/numcpus v0.11.0 h1:nSTwhKH5e1dMNsCdVBukSZrURJRoHbSEQjdEbY+9RXw= +github.com/tklauser/numcpus v0.11.0/go.mod h1:z+LwcLq54uWZTX0u/bGobaV34u6V7KNlTZejzM6/3MQ= github.com/x448/float16 v0.8.4 h1:qLwI1I70+NjRFUR3zs1JPUCgaCXSh3SW62uAKT1mSBM= github.com/x448/float16 v0.8.4/go.mod h1:14CWIYCyZA/cWjXOioeEpHeN/83MdbZDRQHoFcYsOfg= github.com/yuin/goldmark v1.1.25/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= @@ -447,10 +431,13 @@ go.opencensus.io v0.22.4/go.mod h1:yxeiOL68Rb0Xd1ddK5vPZ/oVn4vY4Ynel7k9FzqtOIw= go.opencensus.io v0.22.5/go.mod h1:5pWMHQbX5EPX2/62yrJeAkowc+lfs/XD7Uxpq3pI6kk= go.opencensus.io v0.23.0/go.mod h1:XItmlyltB5F7CS4xOC1DcqMoFqwtC6OG2xF7mCv7P7E= go.uber.org/atomic v1.7.0/go.mod h1:fEN4uk6kAWBTFdckzkM89CLk9XfWZrxpCo0nPH17wJc= +go.uber.org/goleak v1.3.0 h1:2K3zAYmnTNqV73imy9J1T3WC+gmCePx2hEGkimedGto= +go.uber.org/goleak v1.3.0/go.mod h1:CoHD4mav9JJNrW/WLlf7HGZPjdw8EucARQHekz1X6bE= go.uber.org/multierr v1.6.0/go.mod h1:cdWPpRnG4AhwMwsgIHip0KRBQjJy5kYEpYjJxpXp9iU= -go.uber.org/multierr v1.11.0 h1:blXXJkSxSSfBVBlC76pxqeO+LN3aDfLQo+309xJstO0= -go.uber.org/multierr v1.11.0/go.mod h1:20+QtiLqy0Nd6FdQB9TLXag12DsQkrbs3htMFfDN80Y= go.uber.org/zap v1.17.0/go.mod h1:MXVU+bhUf/A7Xi2HNOnopQOrmycQ5Ih87HtOu4q5SSo= +go.yaml.in/yaml/v2 v2.4.4 h1:tuyd0P+2Ont/d6e2rl3be67goVK4R6deVxCUX5vyPaQ= +go.yaml.in/yaml/v2 v2.4.4/go.mod h1:gMZqIpDtDqOfM0uNfy0SkpRhvUryYH0Z6wdMYcacYXQ= +go.yaml.in/yaml/v3 v3.0.4 h1:tfq32ie2Jv2UxXFdLJdh3jXuOzWiL1fo0bu/FbuKpbc= go.yaml.in/yaml/v3 v3.0.4/go.mod h1:DhzuOOF2ATzADvBadXxruRBLzYTpT36CKvDb3+aBEFg= golang.org/x/crypto v0.0.0-20181029021203-45a5f77698d3/go.mod h1:6SG95UA2DQfeDnfUPMdvaQW0Q7yPrPDi9nlGo2tz2b4= golang.org/x/crypto v0.0.0-20190308221718-c2843e01d9a2/go.mod h1:djNgcEr1/C05ACkg1iLfiJU5Ep61QUkGW8qpdssI0+w= @@ -461,8 +448,8 @@ golang.org/x/crypto v0.0.0-20191011191535-87dc89f01550/go.mod h1:yigFU9vqHzYiE8U golang.org/x/crypto v0.0.0-20200622213623-75b288015ac9/go.mod h1:LzIPMQfyMNhhGPhUkYOs5KpL4U8rLKemX1yGLhDgUto= golang.org/x/crypto v0.0.0-20210711020723-a769d52b0f97/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= golang.org/x/crypto v0.0.0-20210921155107-089bfa567519/go.mod h1:GvvjBRRGRdwPK5ydBHafDWAxML/pGHZbMvKqRZ5+Abc= -golang.org/x/crypto v0.48.0 h1:/VRzVqiRSggnhY7gNRxPauEQ5Drw9haKdM0jqfcCFts= -golang.org/x/crypto v0.48.0/go.mod h1:r0kV5h3qnFPlQnBSrULhlsRfryS2pmewsg+XfMgkVos= +golang.org/x/crypto v0.50.0 h1:zO47/JPrL6vsNkINmLoo/PH1gcxpls50DNogFvB5ZGI= +golang.org/x/crypto v0.50.0/go.mod h1:3muZ7vA7PBCE6xgPX7nkzzjiUq87kRItoJQM1Yo8S+Q= golang.org/x/exp v0.0.0-20190121172915-509febef88a4/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190306152737-a1d7652674e8/go.mod h1:CJ0aWSM057203Lf6IL+f9T1iT9GByDxfZKAQTCR3kQA= golang.org/x/exp v0.0.0-20190510132918-efd6b22b2522/go.mod h1:ZjyILWgesfNpC6sMxTJOJm9Kp84zZh5NQWvqDGG3Qr8= @@ -473,8 +460,8 @@ golang.org/x/exp v0.0.0-20191227195350-da58074b4299/go.mod h1:2RIsYlXP63K8oxa1u0 golang.org/x/exp v0.0.0-20200119233911-0405dc783f0a/go.mod h1:2RIsYlXP63K8oxa1u096TMicItID8zy7Y6sNkU49FU4= golang.org/x/exp v0.0.0-20200207192155-f17229e696bd/go.mod h1:J/WKrq2StrnmMY6+EHIKF9dgMWnmCNThgcyBT1FY9mM= golang.org/x/exp v0.0.0-20200224162631-6cc2880d07d6/go.mod h1:3jZMyOhIsHpP37uCMkUooju7aAi5cS1Q23tOzKc+0MU= -golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93 h1:fQsdNF2N+/YewlRZiricy4P1iimyPKZ/xwniHj8Q2a0= -golang.org/x/exp v0.0.0-20251219203646-944ab1f22d93/go.mod h1:EPRbTFwzwjXj9NpYyyrvenVh9Y+GFeEvMNh7Xuz7xgU= +golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f h1:W3F4c+6OLc6H2lb//N1q4WpJkhzJCK5J6kUi1NTVXfM= +golang.org/x/exp v0.0.0-20260410095643-746e56fc9e2f/go.mod h1:J1xhfL/vlindoeF/aINzNzt2Bket5bjo9sdOYzOsU80= golang.org/x/image v0.0.0-20190227222117-0694c2d4d067/go.mod h1:kZ7UVZpmo3dzQBMxlp+ypCbDeSB+sBbTgSJuh5dn5js= golang.org/x/image v0.0.0-20190802002840-cff245a6509b/go.mod h1:FeLwcggjj3mMvU+oOTbSwawSJRM1uh48EjtB4UJZlP0= golang.org/x/lint v0.0.0-20181026193005-c67002cb31c3/go.mod h1:UVdnD1Gm6xHRNCYTkRU2/jEulfH38KcIWyp/GAMgvoE= @@ -542,8 +529,8 @@ golang.org/x/net v0.0.0-20210405180319-a5a99cb37ef4/go.mod h1:p54w0d4576C0XHj96b golang.org/x/net v0.0.0-20220722155237-a158d28d115b/go.mod h1:XRhObCWvk6IyKnWLug+ECip1KBveYUHfp+8e9klMJ9c= golang.org/x/net v0.6.0/go.mod h1:2Tu9+aMcznHK/AK1HMvgo6xiTLG5rD5rZLDS+rp2Bjs= golang.org/x/net v0.8.0/go.mod h1:QVkue5JL9kW//ek3r6jTKnTFis1tRmNAW2P1shuFdJc= -golang.org/x/net v0.49.0 h1:eeHFmOGUTtaaPSGNmjBKpbng9MulQsJURQUAfUwY++o= -golang.org/x/net v0.49.0/go.mod h1:/ysNB2EvaqvesRkuLAyjI1ycPZlQHM3q01F02UY/MV8= +golang.org/x/net v0.53.0 h1:d+qAbo5L0orcWAr0a9JweQpjXF19LMXJE8Ey7hwOdUA= +golang.org/x/net v0.53.0/go.mod h1:JvMuJH7rrdiCfbeHoo3fCQU24Lf5JJwT9W3sJFulfgs= golang.org/x/oauth2 v0.0.0-20180821212333-d2e6202438be/go.mod h1:N/0e6XlmueqKjAGxoOufVs8QHGRruUQn6yWY3a++T0U= golang.org/x/oauth2 v0.0.0-20190226205417-e64efc72b421/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= golang.org/x/oauth2 v0.0.0-20190604053449-0f29369cfe45/go.mod h1:gOpvHmFTYa4IltrdGE7lF6nIHvwfUNPOp7c8zoXwtLw= @@ -619,15 +606,12 @@ golang.org/x/sys v0.0.0-20210630005230-0f9fa26af87c/go.mod h1:oPkhp1MJrh7nUepCBc golang.org/x/sys v0.0.0-20211007075335-d3039528d8ac/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220310020820-b874c991c1a5/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220520151302-bc2c85ada10a/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220715151400-c0bba94af5f8/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.0.0-20220722155257-8c9f86f7a55f/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.0.0-20220811171246-fbc7d0a398ab/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.1.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.5.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= golang.org/x/sys v0.6.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.12.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= -golang.org/x/sys v0.42.0 h1:omrd2nAlyT5ESRdCLYdm3+fMfNFE/+Rf4bDIQImRJeo= -golang.org/x/sys v0.42.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= +golang.org/x/sys v0.44.0 h1:ildZl3J4uzeKP07r2F++Op7E9B29JRUy+a27EibtBTQ= +golang.org/x/sys v0.44.0/go.mod h1:4GL1E5IUh+htKOUEOaiffhrAeqysfVGipDYzABqnCmw= golang.org/x/term v0.0.0-20201126162022-7de9c90e9dd1/go.mod h1:bj7SfCRtBDWHUb9snDiAeCFNEtKQo2Wmx5Cou7ajbmo= golang.org/x/term v0.0.0-20210927222741-03fcf44c2211/go.mod h1:jbD1KX2456YbFQfuXm/mYQcufACuNUgVhRMnK/tPxf8= golang.org/x/term v0.5.0/go.mod h1:jMB1sMXY+tzblOD4FWmEbocvup2/aLOaQEp7JmGp78k= @@ -642,13 +626,13 @@ golang.org/x/text v0.3.5/go.mod h1:5Zoc/QRtKVWzQhOtBMvqHzDpF6irO9z98xDceosuGiQ= golang.org/x/text v0.3.7/go.mod h1:u+2+/6zg+i71rQMx5EYifcz6MCKuco9NR6JIITiCfzQ= golang.org/x/text v0.7.0/go.mod h1:mrYo+phRRbMaCq/xk9113O4dZlRixOauAjOtrjsXDZ8= golang.org/x/text v0.8.0/go.mod h1:e1OnstbJyHTd6l/uOt8jFFHp6TRDWZR/bV3emEE/zU8= -golang.org/x/text v0.34.0 h1:oL/Qq0Kdaqxa1KbNeMKwQq0reLCCaFtqu2eNuSeNHbk= -golang.org/x/text v0.34.0/go.mod h1:homfLqTYRFyVYemLBFl5GgL/DWEiH5wcsQ5gSh1yziA= +golang.org/x/text v0.36.0 h1:JfKh3XmcRPqZPKevfXVpI1wXPTqbkE5f7JA92a55Yxg= +golang.org/x/text v0.36.0/go.mod h1:NIdBknypM8iqVmPiuco0Dh6P5Jcdk8lJL0CUebqK164= golang.org/x/time v0.0.0-20181108054448-85acf8d2951c/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20190308202827-9d24e82272b4/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= golang.org/x/time v0.0.0-20191024005414-555d28b269f0/go.mod h1:tRJNPiyCQ0inRvYxbN9jk5I+vvW/OXSQhTDSoE431IQ= -golang.org/x/time v0.9.0 h1:EsRrnYcQiGH+5FfbgvV4AP7qEZstoyrHB0DzarOQ4ZY= -golang.org/x/time v0.9.0/go.mod h1:3BpzKBy/shNhVucY/MWOyx10tF3SFh9QdLuxbVysPQM= +golang.org/x/time v0.15.0 h1:bbrp8t3bGUeFOx08pvsMYRTCVSMk89u4tKbNOZbp88U= +golang.org/x/time v0.15.0/go.mod h1:Y4YMaQmXwGQZoFaVFk4YpCt4FLQMYKZe9oeV/f4MSno= golang.org/x/tools v0.0.0-20180917221912-90fa682c2a6e/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190114222345-bf090417da8b/go.mod h1:n7NCudcB/nEzxVGmLbDWY5pfWTLqBcC2KZ6jyYvM4mQ= golang.org/x/tools v0.0.0-20190226205152-f727befe758c/go.mod h1:9Yl7xja0Znq3iFh3HoIrodX9oNMXvdceNzlUR8zjMvY= @@ -818,8 +802,6 @@ gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntN gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/errgo.v2 v2.1.0/go.mod h1:hNsd1EY+bozCKY1Ytp96fpM3vjJbqLJn88ws8XvfDNI= gopkg.in/ini.v1 v1.62.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= -gopkg.in/ini.v1 v1.67.0 h1:Dgnx+6+nfE+IfzjUEISNeydPJh9AXNNsWbGP9KzCsOA= -gopkg.in/ini.v1 v1.67.0/go.mod h1:pNLf8WUiyNEtQjuu5G5vTm06TEv9tsIgeAvK8hOrP4k= gopkg.in/yaml.v2 v2.2.2/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.3/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= gopkg.in/yaml.v2 v2.2.8/go.mod h1:hI93XBmqTisBFMUTm0b8Fm+jr3Dg1NNxqwp+5A1VGuI= diff --git a/prover/gpu/.gitignore b/prover/gpu/.gitignore new file mode 100644 index 00000000000..f8934159a09 --- /dev/null +++ b/prover/gpu/.gitignore @@ -0,0 +1,8 @@ +# CUDA build artifacts +cuda/build/ +cuda/build-release/ + +# Sandbox binary +cuda/sandbox/sandbox + +plonk/tmp/plonk_cache \ No newline at end of file diff --git a/prover/gpu/cuda/CMakeLists.txt b/prover/gpu/cuda/CMakeLists.txt new file mode 100644 index 00000000000..9e8cf3996ec --- /dev/null +++ b/prover/gpu/cuda/CMakeLists.txt @@ -0,0 +1,46 @@ +cmake_minimum_required(VERSION 4.0) + +project(gnark-gpu + VERSION 0.1.0 + LANGUAGES CXX CUDA + DESCRIPTION "CUDA accelerated field arithmetic for gnark (BLS12-377 + KoalaBear/Vortex)" +) + +set(CMAKE_CXX_STANDARD 20) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CUDA_STANDARD 20) +set(CMAKE_CUDA_STANDARD_REQUIRED ON) +set(CMAKE_EXPORT_COMPILE_COMMANDS ON) +set(CMAKE_CUDA_ARCHITECTURES native) + +# Build position-independent code for shared library compatibility +set(CMAKE_POSITION_INDEPENDENT_CODE ON) + +# Library (static for CGO integration) +add_library(gnark_gpu STATIC + src/plonk/kernels.cu + src/plonk/msm.cu + src/plonk/ntt.cu + src/plonk/fr_ops.cu + src/plonk/plonk_z.cu + src/plonk/plonk_eval.cu + src/plonk/api.cu + src/plonk2/g1.cu + src/plonk2/kernels.cu + src/plonk2/mimc.cu + src/plonk2/msm.cu + src/vortex/kb.cu +) +set_target_properties(gnark_gpu PROPERTIES + OUTPUT_NAME gnark_gpu + CUDA_SEPARABLE_COMPILATION OFF + CUDA_RESOLVE_DEVICE_SYMBOLS ON +) +target_include_directories(gnark_gpu + PUBLIC + $ + PRIVATE + ${CMAKE_CURRENT_SOURCE_DIR}/src/plonk + ${CMAKE_CURRENT_SOURCE_DIR}/src/plonk2 + ${CMAKE_CURRENT_SOURCE_DIR}/src/vortex +) diff --git a/prover/gpu/cuda/CMakePresets.json b/prover/gpu/cuda/CMakePresets.json new file mode 100644 index 00000000000..b80a3a43a85 --- /dev/null +++ b/prover/gpu/cuda/CMakePresets.json @@ -0,0 +1,34 @@ +{ + "version": 6, + "configurePresets": [ + { + "name": "dev", + "displayName": "Development", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Debug", + "CMAKE_EXPORT_COMPILE_COMMANDS": "ON" + } + }, + { + "name": "release", + "displayName": "Release", + "generator": "Ninja", + "binaryDir": "${sourceDir}/build-release", + "cacheVariables": { + "CMAKE_BUILD_TYPE": "Release" + } + } + ], + "buildPresets": [ + { + "name": "dev", + "configurePreset": "dev" + }, + { + "name": "release", + "configurePreset": "release" + } + ] +} diff --git a/prover/gpu/cuda/include/gnark_gpu.h b/prover/gpu/cuda/include/gnark_gpu.h new file mode 100644 index 00000000000..496e87a1224 --- /dev/null +++ b/prover/gpu/cuda/include/gnark_gpu.h @@ -0,0 +1,1022 @@ +// gnark-gpu C API for Go bindings +// This header provides extern "C" functions for CGO integration + +#ifndef GNARK_GPU_H +#define GNARK_GPU_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// ============================================================================= +// Opaque handles +// ============================================================================= + +typedef struct GnarkGPUContext *gnark_gpu_context_t; +typedef struct GnarkGPUFrVector *gnark_gpu_fr_vector_t; +typedef struct GnarkGPUMSM *gnark_gpu_msm_t; +typedef struct GnarkGPUPlonk2FrVector *gnark_gpu_plonk2_fr_vector_t; +typedef struct GnarkGPUPlonk2NTTDomain *gnark_gpu_plonk2_ntt_domain_t; +typedef struct GnarkGPUPlonk2MSM *gnark_gpu_plonk2_msm_t; + +// ============================================================================= +// Error codes +// ============================================================================= + +typedef enum { + GNARK_GPU_SUCCESS = 0, + GNARK_GPU_ERROR_CUDA = 1, + GNARK_GPU_ERROR_INVALID_ARG = 2, + GNARK_GPU_ERROR_OUT_OF_MEMORY = 3, + GNARK_GPU_ERROR_SIZE_MISMATCH = 4, +} gnark_gpu_error_t; + +typedef enum { + GNARK_GPU_PLONK2_CURVE_BN254 = 1, + GNARK_GPU_PLONK2_CURVE_BLS12_377 = 2, + GNARK_GPU_PLONK2_CURVE_BW6_761 = 3, +} gnark_gpu_plonk2_curve_id_t; + +// ============================================================================= +// Context lifecycle +// ============================================================================= + +// Initialize GPU context on specified device +// Returns GNARK_GPU_SUCCESS on success, error code otherwise +gnark_gpu_error_t gnark_gpu_init(int device_id, gnark_gpu_context_t *ctx); + +// Destroy GPU context and release resources +void gnark_gpu_destroy(gnark_gpu_context_t ctx); + +// Bind the calling OS thread to a CUDA device. CUDA's "current device" is +// per-thread state; on multi-GPU hosts every host thread that issues CUDA +// calls (allocations, launches, transfers) must set its target device once, +// otherwise everything falls through to device 0. +// +// Used in tandem with runtime.LockOSThread on the Go side: pin a goroutine +// to its OS thread, set the target device, run the GPU pipeline, unpin. +// +// Idempotent and cheap (single cudaSetDevice). +gnark_gpu_error_t gnark_gpu_set_device(int device_id); + +// ============================================================================= +// Fr vector operations +// ============================================================================= + +// Allocate GPU memory for `count` Fr elements +gnark_gpu_error_t gnark_gpu_fr_vector_alloc(gnark_gpu_context_t ctx, size_t count, + gnark_gpu_fr_vector_t *vec); + +// Free GPU memory +void gnark_gpu_fr_vector_free(gnark_gpu_fr_vector_t vec); + +// Get the number of elements in the vector +size_t gnark_gpu_fr_vector_len(gnark_gpu_fr_vector_t vec); + +// ============================================================================= +// Data transfer +// Host data is in AoS format (gnark-crypto layout): [e0.l0, e0.l1, e0.l2, e0.l3, e1.l0, ...] +// GPU storage is SoA format for coalesced memory access +// Transpose happens on GPU during copy operations +// ============================================================================= + +// Copy from host (AoS) to device (SoA) +gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_device(gnark_gpu_fr_vector_t vec, + const uint64_t *host_data, + size_t count); + +// Copy from device (SoA) to host (AoS) +gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_host(gnark_gpu_fr_vector_t vec, + uint64_t *host_data, size_t count); + +// ============================================================================= +// Arithmetic operations (async - call gnark_gpu_sync to wait) +// All operations are element-wise: result[i] = op(a[i], b[i]) +// ============================================================================= + +// Element-wise Montgomery multiplication: result = a * b (mod p) +gnark_gpu_error_t gnark_gpu_fr_vector_mul(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b); + +// Element-wise addition: result = a + b (mod p) +gnark_gpu_error_t gnark_gpu_fr_vector_add(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b); + +// Element-wise subtraction: result = a - b (mod p) +gnark_gpu_error_t gnark_gpu_fr_vector_sub(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b); + +// Ensure the context's shared staging buffer can hold at least min_count elements. +// Called automatically by copy operations; call explicitly to pre-allocate. +gnark_gpu_error_t gnark_gpu_staging_ensure(gnark_gpu_context_t ctx, size_t min_count); + +// v[i] *= g^i for i in [0, count). g is 4 uint64s in Montgomery form. +gnark_gpu_error_t gnark_gpu_fr_vector_scale_by_powers(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + const uint64_t g[4]); + +// v[i] *= c for all i. c is 4 uint64s in Montgomery form. +gnark_gpu_error_t gnark_gpu_fr_vector_scalar_mul(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + const uint64_t c[4]); + +// dst[i] = src[i] (device-to-device copy of SoA limbs) +gnark_gpu_error_t gnark_gpu_fr_vector_copy_d2d(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src); + +// Set all elements to zero +gnark_gpu_error_t gnark_gpu_fr_vector_set_zero(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v); + +// v[i] += a[i] * b[i] (fused multiply-add) +gnark_gpu_error_t gnark_gpu_fr_vector_addmul(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b); + +// v[i] = 1/v[i] using Montgomery batch inversion (parallel two-level scan). +// temp must be a separate FrVector of the same size (used as scratch space). +gnark_gpu_error_t gnark_gpu_fr_vector_batch_invert(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t temp); + +// Size-4 inverse DFT butterfly for decomposed iFFT(4n). +// b0,b1,b2,b3 are 4 FrVectors of size n (modified in-place). +// omega4_inv: inverse of primitive 4th root of unity (4 uint64s, Montgomery form). +// quarter: 1/4 in Montgomery form (4 uint64s). +gnark_gpu_error_t gnark_gpu_fr_vector_butterfly4(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t b0, + gnark_gpu_fr_vector_t b1, + gnark_gpu_fr_vector_t b2, + gnark_gpu_fr_vector_t b3, + const uint64_t omega4_inv[4], + const uint64_t quarter[4]); + +// ============================================================================= +// MSM (Multi-Scalar Multiplication) using Twisted Edwards coordinates +// Points are in compact TE XY format: (x_te, y_te) +// ============================================================================= + +// Create MSM context for up to max_points compact TE XY points +gnark_gpu_error_t gnark_gpu_msm_create(gnark_gpu_context_t ctx, size_t max_points, + gnark_gpu_msm_t *msm); + +// Upload compact TE XY points to GPU (kept resident for reuse) +// points_data layout: [x0.l0..x0.l5, y0.l0..y0.l5, ...] +// Each point is 12 uint64s (96 bytes) in Montgomery form +gnark_gpu_error_t gnark_gpu_msm_load_points(gnark_gpu_msm_t msm, + const uint64_t *points_data, + size_t count); + +// Run MSM: result = sum(scalars[i] * points[i]) for i in [0, count) +// scalars: count * 4 uint64s in Montgomery form (kernel converts) +// result: num_windows * 24 uint64s representing per-window TE extended points +gnark_gpu_error_t gnark_gpu_msm_run(gnark_gpu_msm_t msm, uint64_t *result, + const uint64_t *scalars, size_t count); + +// Destroy MSM context and free GPU memory +void gnark_gpu_msm_destroy(gnark_gpu_msm_t msm); + +// Query MSM configuration (c = window bits, num_windows = ceil(253/c)) +void gnark_gpu_msm_get_config(gnark_gpu_msm_t msm, int *c, int *num_windows); + +// Upload Short-Weierstrass affine points (gnark bls12377.G1Affine layout — +// 12 uint64s per point, Montgomery form) into the optional d_points_sw GPU +// buffer. Used only by the batched-affine accumulate kernel +// (GNARK_GPU_MSM_BATCHED_AFFINE=1). Allocates the buffer on first call. +gnark_gpu_error_t gnark_gpu_msm_load_points_sw(gnark_gpu_msm_t msm, + const uint64_t *points_data, + size_t count); + +// Pin work buffers (sort buffers + cudaHostRegister of caller scalars) across +// gnark_gpu_msm_run calls. Without this, msm_run lazily allocates several +// GB of sort buffers and registers caller memory at the start of each call, +// then frees them at the end — costing 5–10 ms of host overhead per call. +// Use this when running back-to-back MSMs (e.g., a wave of PlonK commitments). +// Caller MUST release before any phase that needs the VRAM (e.g., quotient). +gnark_gpu_error_t gnark_gpu_msm_pin_work_buffers(gnark_gpu_msm_t msm); + +// Release pinned work buffers immediately (frees VRAM, drops host +// registration). Subsequent gnark_gpu_msm_run calls re-allocate lazily. +gnark_gpu_error_t gnark_gpu_msm_release_work_buffers(gnark_gpu_msm_t msm); + +// Test entrypoints for SW affine primitives — used to validate the GPU +// arithmetic against gnark-crypto host reference. Inputs/outputs use +// gnark's bls12377.G1Affine memory layout (12 uint64 limbs, Montgomery form). +gnark_gpu_error_t gnark_gpu_test_sw_pair_add( + const uint64_t *p0, const uint64_t *p1, uint64_t *out); + +// Convert SW affine to TE extended (X, Y, T, Z) — output is 24 uint64s. +gnark_gpu_error_t gnark_gpu_test_sw_to_te( + const uint64_t *p_sw, uint64_t *out_te); + +// Reduce N affine SW points (≤ 256) via batched-affine pairwise reduction +// in shared memory. Output is the SW affine sum (12 uint64s). Used to +// isolate bugs in the multi-wave reduction logic. +gnark_gpu_error_t gnark_gpu_test_batched_affine_reduce( + const uint64_t *points_aos, uint64_t *out_aos, int N); + +// Per-phase timings of the last gnark_gpu_msm_run call. Phase order +// (9 floats, milliseconds): +// 0: H2D (scalar upload) +// 1: build_pairs (signed-digit decomposition) +// 2: sort (CUB radix sort) +// 3: boundaries (memset + detect_bucket_boundaries) +// 4: accumulate_seq (sequential bucket accumulation, with cap) +// 5: accumulate_par (parallel overflow tail; 0 if no overflow buckets) +// 6: reduce_partial (per-window range scan) +// 7: reduce_finalize (combine ranges into per-window result) +// 8: D2H (window results download) +// Returns the number of phases written (9 on success, 0 if msm/out is null). +int gnark_gpu_msm_get_phase_timings(gnark_gpu_msm_t msm, float *out); + +// Offload: free d_points from GPU, keep working buffers +gnark_gpu_error_t gnark_gpu_msm_offload_points(gnark_gpu_msm_t msm); + +// Reload: re-allocate d_points and upload from (pinned) host memory +gnark_gpu_error_t gnark_gpu_msm_reload_points(gnark_gpu_msm_t msm, + const uint64_t *points_data, size_t count); + +// ============================================================================= +// NTT (Number Theoretic Transform) +// ============================================================================= + +typedef struct GnarkGPUNTTDomain *gnark_gpu_ntt_domain_t; + +// Create NTT domain with precomputed twiddle factors. +// size: must be a power of 2 +// fwd_twiddles_aos: n/2 elements in AoS format (4 uint64s each), Montgomery form +// These are w^0, w^1, ..., w^(n/2-1) where w is the n-th root of unity. +// inv_twiddles_aos: n/2 elements in AoS format, twiddles for inverse NTT +// These are w_inv^0, w_inv^1, ..., w_inv^(n/2-1) where w_inv = w^{-1}. +// inv_n: 1/n in Montgomery form (4 uint64s) +gnark_gpu_error_t gnark_gpu_ntt_domain_create(gnark_gpu_context_t ctx, size_t size, + const uint64_t *fwd_twiddles_aos, + const uint64_t *inv_twiddles_aos, + const uint64_t *inv_n, + gnark_gpu_ntt_domain_t *domain); + +// Destroy NTT domain and free GPU twiddle memory. +void gnark_gpu_ntt_domain_destroy(gnark_gpu_ntt_domain_t domain); + +// Forward NTT (DIF): natural-order input -> bit-reversed output. +// data must be an allocated FrVector of domain size. +gnark_gpu_error_t gnark_gpu_ntt_forward(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data); + +// Inverse NTT (DIT): bit-reversed input -> natural-order output, scaled by 1/n. +// data must be an allocated FrVector of domain size. +gnark_gpu_error_t gnark_gpu_ntt_inverse(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data); + +// Bit-reversal permutation on an FrVector. +gnark_gpu_error_t gnark_gpu_ntt_bit_reverse(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data); + +// Fused CosetFFT forward: ScaleByPowers + DIF NTT + BitReverse in one call. +// Eliminates one memory round-trip vs separate ScaleByPowers + FFT. +// g: coset generator (4 uint64s, Montgomery form) +// g_half: g^(n/2) precomputed by caller (4 uint64s, Montgomery form) +gnark_gpu_error_t gnark_gpu_ntt_forward_coset(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + const uint64_t g[4], + const uint64_t g_half[4]); + +gnark_gpu_error_t gnark_gpu_ntt_forward_coset_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + const uint64_t g[4], + const uint64_t g_half[4], + int stream_id); + +// Fused permutation + boundary constraint kernel for PlonK. +// Computes: result[i] = alpha * (ordering_constraint + alpha * boundary_constraint) +// Inputs: L,R,O,Z (wire evals), S1,S2,S3 (perm evals), L1_denInv (batch-inverted denoms). +// Scalars packed into params array (7 * 4 uint64s): +// [alpha, beta, gamma, l1_scalar, coset_shift, coset_shift_sq, coset_gen] +// twiddles: forward twiddle array from NTT domain (n/2 elements, SoA layout). +gnark_gpu_error_t gnark_gpu_plonk_perm_boundary( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t L, gnark_gpu_fr_vector_t R, gnark_gpu_fr_vector_t O, + gnark_gpu_fr_vector_t Z, + gnark_gpu_fr_vector_t S1, gnark_gpu_fr_vector_t S2, gnark_gpu_fr_vector_t S3, + gnark_gpu_fr_vector_t L1_denInv, + const uint64_t params[28], + gnark_gpu_ntt_domain_t domain); + +// ============================================================================= +// Device memory helpers (for permutation table etc.) +// ============================================================================= + +// Allocate device memory and copy int64 data from host. +// Caller must free with gnark_gpu_device_free_ptr. +gnark_gpu_error_t gnark_gpu_device_alloc_copy_int64(gnark_gpu_context_t ctx, + const int64_t *host_data, size_t count, + void **d_ptr); + +// Free device memory allocated by gnark_gpu_device_alloc_copy_int64. +void gnark_gpu_device_free_ptr(void *d_ptr); + +// ============================================================================= +// PlonK Z-polynomial ratio computation +// ============================================================================= + +// Compute per-element Z ratio factors on GPU. +// For each i: num[i] and den[i] are computed from wire evaluations L, R, O, +// the permutation table, and identity polynomial evaluations. +// +// On exit: L_inout contains numerators, R_inout contains denominators. +// O is read-only (not modified). The caller should then: +// 1. BatchInvert R (denominators → 1/den) +// 2. Mul(L, L, R) to get ratios = num / den +// 3. Download and do CPU prefix product to build Z +// +// params layout: [beta[4], gamma[4], g_mul[4], g_sq[4]] (16 uint64s, Montgomery form) +// d_perm: device pointer to permutation table (3n int64s), from gnark_gpu_device_alloc_copy_int64 +// log2n: log2 of domain size n +gnark_gpu_error_t gnark_gpu_plonk_z_compute_factors( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t L_inout, + gnark_gpu_fr_vector_t R_inout, + gnark_gpu_fr_vector_t O_in, + const void *d_perm, + const uint64_t params[16], + unsigned log2n, + gnark_gpu_ntt_domain_t domain); + +// ============================================================================= +// Pinned memory management +// ============================================================================= + +// Allocate pinned (page-locked) host memory for fast DMA transfers +gnark_gpu_error_t gnark_gpu_alloc_pinned(void **ptr, size_t bytes); + +// Free pinned host memory +void gnark_gpu_free_pinned(void *ptr); + +// ============================================================================= +// GPU L1 denominator computation +// ============================================================================= + +// Compute out[i] = cosetGen * omega^i - 1 for i in [0, n). +// Uses forward twiddle factors from the NTT domain. +// The caller should BatchInvert the result to get L1DenInv. +gnark_gpu_error_t gnark_gpu_compute_l1_den( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t out, + const uint64_t coset_gen[4], + gnark_gpu_ntt_domain_t domain); + +// ============================================================================= +// Patch elements +// ============================================================================= + +// Write `count` AoS elements from host into the SoA GPU vector starting at `offset`. +// host_data_aos layout: [e0.l0, e0.l1, e0.l2, e0.l3, e1.l0, ...] +// Useful for patching a few blinding correction elements without a full H2D transfer. +gnark_gpu_error_t gnark_gpu_fr_vector_patch( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t vec, + size_t offset, + const uint64_t *host_data_aos, + size_t count); + +// ============================================================================= +// Synchronization +// ============================================================================= + +// Wait for all GPU operations on the context to complete +gnark_gpu_error_t gnark_gpu_sync(gnark_gpu_context_t ctx); + +// ============================================================================= +// Multi-stream support +// ============================================================================= + +#define GNARK_GPU_MAX_STREAMS 4 +#define GNARK_GPU_MAX_EVENTS 16 + +// Create a CUDA stream at the given index. Stream 0 is created automatically. +// stream_id must be in [1, GNARK_GPU_MAX_STREAMS). +gnark_gpu_error_t gnark_gpu_create_stream(gnark_gpu_context_t ctx, int stream_id); + +// Record an event on a stream. The event can later be waited on by another stream. +// event_id must be in [0, GNARK_GPU_MAX_EVENTS). +gnark_gpu_error_t gnark_gpu_record_event(gnark_gpu_context_t ctx, int stream_id, int event_id); + +// Make a stream wait for an event recorded on another stream. +gnark_gpu_error_t gnark_gpu_wait_event(gnark_gpu_context_t ctx, int stream_id, int event_id); + +// Synchronize a specific stream (wait for all operations on it to complete). +gnark_gpu_error_t gnark_gpu_sync_stream(gnark_gpu_context_t ctx, int stream_id); + +// ============================================================================= +// Stream-aware data transfer +// ============================================================================= + +// Copy from host (AoS) to device (SoA) on a specific stream. +// For truly async transfers, host_data should be pinned memory. +gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_device_stream( + gnark_gpu_fr_vector_t vec, const uint64_t *host_data, + size_t count, int stream_id); + +// Copy from device (SoA) to host (AoS) on a specific stream. +// Synchronizes the stream before returning to ensure data is available. +gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_host_stream( + gnark_gpu_fr_vector_t vec, uint64_t *host_data, + size_t count, int stream_id); + +// Device-to-device copy on a specific stream. +gnark_gpu_error_t gnark_gpu_fr_vector_copy_d2d_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src, int stream_id); + +// ============================================================================= +// Stream-aware NTT operations +// ============================================================================= + +gnark_gpu_error_t gnark_gpu_ntt_forward_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + int stream_id); + +gnark_gpu_error_t gnark_gpu_ntt_inverse_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + int stream_id); + +gnark_gpu_error_t gnark_gpu_ntt_bit_reverse_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + int stream_id); + +// ============================================================================= +// Stream-aware arithmetic operations +// ============================================================================= + +gnark_gpu_error_t gnark_gpu_fr_vector_scale_by_powers_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + const uint64_t g[4], int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_scalar_mul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + const uint64_t c[4], int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_set_zero_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_add_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_sub_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_mul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_addmul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id); + +gnark_gpu_error_t gnark_gpu_fr_vector_batch_invert_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t temp, int stream_id); + +// ============================================================================= +// AddScalarMul: v[i] += a[i] * scalar (broadcast scalar multiply-add) +// ============================================================================= + +gnark_gpu_error_t gnark_gpu_fr_vector_add_scalar_mul( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, const uint64_t scalar[4]); + +gnark_gpu_error_t gnark_gpu_fr_vector_add_scalar_mul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, const uint64_t scalar[4], int stream_id); + +// ============================================================================= +// Stream-aware PlonK operations +// ============================================================================= + +gnark_gpu_error_t gnark_gpu_compute_l1_den_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t out, + const uint64_t coset_gen[4], gnark_gpu_ntt_domain_t domain, + int stream_id); + +gnark_gpu_error_t gnark_gpu_plonk_perm_boundary_stream( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t L, gnark_gpu_fr_vector_t R, gnark_gpu_fr_vector_t O, + gnark_gpu_fr_vector_t Z, + gnark_gpu_fr_vector_t S1, gnark_gpu_fr_vector_t S2, gnark_gpu_fr_vector_t S3, + gnark_gpu_fr_vector_t L1_denInv, + const uint64_t params[28], + gnark_gpu_ntt_domain_t domain, int stream_id); + +// ============================================================================= +// PlonK2 curve-generic Fr primitives +// ============================================================================= + +// The plonk2 API is curve-indexed and uses raw AoS host buffers. Element +// width is inferred from curve_id: +// BN254 Fr: 4 uint64 limbs +// BLS12-377 Fr: 4 uint64 limbs +// BW6-761 Fr: 6 uint64 limbs +// +// This API intentionally does not replace the BLS12-377-specialized plonk API +// above. It is the validation-first foundation for generalized FFT/MSM work. + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_alloc( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + size_t count, + gnark_gpu_plonk2_fr_vector_t *vec); + +void gnark_gpu_plonk2_fr_vector_free(gnark_gpu_plonk2_fr_vector_t vec); + +size_t gnark_gpu_plonk2_fr_vector_len(gnark_gpu_plonk2_fr_vector_t vec); + +int gnark_gpu_plonk2_fr_vector_limbs(gnark_gpu_plonk2_fr_vector_t vec); + +gnark_gpu_plonk2_curve_id_t gnark_gpu_plonk2_fr_vector_curve( + gnark_gpu_plonk2_fr_vector_t vec); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_to_device( + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *host_data, + size_t count); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_to_host( + gnark_gpu_plonk2_fr_vector_t vec, + uint64_t *host_data, + size_t count); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_d2d( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t dst, + gnark_gpu_plonk2_fr_vector_t src); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_set_zero( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_add( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_sub( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_mul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_addmul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_scalar_mul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *scalar); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_add_scalar_mul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + gnark_gpu_plonk2_fr_vector_t a, + const uint64_t *scalar); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_batch_invert( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + gnark_gpu_plonk2_fr_vector_t temp); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_butterfly4_inverse( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t b0, + gnark_gpu_plonk2_fr_vector_t b1, + gnark_gpu_plonk2_fr_vector_t b2, + gnark_gpu_plonk2_fr_vector_t b3, + const uint64_t *omega4_inv, + const uint64_t *quarter); + +gnark_gpu_error_t gnark_gpu_plonk2_reduce_blinded_coset( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t dst, + gnark_gpu_plonk2_fr_vector_t src, + const uint64_t *tail, + size_t tail_len, + const uint64_t *coset_pow_n); + +gnark_gpu_error_t gnark_gpu_plonk2_compute_l1_den( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t out, + const uint64_t *coset_gen); + +gnark_gpu_error_t gnark_gpu_plonk2_gate_accum( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t ql, + gnark_gpu_plonk2_fr_vector_t qr, + gnark_gpu_plonk2_fr_vector_t qm, + gnark_gpu_plonk2_fr_vector_t qo, + gnark_gpu_plonk2_fr_vector_t qk, + gnark_gpu_plonk2_fr_vector_t l, + gnark_gpu_plonk2_fr_vector_t r, + gnark_gpu_plonk2_fr_vector_t o, + const uint64_t *zh_k_inv); + +gnark_gpu_error_t gnark_gpu_plonk2_linearize_static( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t z, + gnark_gpu_plonk2_fr_vector_t s3, + gnark_gpu_plonk2_fr_vector_t ql, + gnark_gpu_plonk2_fr_vector_t qr, + gnark_gpu_plonk2_fr_vector_t qm, + gnark_gpu_plonk2_fr_vector_t qo, + gnark_gpu_plonk2_fr_vector_t qk, + const uint64_t *scalars); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_subtract_head( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *tail, + size_t tail_len); + +gnark_gpu_error_t gnark_gpu_plonk2_perm_boundary( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t l, + gnark_gpu_plonk2_fr_vector_t r, + gnark_gpu_plonk2_fr_vector_t o, + gnark_gpu_plonk2_fr_vector_t z, + gnark_gpu_plonk2_fr_vector_t s1, + gnark_gpu_plonk2_fr_vector_t s2, + gnark_gpu_plonk2_fr_vector_t s3, + gnark_gpu_plonk2_fr_vector_t l1_den_inv, + const uint64_t *params, + gnark_gpu_plonk2_ntt_domain_t domain); + +gnark_gpu_error_t gnark_gpu_plonk2_z_compute_factors( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t l_inout, + gnark_gpu_plonk2_fr_vector_t r_inout, + gnark_gpu_plonk2_fr_vector_t o_in, + const void *d_perm, + const uint64_t *params, + unsigned log2n, + gnark_gpu_plonk2_ntt_domain_t domain); + +gnark_gpu_error_t gnark_gpu_plonk2_z_prefix_phase1( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t z_vec, + gnark_gpu_plonk2_fr_vector_t ratio_vec, + uint64_t *chunk_products_host, + size_t *num_chunks_out); + +gnark_gpu_error_t gnark_gpu_plonk2_z_prefix_phase3( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t z_vec, + gnark_gpu_plonk2_fr_vector_t temp_vec, + const uint64_t *scanned_prefixes_host, + size_t num_chunks); + +gnark_gpu_error_t gnark_gpu_plonk2_poly_eval_chunks( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t coeffs, + const uint64_t *z, + uint64_t *partials_host, + size_t *num_chunks_out); + +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_scale_by_powers( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *generator); + +gnark_gpu_error_t gnark_gpu_plonk2_ntt_domain_create( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + size_t size, + const uint64_t *fwd_twiddles_aos, + const uint64_t *inv_twiddles_aos, + const uint64_t *inv_n, + gnark_gpu_plonk2_ntt_domain_t *domain); + +void gnark_gpu_plonk2_ntt_domain_destroy(gnark_gpu_plonk2_ntt_domain_t domain); + +size_t gnark_gpu_plonk2_ntt_domain_size(gnark_gpu_plonk2_ntt_domain_t domain); + +gnark_gpu_plonk2_curve_id_t gnark_gpu_plonk2_ntt_domain_curve( + gnark_gpu_plonk2_ntt_domain_t domain); + +gnark_gpu_error_t gnark_gpu_plonk2_ntt_forward( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data); + +gnark_gpu_error_t gnark_gpu_plonk2_ntt_inverse( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data); + +gnark_gpu_error_t gnark_gpu_plonk2_ntt_bit_reverse( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data); + +// Build the BLS12-377 MiMC leaves and complete Merkle tree used by the +// PI-interconnection Vortex commitment path. +// +// col_hashes is a num_leaves*chunk_size array of BLS12-377 scalar-field +// elements in gnark-crypto AoS Montgomery layout. constants is the 62 MiMC +// round constants in the same layout. out_nodes receives field elements in +// bottom-up level order: +// leaves, parents, grandparents, ..., root +// and must have room for (2*num_leaves-1) field elements. +gnark_gpu_error_t gnark_gpu_bls12377_mimc_sis_tree( + gnark_gpu_context_t ctx, + const uint64_t *col_hashes, + size_t num_leaves, + size_t chunk_size, + const uint64_t *constants, + uint64_t *out_nodes); + +// Compute the PI-interconnection BLS12-377 ring-SIS transversal hash for +// degree=64/logTwoBound=16, then build the MiMC leaves and complete Merkle +// tree. row_ptrs contains host pointers to row-major Regular smartvector data +// for row_kinds[i] == 0. row_constants contains one BLS12-377 field element per +// row for row_kinds[i] == 1. out_col_hashes receives num_cols*64 field +// elements; out_nodes receives (2*num_cols-1) field elements in bottom-up +// level order. +gnark_gpu_error_t gnark_gpu_bls12377_sis_mimc_tree( + gnark_gpu_context_t ctx, + const uintptr_t *row_ptrs, + const uint8_t *row_kinds, + const uint64_t *row_constants, + size_t num_rows, + size_t num_cols, + const uint64_t *ag, + size_t num_polys, + const uint64_t *twiddles, + const uint64_t *twiddles_inv, + const uint64_t *coset, + const uint64_t *coset_inv, + const uint64_t *cardinality_inv, + const uint64_t *mimc_constants, + uint64_t *out_col_hashes, + uint64_t *out_nodes); + +// Test/validation entrypoints for the curve-generic G1 foundation. +// Inputs are gnark-crypto G1Affine raw memory: X limbs then Y limbs, both in +// Montgomery form. Output is Jacobian/projective raw memory: X, Y, Z. +gnark_gpu_error_t gnark_gpu_plonk2_test_g1_affine_add( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *p, + const uint64_t *q, + uint64_t *out); + +gnark_gpu_error_t gnark_gpu_plonk2_test_g1_affine_double( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *p, + uint64_t *out); + +gnark_gpu_error_t gnark_gpu_plonk2_test_msm_naive( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *points, + const uint64_t *scalars, + size_t count, + uint64_t *out); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_pippenger( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_create( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *points, + size_t point_count, + int window_bits, + gnark_gpu_plonk2_msm_t *msm); + +void gnark_gpu_plonk2_msm_destroy(gnark_gpu_plonk2_msm_t msm); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_pin_work_buffers( + gnark_gpu_plonk2_msm_t msm); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_release_work_buffers( + gnark_gpu_plonk2_msm_t msm); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_offload_points( + gnark_gpu_plonk2_msm_t msm); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_reload_points( + gnark_gpu_plonk2_msm_t msm, + const uint64_t *points, + size_t point_count); + +gnark_gpu_error_t gnark_gpu_plonk2_msm_run( + gnark_gpu_plonk2_msm_t msm, + const uint64_t *scalars, + size_t count, + uint64_t *out); + +// Per-phase timings of the last gnark_gpu_plonk2_msm_run call. Phase order +// matches gnark_gpu_msm_get_phase_timings. +// Returns the number of phases written (9 on success, 0 if msm/out is null). +int gnark_gpu_plonk2_msm_get_phase_timings( + gnark_gpu_plonk2_msm_t msm, + float *out); + +// ============================================================================= +// GPU Z prefix product (two-level parallel scan) +// ============================================================================= + +// Phase 1: Compute local prefix products and extract chunk products. +// z_vec: output vector (n elements), receives local prefix products +// ratio_vec: input vector (n elements), the per-element ratios +// chunk_products_host: output (num_chunks * 4 uint64s, AoS), chunk products downloaded to host +// num_chunks_out: output, number of chunks +// After this call, the host must compute a sequential prefix scan of chunk_products_host, +// then call gnark_gpu_z_prefix_phase3 with the scanned prefixes. +gnark_gpu_error_t gnark_gpu_z_prefix_phase1( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t z_vec, + gnark_gpu_fr_vector_t ratio_vec, + uint64_t *chunk_products_host, + size_t *num_chunks_out); + +// Phase 3: Upload scanned chunk prefixes and apply fixup + shift. +// z_vec: the vector from phase1 (modified in-place) +// temp_vec: scratch vector (same size as z_vec) +// scanned_prefixes_host: the CPU-scanned prefix products (num_chunks * 4 uint64s, AoS) +// num_chunks: from phase1 +gnark_gpu_error_t gnark_gpu_z_prefix_phase3( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t z_vec, + gnark_gpu_fr_vector_t temp_vec, + const uint64_t *scanned_prefixes_host, + size_t num_chunks); + +// ============================================================================= +// GPU polynomial evaluation (chunked Horner) +// ============================================================================= + +// Evaluate a polynomial at a single point using chunked Horner on GPU. +// coeffs: FrVector of n coefficients (on GPU, SoA format). +// z: evaluation point (4 uint64s, Montgomery form). +// partials_host: output buffer for partial chunk results (num_chunks * 4 uint64s, AoS). +// Caller must pre-allocate at least ceil(n/1024) * 4 uint64s. +// num_chunks_out: output, number of chunks. +// After this call, the caller combines partials on CPU: +// zK = z^1024 +// result = partials[C-1] +// for j = C-2 downto 0: result = result * zK + partials[j] +gnark_gpu_error_t gnark_gpu_poly_eval_chunks( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t coeffs, + const uint64_t z[4], + uint64_t *partials_host, + size_t *num_chunks_out); + +gnark_gpu_error_t gnark_gpu_poly_eval_chunks_stream( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t coeffs, + const uint64_t z[4], + uint64_t *partials_host, + size_t *num_chunks_out, + int stream_id); + +// ============================================================================= +// Fused gate constraint accumulation for PlonK quotient +// ============================================================================= + +// Compute result[i] = (result[i] + Ql[i]*L[i] + Qr[i]*R[i] + Qm[i]*L[i]*R[i] +// + Qo[i]*O[i] + Qk[i]) * zhKInv +// in a single pass. result already contains permutation+boundary contributions. +// zhKInv is the inverse of Z_H(coset_gen^n - 1), 4 uint64s in Montgomery form. +gnark_gpu_error_t gnark_gpu_plonk_gate_accum( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t Ql, gnark_gpu_fr_vector_t Qr, + gnark_gpu_fr_vector_t Qm, gnark_gpu_fr_vector_t Qo, + gnark_gpu_fr_vector_t Qk, + gnark_gpu_fr_vector_t L, gnark_gpu_fr_vector_t R, gnark_gpu_fr_vector_t O, + const uint64_t zhKInv[4]); + +// ============================================================================= +// Reduce blinded polynomial for coset evaluation +// +// Computes: dst[i] = src[i] + src[n+j] * cosetPowN for j in [0, tail_len) +// dst[i] = src[i] for i in [tail_len, n) +// +// src: GPU FrVector of length n (first n coefficients of blinded poly) +// blinding_tail_host: pointer to tail coefficients in AoS layout (host memory) +// tail_len: number of tail coefficients (typically 2 or 3) +// cosetPowN: coset generator raised to power n (4 uint64s, Montgomery form) +// dst: output FrVector of length n (receives reduced coefficients) +// ============================================================================= + +gnark_gpu_error_t gnark_gpu_reduce_blinded_coset( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src, + const uint64_t *blinding_tail_host, + size_t tail_len, + const uint64_t cosetPowN[4]); + +gnark_gpu_error_t gnark_gpu_reduce_blinded_coset_stream( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src, + const uint64_t *blinding_tail_host, + size_t tail_len, + const uint64_t cosetPowN[4], + int stream_id); + +// ============================================================================= +// GPU Horner quotient: h(X) = (p(X) - p(z)) / (X - z) in-place +// +// Computes the quotient polynomial on GPU. The input FrVector is modified +// in-place: after completion, poly[0] = p(z) (evaluation), poly[1:] = quotient. +// ============================================================================= + +gnark_gpu_error_t gnark_gpu_horner_quotient( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t poly, + gnark_gpu_fr_vector_t temp, + const uint64_t z[4]); + +// ============================================================================= +// GPU memory info +// ============================================================================= + +// Query free and total GPU memory in bytes. +gnark_gpu_error_t gnark_gpu_mem_get_info(gnark_gpu_context_t ctx, + size_t *free_bytes, size_t *total_bytes); + +// ============================================================================= +// Stream-aware plonk2 primitives (plonk2 quotient pipeline) +// ============================================================================= + +// D2D copy on a specific stream. +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_d2d_stream( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t dst, + gnark_gpu_plonk2_fr_vector_t src, + int stream_id); + +// Forward NTT on a specific stream. +gnark_gpu_error_t gnark_gpu_plonk2_ntt_forward_stream( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data, + int stream_id); + +// Inverse NTT on a specific stream. +gnark_gpu_error_t gnark_gpu_plonk2_ntt_inverse_stream( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data, + int stream_id); + +// Bit-reverse permutation on a specific stream. +gnark_gpu_error_t gnark_gpu_plonk2_ntt_bit_reverse_stream( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data, + int stream_id); + +// Scale-by-powers on a specific stream (for CosetFFT). +gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_scale_by_powers_stream( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *generator, + int stream_id); + +#ifdef __cplusplus +} +#endif + +#endif // GNARK_GPU_H diff --git a/prover/gpu/cuda/include/gnark_gpu_kb.h b/prover/gpu/cuda/include/gnark_gpu_kb.h new file mode 100644 index 00000000000..7422fe643d1 --- /dev/null +++ b/prover/gpu/cuda/include/gnark_gpu_kb.h @@ -0,0 +1,320 @@ +// gnark-gpu KoalaBear + Vortex C API +// +// KoalaBear: P = 2³¹ − 2²⁴ + 1, single uint32 Montgomery elements. +// Vectors are flat uint32_t arrays (no SoA/AoS distinction for 1-limb field). +// E4 elements are 4 consecutive uint32: (b0.a0, b0.a1, b1.a0, b1.a1). + +#ifndef GNARK_GPU_KB_H +#define GNARK_GPU_KB_H + +#include +#include + +#ifdef __cplusplus +extern "C" { +#endif + +// Reuse context from main gnark_gpu if available, otherwise define here. +#ifndef GNARK_GPU_H +typedef struct GnarkGPUContext *gnark_gpu_context_t; +typedef enum { + KB_SUCCESS = 0, + KB_ERROR_CUDA = 1, + KB_ERROR_INVALID = 2, + KB_ERROR_OOM = 3, + KB_ERROR_SIZE = 4, +} kb_error_t; +#else +typedef gnark_gpu_error_t kb_error_t; +#define KB_SUCCESS GNARK_GPU_SUCCESS +#define KB_ERROR_CUDA GNARK_GPU_ERROR_CUDA +#define KB_ERROR_INVALID GNARK_GPU_ERROR_INVALID_ARG +#define KB_ERROR_OOM GNARK_GPU_ERROR_OUT_OF_MEMORY +#define KB_ERROR_SIZE GNARK_GPU_ERROR_SIZE_MISMATCH +#endif + +// ═══════════════════════════════════════════════════════════════════════════ +// KoalaBear vector (flat uint32 on GPU) +// ═══════════════════════════════════════════════════════════════════════════ + +typedef struct KBVec *kb_vec_t; + +kb_error_t kb_vec_alloc(gnark_gpu_context_t ctx, size_t n, kb_vec_t *out); +void kb_vec_free (kb_vec_t v); +size_t kb_vec_len (kb_vec_t v); + +kb_error_t kb_vec_h2d(gnark_gpu_context_t ctx, kb_vec_t dst, + const uint32_t *src, size_t n); +kb_error_t kb_vec_h2d_pinned(gnark_gpu_context_t ctx, kb_vec_t dst, + const uint32_t *src, size_t n); + +// Page-locked host memory for fast H2D transfers. +kb_error_t kb_pinned_alloc(size_t bytes, uint32_t **out); +void kb_pinned_free (uint32_t *ptr); +kb_error_t kb_vec_d2h(gnark_gpu_context_t ctx, uint32_t *dst, + kb_vec_t src, size_t n); +kb_error_t kb_vec_d2d(gnark_gpu_context_t ctx, kb_vec_t dst, kb_vec_t src); +kb_error_t kb_vec_d2d_offset(gnark_gpu_context_t ctx, uint32_t *dst, + const uint32_t *src, size_t n); +kb_error_t kb_vec_d2h_raw(gnark_gpu_context_t ctx, uint32_t *dst, + const uint32_t *src, size_t n); +kb_error_t kb_sync(gnark_gpu_context_t ctx); + +kb_error_t kb_vec_add(gnark_gpu_context_t ctx, kb_vec_t c, kb_vec_t a, kb_vec_t b); +kb_error_t kb_vec_sub(gnark_gpu_context_t ctx, kb_vec_t c, kb_vec_t a, kb_vec_t b); +kb_error_t kb_vec_mul(gnark_gpu_context_t ctx, kb_vec_t c, kb_vec_t a, kb_vec_t b); +kb_error_t kb_vec_scale(gnark_gpu_context_t ctx, kb_vec_t v, uint32_t scalar); +kb_error_t kb_vec_scale_by_powers(gnark_gpu_context_t ctx, kb_vec_t v, uint32_t g); +kb_error_t kb_vec_batch_invert(gnark_gpu_context_t ctx, kb_vec_t v, kb_vec_t temp); +kb_error_t kb_vec_bitrev(gnark_gpu_context_t ctx, kb_vec_t v); + +// ═══════════════════════════════════════════════════════════════════════════ +// NTT domain +// ═══════════════════════════════════════════════════════════════════════════ + +typedef struct KBNtt *kb_ntt_t; + +kb_error_t kb_ntt_init(gnark_gpu_context_t ctx, size_t n, + const uint32_t *fwd_twiddles, + const uint32_t *inv_twiddles, + kb_ntt_t *out); +void kb_ntt_free(kb_ntt_t d); + +kb_error_t kb_ntt_fwd(gnark_gpu_context_t ctx, kb_ntt_t d, kb_vec_t v); +kb_error_t kb_ntt_inv(gnark_gpu_context_t ctx, kb_ntt_t d, kb_vec_t v); +kb_error_t kb_ntt_coset_fwd(gnark_gpu_context_t ctx, kb_ntt_t d, + kb_vec_t v, uint32_t g); +kb_error_t kb_ntt_coset_fwd_raw(gnark_gpu_context_t ctx, kb_ntt_t d, + uint32_t *data, uint32_t g); +kb_error_t kb_vec_bitrev_raw(gnark_gpu_context_t ctx, uint32_t *data, size_t n); + +// Batch NTT: operates on `batch` vectors packed contiguously in `data`. +// Each vector has `n` uint32 elements. Single call, all kernels queued async. +kb_error_t kb_ntt_batch_coset_fwd_bitrev(gnark_gpu_context_t ctx, kb_ntt_t d, + uint32_t *data, size_t n, size_t batch, + uint32_t g); +kb_error_t kb_ntt_batch_ifft_scale(gnark_gpu_context_t ctx, kb_ntt_t d, + uint32_t *data, size_t n, size_t batch, + uint32_t nInv); + +// ═══════════════════════════════════════════════════════════════════════════ +// Poseidon2 (batch operations) +// ═══════════════════════════════════════════════════════════════════════════ + +typedef struct KBPoseidon2 *kb_p2_t; + +kb_error_t kb_p2_init(gnark_gpu_context_t ctx, int width, + int nb_full_rounds, int nb_partial_rounds, + const uint32_t *round_keys, + const uint32_t *diag, + kb_p2_t *out); +void kb_p2_free(kb_p2_t p); + +kb_error_t kb_p2_compress_batch(gnark_gpu_context_t ctx, kb_p2_t p, + const uint32_t *input, uint32_t *output, + size_t count); + +kb_error_t kb_p2_sponge_batch(gnark_gpu_context_t ctx, kb_p2_t p, + const uint32_t *input, size_t input_len, + uint32_t *output, size_t count); + +// ═══════════════════════════════════════════════════════════════════════════ +// Ring-SIS hash +// ═══════════════════════════════════════════════════════════════════════════ + +typedef struct KBSis *kb_sis_t; + +kb_error_t kb_sis_init(gnark_gpu_context_t ctx, + int degree, int n_polys, int log_two_bound, + const uint32_t *ag, + const uint32_t *fwd_tw, + const uint32_t *inv_tw, + const uint32_t *coset_table, + const uint32_t *coset_inv, + kb_sis_t *out); +void kb_sis_free(kb_sis_t s); + +// ═══════════════════════════════════════════════════════════════════════════ +// Vortex commit pipeline +// ═══════════════════════════════════════════════════════════════════════════ + +kb_error_t kb_merkle_build(gnark_gpu_context_t ctx, kb_p2_t p, + const uint32_t *leaves, size_t n_leaves, + uint32_t *tree_buf); + +// Pre-allocated pipeline: GPU RS encode + SIS + Merkle in one call. +// RS encode runs on GPU via batch NTT (eliminates CPU RS, halves H2D data). +typedef struct KBVortexPipeline *kb_vortex_pipeline_t; + +kb_error_t kb_vortex_pipeline_init(gnark_gpu_context_t ctx, + kb_sis_t sis, + kb_p2_t p2_sponge, + kb_p2_t p2_compress, + size_t max_n_rows, + size_t n_cols, + int rate, + const uint32_t *rs_fwd_tw, + const uint32_t *rs_inv_tw, + const uint32_t *scaled_coset_br, + kb_vortex_pipeline_t *out); +void kb_vortex_pipeline_free(kb_vortex_pipeline_t p); + +// Pinned host buffer accessors (for zero-copy Go slice wrapping). +uint32_t *kb_vortex_pipeline_input_buf(kb_vortex_pipeline_t p); +uint32_t *kb_vortex_pipeline_tree_buf(kb_vortex_pipeline_t p); + +// Full GPU vortex commit: RS encode + SIS hash + Merkle tree. +// raw_rows: [n_rows × n_cols], host pinned (from input_buf), Montgomery. +// SIS hashes stay on device (sponge reads d_sis). Tree → pinned h_tree. +// Encoded matrix retained on GPU for Prove (lincomb, extract_col). +// Set KB_VORTEX_TIMING=1 for per-phase timing on stderr. +kb_error_t kb_vortex_commit(kb_vortex_pipeline_t pipeline, + const uint32_t *raw_rows, + size_t n_rows); + +// GPU Prove: linear combination on encoded matrix (kept on device after commit). +// result[j] = Σᵢ αⁱ · encoded[j][i] ∈ E4, j ∈ [0, scw) +kb_error_t kb_vortex_lincomb(kb_vortex_pipeline_t pipeline, + size_t n_rows, + const uint32_t alpha[4], + uint32_t *result); + +// GPU Prove: extract single column from encoded matrix to host. +kb_error_t kb_vortex_extract_col(kb_vortex_pipeline_t pipeline, + size_t n_rows, int col_idx, + uint32_t *out); + +// Extract full encoded matrix from GPU to host in column-major layout. +// out: [scw × n_rows] uint32, column-major: out[col * n_rows + row]. +kb_error_t kb_vortex_extract_all(kb_vortex_pipeline_t pipeline, + size_t n_rows, uint32_t *out); + +// Extract full encoded matrix from GPU to host in row-major layout. +// Transposes on GPU before D2H to avoid costly CPU transposition. +// out: [n_rows × scw] uint32, row-major: out[row * scw + col]. +kb_error_t kb_vortex_extract_all_rowmajor(kb_vortex_pipeline_t pipeline, + size_t n_rows, uint32_t *out); + +// Return sizeCodeWord for the pipeline. +size_t kb_vortex_scw(kb_vortex_pipeline_t pipeline); + +// Set multi-coset scaling tables for rate > 2 RS encoding. +// coset_tables: [(rate-1) × n_cols] flat array, table k at offset k*n_cols. +// Each table k: coset_k_br[j] = (Ω^{k+1})^{bitrev(j)} / n (bit-reversed, normalized). +// Must be called before kb_vortex_commit when rate > 2. +kb_error_t kb_vortex_pipeline_set_coset_tables(kb_vortex_pipeline_t p, + const uint32_t *coset_tables, + size_t n_tables); + +// Commit + async extract: overlaps D2H of encoded/SIS/leaves with compute. +// After return, pinned host buffers contain the results. +// Use kb_vortex_h_*_pinned() to get pointers to the pinned buffers. +kb_error_t kb_vortex_commit_and_extract(kb_vortex_pipeline_t pipeline, + const uint32_t *raw_rows, + size_t n_rows); + +// Accessors for pinned host buffers (valid after kb_vortex_commit_and_extract). +uint32_t *kb_vortex_h_enc_pinned(kb_vortex_pipeline_t pipeline); +uint32_t *kb_vortex_h_sis_pinned(kb_vortex_pipeline_t pipeline); +uint32_t *kb_vortex_h_leaves_pinned(kb_vortex_pipeline_t pipeline); + +// Extract SIS column hashes from GPU to host. +// out: flat [scw × degree] uint32, same layout as d_sis. +kb_error_t kb_vortex_extract_sis(kb_vortex_pipeline_t pipeline, + size_t n_rows, uint32_t *out); + +// Extract leaf hashes (Poseidon2 digests) from GPU to host. +// out: flat [scw × 8] uint32. +kb_error_t kb_vortex_extract_leaves(kb_vortex_pipeline_t pipeline, + uint32_t *out); + +// Return degree (SIS polynomial degree) for the pipeline. +int kb_vortex_degree(kb_vortex_pipeline_t pipeline); + +// Get raw device pointer to pipeline's column-major encoded matrix. +// Layout: d_encoded[col * n_rows + row], col ∈ [0, scw), row ∈ [0, n_rows). +uint32_t *kb_vortex_encoded_device_ptr(kb_vortex_pipeline_t pipeline); + +// Lincomb from a standalone column-major device buffer (not pipeline-bound). +// result[j] = Σᵢ αⁱ · d_encoded[j * n_rows + i] ∈ E4, j ∈ [0, scw) +kb_error_t kb_lincomb_e4_colmajor(gnark_gpu_context_t ctx, + const uint32_t *d_encoded_col, + size_t n_rows, size_t scw, + const uint32_t alpha[4], + uint32_t *result); + +// Linear combination: result[j] = Σᵢ αⁱ · rows[i][j] +kb_error_t kb_lincomb_e4(gnark_gpu_context_t ctx, + kb_vec_t *rows, size_t n_rows, size_t n_cols, + const uint32_t alpha[4], uint32_t *result); + +// ═══════════════════════════════════════════════════════════════════════════ +// Symbolic expression evaluator (GPU bytecode VM) +// ═══════════════════════════════════════════════════════════════════════════ +// +// Evaluates a compiled arithmetic DAG over n E4 elements in parallel. +// One GPU thread per element, zero warp divergence. +// +// Go compiler CUDA kernel +// ─────────── ─────────── +// ExpressionBoard.Nodes[] kern_symbolic_eval +// │ liveness + regalloc │ +// ▼ │ +// GPUProgram {bytecode, consts} ──H2D──▶ thread i: +// │ E4 slots[S] +// SymInput[] (device ptrs) ──H2D──▶ for pc in pgm: exec(i) +// │ out[i] = slots[R] +// +// Opcodes (same layout as CPU compiler): +// 0 OP_CONST: [0, dst, const_idx] → slots[dst] = consts[ci] +// 1 OP_INPUT: [1, dst, input_id] → slots[dst] = read(inputs[id], i) +// 2 OP_MUL: [2, dst, n, s₀,e₀, ..., sₙ,eₙ] → slots[dst] = Π slots[sₖ]^eₖ +// 3 OP_LINCOMB: [3, dst, n, s₀,c₀, ..., sₙ,cₙ] → slots[dst] = Σ cₖ·slots[sₖ] +// 4 OP_POLYEVAL: [4, dst, n, x, c₀, ..., cₘ] → Horner(x, c₀..cₘ) + +// Self-recursion boards can require up to ~5000 slots. +// Each slot is 16 bytes (E4) in per-thread local memory. +// 8192 × 16 = 128 KB/thread, well within CUDA's 512 KB limit. +#define SYM_MAX_SLOTS 8192 + +// Input descriptor — tells the kernel how to read element [i] for one variable. +// tag=0 (KB): d_ptr[i] → embed as (val,0,0,0) +// tag=1 (CONST_E4): broadcast val[4] → same E4 for all threads +// tag=2 (ROT_KB): d_ptr[(i+offset)%n] → rotated base column +// tag=3 (E4_VEC_AOS): d_ptr[i*4..i*4+3] → E4 AoS vector +// tag=4 (E4_VEC_SOA): d_ptr[c*n + i], c∈{0,1,2,3} → E4 SoA vector +// tag=5 (ROT_E4_SOA): d_ptr[c*n + ((i+offset)%n)] → rotated E4 SoA vector +// tag=6 (ROT_E4_AOS): d_ptr[((i+offset)%n)*4..+3] → rotated E4 AoS vector +typedef struct { + uint32_t *d_ptr; // device pointer to KB elements (NULL for CONST) + uint32_t val[4]; // E4 constant value (tag=CONST only) + uint32_t tag; // 0=KB, 1=CONST_E4, 2=ROTATED_KB, 3=E4_VEC + uint32_t offset; // rotation offset (tag=ROTATED only) +} SymInputDesc; + +// Compiled GPU program handle (device-resident bytecode + constants). +typedef struct KBSymProgram *kb_sym_program_t; + +kb_error_t kb_sym_compile(gnark_gpu_context_t ctx, + const uint32_t *bytecode, uint32_t pgm_len, + const uint32_t *constants, uint32_t num_consts, + uint32_t num_slots, + uint32_t result_slot, + kb_sym_program_t *out); +void kb_sym_free(kb_sym_program_t p); + +// Evaluate: n elements, result written to h_out (host buffer, n × 4 uint32). +kb_error_t kb_sym_eval(gnark_gpu_context_t ctx, + kb_sym_program_t program, + const SymInputDesc *h_inputs, uint32_t num_inputs, + uint32_t n, + uint32_t *h_out); + +// Get raw device pointer from a KBVector (for constructing SymInputDesc). +uint32_t *kb_vec_device_ptr(kb_vec_t v); + +#ifdef __cplusplus +} +#endif + +#endif // GNARK_GPU_KB_H diff --git a/prover/gpu/cuda/src/plonk/api.cu b/prover/gpu/cuda/src/plonk/api.cu new file mode 100644 index 00000000000..d29675ea7c2 --- /dev/null +++ b/prover/gpu/cuda/src/plonk/api.cu @@ -0,0 +1,3381 @@ +// ============================================================================= +// gnark-gpu C API bridge (CGO-facing layer) +// +// Purpose: +// - Keep exported ABI flat and stable (`extern "C"` handles + POD args). +// - Keep CUDA-heavy logic in dedicated modules (`msm.cu`, `ntt.cu`, etc.). +// - Keep this file as a thin router + lifecycle owner. +// +// Layering: +// +// Go wrappers (gpu/*.go) +// | +// v +// C ABI (gnark_gpu.h / this file) +// | +// v +// Internal launchers + contexts (cuda/src/*.cu, *.cuh) +// +// Handle model: +// GnarkGPUContext -> owns CUDA stream(s), reusable staging buffers +// GnarkGPUFrVector -> owns SoA limb allocations +// GnarkGPUMSM -> owns persistent point buffers + MSM work buffers +// GnarkGPUNTTDomain-> owns twiddle tables for one domain size +// +// Design rule: +// No algorithmic complexity here. This file validates arguments, dispatches +// to kernels, and translates CUDA/launcher failures to API error codes. +// ============================================================================= + +#include "gnark_gpu.h" +#include "field.cuh" +#include "../plonk2/field.cuh" +#include +#include + +namespace gnark_gpu { + +// Forward declarations for kernel launchers (defined in kernels.cu) +void launch_mul_mont_fr(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t *c3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, + const uint64_t *a3, const uint64_t *b0, const uint64_t *b1, + const uint64_t *b2, const uint64_t *b3, size_t n, + cudaStream_t stream); + +void launch_add_fr(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t *c3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, + const uint64_t *a3, const uint64_t *b0, const uint64_t *b1, + const uint64_t *b2, const uint64_t *b3, size_t n, cudaStream_t stream); + +void launch_sub_fr(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t *c3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, + const uint64_t *a3, const uint64_t *b0, const uint64_t *b1, + const uint64_t *b2, const uint64_t *b3, size_t n, cudaStream_t stream); + +void launch_transpose_aos_to_soa_fr(uint64_t *limb0, uint64_t *limb1, uint64_t *limb2, + uint64_t *limb3, const uint64_t *aos_data, size_t count, + cudaStream_t stream); + +void launch_transpose_soa_to_aos_fr(uint64_t *aos_data, const uint64_t *limb0, + const uint64_t *limb1, const uint64_t *limb2, + const uint64_t *limb3, size_t count, + cudaStream_t stream); + +// Forward declarations for new Fr operations (defined in fr_ops.cu) +void launch_scale_by_powers(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t g[4], size_t n, cudaStream_t stream); +void launch_scalar_mul(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t c[4], size_t n, cudaStream_t stream); +void launch_addmul(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, const uint64_t *a3, + const uint64_t *b0, const uint64_t *b1, const uint64_t *b2, const uint64_t *b3, + size_t n, cudaStream_t stream); +cudaError_t launch_batch_invert(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + uint64_t *orig0, uint64_t *orig1, uint64_t *orig2, uint64_t *orig3, + size_t n, cudaStream_t stream); +struct PlonkPermBoundaryParams { + uint64_t alpha[4]; + uint64_t beta[4]; + uint64_t gamma[4]; + uint64_t l1_scalar[4]; + uint64_t coset_shift[4]; + uint64_t coset_shift_sq[4]; + uint64_t coset_gen[4]; +}; +void launch_plonk_perm_boundary( + uint64_t *res0, uint64_t *res1, uint64_t *res2, uint64_t *res3, + const uint64_t *L0, const uint64_t *L1, const uint64_t *L2, const uint64_t *L3, + const uint64_t *R0, const uint64_t *R1, const uint64_t *R2, const uint64_t *R3, + const uint64_t *O0, const uint64_t *O1, const uint64_t *O2, const uint64_t *O3, + const uint64_t *Z0, const uint64_t *Z1, const uint64_t *Z2, const uint64_t *Z3, + const uint64_t *S1_0, const uint64_t *S1_1, const uint64_t *S1_2, const uint64_t *S1_3, + const uint64_t *S2_0, const uint64_t *S2_1, const uint64_t *S2_2, const uint64_t *S2_3, + const uint64_t *S3_0, const uint64_t *S3_1, const uint64_t *S3_2, const uint64_t *S3_3, + const uint64_t *dinv0, const uint64_t *dinv1, const uint64_t *dinv2, const uint64_t *dinv3, + const PlonkPermBoundaryParams ¶ms, + const uint64_t *tw0, const uint64_t *tw1, const uint64_t *tw2, const uint64_t *tw3, + size_t n, cudaStream_t stream); + +struct PlonkZRatioParams { + uint64_t beta[4]; + uint64_t gamma[4]; + uint64_t g_mul[4]; + uint64_t g_sq[4]; +}; +void launch_plonk_z_ratio( + uint64_t *LN0, uint64_t *LN1, uint64_t *LN2, uint64_t *LN3, + uint64_t *RD0, uint64_t *RD1, uint64_t *RD2, uint64_t *RD3, + const uint64_t *O0, const uint64_t *O1, const uint64_t *O2, const uint64_t *O3, + const int64_t *d_perm, + const PlonkZRatioParams ¶ms, + const uint64_t *tw0, const uint64_t *tw1, const uint64_t *tw2, const uint64_t *tw3, + size_t n, unsigned log2n, cudaStream_t stream); + +void launch_compute_l1_den( + uint64_t *out0, uint64_t *out1, uint64_t *out2, uint64_t *out3, + const uint64_t *tw0, const uint64_t *tw1, const uint64_t *tw2, const uint64_t *tw3, + const uint64_t cg[4], size_t n, cudaStream_t stream); + +void launch_reduce_blinded_coset( + uint64_t *dst0, uint64_t *dst1, uint64_t *dst2, uint64_t *dst3, + const uint64_t *src0, const uint64_t *src1, + const uint64_t *src2, const uint64_t *src3, + const uint64_t cpn[4], + const uint64_t *tail_device, + uint32_t tail_len, uint32_t n, cudaStream_t stream); + +void launch_add_scalar_mul(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, const uint64_t *a3, + const uint64_t scalar[4], size_t n, cudaStream_t stream); + +void launch_plonk_gate_accum( + uint64_t *res0, uint64_t *res1, uint64_t *res2, uint64_t *res3, + const uint64_t *Ql0, const uint64_t *Ql1, const uint64_t *Ql2, const uint64_t *Ql3, + const uint64_t *Qr0, const uint64_t *Qr1, const uint64_t *Qr2, const uint64_t *Qr3, + const uint64_t *Qm0, const uint64_t *Qm1, const uint64_t *Qm2, const uint64_t *Qm3, + const uint64_t *Qo0, const uint64_t *Qo1, const uint64_t *Qo2, const uint64_t *Qo3, + const uint64_t *Qk0, const uint64_t *Qk1, const uint64_t *Qk2, const uint64_t *Qk3, + const uint64_t *L0, const uint64_t *L1, const uint64_t *L2, const uint64_t *L3, + const uint64_t *R0, const uint64_t *R1, const uint64_t *R2, const uint64_t *R3, + const uint64_t *O0, const uint64_t *O1, const uint64_t *O2, const uint64_t *O3, + const uint64_t zhKInv[4], size_t n, cudaStream_t stream); + +void launch_butterfly4( + uint64_t *b0_0, uint64_t *b0_1, uint64_t *b0_2, uint64_t *b0_3, + uint64_t *b1_0, uint64_t *b1_1, uint64_t *b1_2, uint64_t *b1_3, + uint64_t *b2_0, uint64_t *b2_1, uint64_t *b2_2, uint64_t *b2_3, + uint64_t *b3_0, uint64_t *b3_1, uint64_t *b3_2, uint64_t *b3_3, + const uint64_t omega4_inv[4], const uint64_t quarter[4], + size_t n, cudaStream_t stream); + +// Forward declarations for Z prefix product (defined in plonk_z.cu) +cudaError_t launch_z_prefix_phase1( + uint64_t *z0, uint64_t *z1, uint64_t *z2, uint64_t *z3, + const uint64_t *r0, const uint64_t *r1, const uint64_t *r2, const uint64_t *r3, + uint64_t *cp[4], + size_t n, cudaStream_t stream); +cudaError_t launch_z_prefix_phase3( + uint64_t *z0, uint64_t *z1, uint64_t *z2, uint64_t *z3, + uint64_t *temp0, uint64_t *temp1, uint64_t *temp2, uint64_t *temp3, + uint64_t *sp[4], + size_t num_chunks, size_t n, cudaStream_t stream); + +// Forward declarations for polynomial evaluation (defined in plonk_eval.cu) +void launch_poly_eval_chunks( + const uint64_t *c0, const uint64_t *c1, + const uint64_t *c2, const uint64_t *c3, + const uint64_t z[4], + uint64_t *out0, uint64_t *out1, + uint64_t *out2, uint64_t *out3, + size_t n, size_t *num_chunks_out, + cudaStream_t stream); + +// Forward declarations for MSM functions (defined in msm.cu) +struct MSMContext; +struct G1EdExtended; +MSMContext *msm_create(size_t max_points); +void msm_destroy(MSMContext *ctx); +void msm_load_points(MSMContext *ctx, const void *host_points, size_t count, cudaStream_t stream); +void msm_upload_scalars(MSMContext *ctx, const uint64_t *host_scalars, size_t n, cudaStream_t stream); +void launch_msm(MSMContext *ctx, size_t n, cudaStream_t stream); +void msm_download_results(MSMContext *ctx, G1EdExtended *host_results, cudaStream_t stream); +cudaError_t msm_run_full(MSMContext *ctx, const uint64_t *host_scalars, size_t n, + G1EdExtended *host_results, cudaStream_t compute_stream); +void msm_offload_points(MSMContext *ctx); +void msm_unregister_host(MSMContext *ctx); +cudaError_t msm_reload_points(MSMContext *ctx, const void *host_points, size_t count, cudaStream_t stream); +cudaError_t msm_load_points_sw(MSMContext *ctx, const void *host_sw_points, size_t count, cudaStream_t stream); +int msm_get_c(MSMContext *ctx); +int msm_get_num_windows(MSMContext *ctx); +int msm_get_phase_timings(MSMContext *ctx, float *out); +void msm_pin_buffers(MSMContext *ctx); +void msm_release_buffers(MSMContext *ctx); +cudaError_t test_sw_pair_add_run(const uint64_t *p0, const uint64_t *p1, uint64_t *out); +cudaError_t test_sw_to_te_run(const uint64_t *p_sw, uint64_t *out_te); +cudaError_t test_batched_affine_reduce_run(const uint64_t *points_aos, uint64_t *out_aos, int N); + +// Forward declarations for NTT functions (defined in ntt.cu) +struct NTTDomain; +NTTDomain *ntt_domain_create(size_t size, const uint64_t *fwd_twiddles_aos, + const uint64_t *inv_twiddles_aos, const uint64_t inv_n[4], + cudaStream_t stream); +void ntt_domain_destroy(NTTDomain *dom); +void launch_ntt_forward(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + cudaStream_t stream); +void launch_ntt_forward_coset(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + const uint64_t g[4], const uint64_t g_half[4], + cudaStream_t stream); +void launch_ntt_inverse(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + cudaStream_t stream); +void launch_ntt_bit_reverse(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + cudaStream_t stream); +void ntt_get_fwd_twiddles(const NTTDomain *dom, const uint64_t **out_ptrs); + +} // namespace gnark_gpu + +namespace gnark_gpu::plonk2 { + +void launch_copy_aos_to_soa(gnark_gpu_plonk2_curve_id_t curve, FrView dst, + const uint64_t *src, size_t n, cudaStream_t stream); +void launch_copy_soa_to_aos(gnark_gpu_plonk2_curve_id_t curve, uint64_t *dst, + ConstFrView src, size_t n, cudaStream_t stream); +void launch_set_zero(gnark_gpu_plonk2_curve_id_t curve, FrView v, size_t n, + cudaStream_t stream); +void launch_add(gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, + ConstFrView b, size_t n, cudaStream_t stream); +void launch_sub(gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, + ConstFrView b, size_t n, cudaStream_t stream); +void launch_mul(gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, + ConstFrView b, size_t n, cudaStream_t stream); +void launch_addmul(gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, + ConstFrView b, size_t n, cudaStream_t stream); +void launch_scalar_mul(gnark_gpu_plonk2_curve_id_t curve, FrView out, + const uint64_t *scalar, size_t n, cudaStream_t stream); +void launch_add_scalar_mul(gnark_gpu_plonk2_curve_id_t curve, FrView out, + ConstFrView a, const uint64_t *scalar, size_t n, + cudaStream_t stream); +void launch_batch_invert(gnark_gpu_plonk2_curve_id_t curve, FrView data, + size_t n, cudaStream_t stream); +void launch_butterfly4_inverse(gnark_gpu_plonk2_curve_id_t curve, + FrView b0, FrView b1, FrView b2, FrView b3, + const uint64_t *omega4_inv, + const uint64_t *quarter, size_t n, + cudaStream_t stream); +void launch_reduce_blinded_coset(gnark_gpu_plonk2_curve_id_t curve, + FrView dst, ConstFrView src, + const uint64_t *tail, size_t tail_len, + const uint64_t *coset_pow_n, size_t n, + cudaStream_t stream); +void launch_compute_l1_den(gnark_gpu_plonk2_curve_id_t curve, FrView out, + ConstFrView twiddles, const uint64_t *coset_gen, + size_t n, cudaStream_t stream); +void launch_gate_accum(gnark_gpu_plonk2_curve_id_t curve, FrView result, + ConstFrView ql, ConstFrView qr, ConstFrView qm, + ConstFrView qo, ConstFrView qk, ConstFrView l, + ConstFrView r, ConstFrView o, + const uint64_t *zh_k_inv, size_t n, + cudaStream_t stream); +void launch_linearize_static(gnark_gpu_plonk2_curve_id_t curve, FrView result, + ConstFrView z, ConstFrView s3, + ConstFrView ql, ConstFrView qr, + ConstFrView qm, ConstFrView qo, + ConstFrView qk, const uint64_t *scalars, + size_t n, cudaStream_t stream); +void launch_subtract_head(gnark_gpu_plonk2_curve_id_t curve, FrView data, + const uint64_t *tail, size_t tail_len, + cudaStream_t stream); +void launch_perm_boundary(gnark_gpu_plonk2_curve_id_t curve, FrView result, + ConstFrView l, ConstFrView r, ConstFrView o, + ConstFrView z, ConstFrView s1, ConstFrView s2, + ConstFrView s3, ConstFrView l1_den_inv, + ConstFrView twiddles, const uint64_t *params, + size_t n, cudaStream_t stream); +void launch_z_compute_factors(gnark_gpu_plonk2_curve_id_t curve, + FrView l_inout, FrView r_inout, + ConstFrView o, const int64_t *perm, + ConstFrView twiddles, const uint64_t *params, + size_t n, unsigned log2n, + cudaStream_t stream); +void launch_z_prefix_phase1(gnark_gpu_plonk2_curve_id_t curve, FrView z, + ConstFrView ratio, uint64_t *chunk_products, + size_t n, cudaStream_t stream); +void launch_z_prefix_phase3(gnark_gpu_plonk2_curve_id_t curve, FrView z, + FrView temp, const uint64_t *scanned_prefixes, + size_t num_chunks, size_t n, + cudaStream_t stream); +void launch_poly_eval_chunks(gnark_gpu_plonk2_curve_id_t curve, + ConstFrView coeffs, const uint64_t *z, + uint64_t *partials, size_t n, + cudaStream_t stream); +void launch_ntt_forward(gnark_gpu_plonk2_curve_id_t curve, FrView data, + ConstFrView twiddles, size_t n, cudaStream_t stream); +void launch_ntt_inverse(gnark_gpu_plonk2_curve_id_t curve, FrView data, + ConstFrView twiddles, const uint64_t *inv_n, size_t n, + cudaStream_t stream); +void launch_scale_by_powers(gnark_gpu_plonk2_curve_id_t curve, FrView data, + const uint64_t *generator, uint64_t *local_powers, + size_t n, cudaStream_t stream); +void launch_bit_reverse(gnark_gpu_plonk2_curve_id_t curve, FrView data, + size_t n, cudaStream_t stream); +cudaError_t g1_affine_add_run(gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *p, const uint64_t *q, + uint64_t *out, cudaStream_t stream); +cudaError_t g1_affine_double_run(gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *p, uint64_t *out, + cudaStream_t stream); +cudaError_t msm_naive_run(gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *points, const uint64_t *scalars, + size_t count, uint64_t *out, cudaStream_t stream); +cudaError_t msm_pippenger_run(gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *points, const uint64_t *scalars, + size_t count, int window_bits, uint64_t *out, + cudaStream_t stream); +cudaError_t msm_pippenger_device_points_run(gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *d_points, + const uint64_t *scalars, + size_t count, int window_bits, + uint64_t *out, + cudaStream_t stream); +cudaError_t msm_pippenger_sort_temp_bytes(gnark_gpu_plonk2_curve_id_t curve, + size_t count, int window_bits, + size_t *temp_bytes); +cudaError_t msm_pippenger_device_points_prealloc_run( + gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *d_points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + uint64_t *d_scalars, + uint64_t *d_out, + uint32_t *d_keys_in, + uint32_t *d_keys_out, + uint32_t *d_vals_in, + uint32_t *d_vals_out, + uint32_t *d_bucket_offsets, + uint32_t *d_bucket_ends, + uint32_t *d_overflow_buckets, + uint32_t *d_overflow_count, + void *d_buckets, + void *d_window_results, + void *d_partial_totals, + void *d_partial_sums, + void *d_sort_temp, + size_t sort_temp_bytes, + cudaEvent_t *phase_events, + float *phase_timings_ms, + cudaStream_t stream); + +} // namespace gnark_gpu::plonk2 + +// ============================================================================= +// Internal structures +// ============================================================================= + +// Scratch buffers for Z prefix product (owned by context, not thread_local). +struct ZPrefixScratch { + uint64_t *cp[4] = {}; // chunk products (device) + uint64_t *sp[4] = {}; // scanned prefixes (device) + size_t capacity = 0; +}; + +// Scratch buffers for poly eval chunks (owned by context). +struct PolyEvalScratch { + uint64_t *out[4] = {}; // partial results (device) + size_t capacity = 0; +}; + +struct GnarkGPUContext { + int device_id; + cudaStream_t stream; // default stream (stream 0), alias for streams[0] + // Multi-stream support + cudaStream_t streams[GNARK_GPU_MAX_STREAMS]; + bool stream_created[GNARK_GPU_MAX_STREAMS]; + cudaEvent_t events[GNARK_GPU_MAX_EVENTS]; + bool event_created[GNARK_GPU_MAX_EVENTS]; + // Shared staging buffer for AoS↔SoA transfers (one per context, reused) + uint64_t *staging_buffer; + size_t staging_count; // capacity in Fr elements (buffer is 4*staging_count uint64s) + // Curve-generic staging buffer used by plonk2. Capacity is raw uint64 words. + uint64_t *plonk2_staging_buffer; + size_t plonk2_staging_words; + // Scratch buffers for Z prefix product and poly eval (context-owned) + ZPrefixScratch z_prefix_scratch; + PolyEvalScratch poly_eval_scratch; +}; + +struct GnarkGPUFrVector { + GnarkGPUContext *ctx; + size_t count; + // SoA storage: 4 separate arrays for the 4 limbs + uint64_t *limbs[4]; +}; + +// ============================================================================= +// Helper to convert CUDA errors +// ============================================================================= + +static gnark_gpu_error_t check_cuda(cudaError_t err) { + if (err == cudaSuccess) { + return GNARK_GPU_SUCCESS; + } + if (err == cudaErrorMemoryAllocation) { + return GNARK_GPU_ERROR_OUT_OF_MEMORY; + } + return GNARK_GPU_ERROR_CUDA; +} + +// Get the CUDA stream for a given stream_id. Returns nullptr on invalid ID. +static cudaStream_t get_stream(GnarkGPUContext *ctx, int stream_id) { + if (stream_id < 0 || stream_id >= GNARK_GPU_MAX_STREAMS) return nullptr; + if (!ctx->stream_created[stream_id]) return nullptr; + return ctx->streams[stream_id]; +} + +// ============================================================================= +// Context lifecycle +// ============================================================================= + +// Bind the calling OS thread to a CUDA device. +// See header doc — required before any allocation / kernel launch on +// multi-GPU hosts so that the call lands on the right device. Idempotent. +extern "C" gnark_gpu_error_t gnark_gpu_set_device(int device_id) { + return check_cuda(cudaSetDevice(device_id)); +} + +extern "C" gnark_gpu_error_t gnark_gpu_init(int device_id, gnark_gpu_context_t *ctx) { + if (!ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + cudaError_t err = cudaSetDevice(device_id); + if (err != cudaSuccess) { + return check_cuda(err); + } + + GnarkGPUContext *c = new GnarkGPUContext{}; + c->device_id = device_id; + c->staging_buffer = nullptr; + c->staging_count = 0; + c->plonk2_staging_buffer = nullptr; + c->plonk2_staging_words = 0; + + // Initialize stream/event arrays + for (int i = 0; i < GNARK_GPU_MAX_STREAMS; i++) { + c->streams[i] = nullptr; + c->stream_created[i] = false; + } + for (int i = 0; i < GNARK_GPU_MAX_EVENTS; i++) { + c->events[i] = nullptr; + c->event_created[i] = false; + } + + // Create default stream (stream 0) + err = cudaStreamCreate(&c->streams[0]); + if (err != cudaSuccess) { + delete c; + return check_cuda(err); + } + c->stream_created[0] = true; + c->stream = c->streams[0]; // alias + + *ctx = c; + return GNARK_GPU_SUCCESS; +} + +extern "C" void gnark_gpu_destroy(gnark_gpu_context_t ctx) { + if (ctx) { + if (ctx->staging_buffer) { + cudaFree(ctx->staging_buffer); + } + if (ctx->plonk2_staging_buffer) { + cudaFree(ctx->plonk2_staging_buffer); + } + // Free Z prefix scratch + for (int i = 0; i < 4; i++) { + if (ctx->z_prefix_scratch.cp[i]) cudaFree(ctx->z_prefix_scratch.cp[i]); + if (ctx->z_prefix_scratch.sp[i]) cudaFree(ctx->z_prefix_scratch.sp[i]); + } + // Free poly eval scratch + for (int i = 0; i < 4; i++) { + if (ctx->poly_eval_scratch.out[i]) cudaFree(ctx->poly_eval_scratch.out[i]); + } + for (int i = 0; i < GNARK_GPU_MAX_EVENTS; i++) { + if (ctx->event_created[i]) { + cudaEventDestroy(ctx->events[i]); + } + } + for (int i = 0; i < GNARK_GPU_MAX_STREAMS; i++) { + if (ctx->stream_created[i]) { + cudaStreamDestroy(ctx->streams[i]); + } + } + delete ctx; + } +} + +// ============================================================================= +// Fr vector operations +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_alloc(gnark_gpu_context_t ctx, size_t count, + gnark_gpu_fr_vector_t *vec) { + if (!ctx || !vec || count == 0) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + GnarkGPUFrVector *v = new GnarkGPUFrVector; + v->ctx = ctx; + v->count = count; + + // Allocate SoA limb arrays + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMalloc(&v->limbs[i], count * sizeof(uint64_t)); + if (err != cudaSuccess) { + // Cleanup on failure + for (int j = 0; j < i; j++) { + cudaFree(v->limbs[j]); + } + delete v; + // Clear sticky error so subsequent CUDA calls aren't poisoned. + cudaGetLastError(); + return check_cuda(err); + } + } + + *vec = v; + return GNARK_GPU_SUCCESS; +} + +extern "C" void gnark_gpu_fr_vector_free(gnark_gpu_fr_vector_t vec) { + if (vec) { + for (int i = 0; i < 4; i++) { + if (vec->limbs[i]) { + cudaFree(vec->limbs[i]); + } + } + delete vec; + } +} + +extern "C" size_t gnark_gpu_fr_vector_len(gnark_gpu_fr_vector_t vec) { + if (!vec) { + return 0; + } + return vec->count; +} + +// ============================================================================= +// Shared staging buffer management +// ============================================================================= + +// Ensure the context's staging buffer can hold at least min_count Fr elements. +static gnark_gpu_error_t ensure_staging(GnarkGPUContext *ctx, size_t min_count) { + if (ctx->staging_count >= min_count) { + return GNARK_GPU_SUCCESS; + } + // Free old buffer if any + if (ctx->staging_buffer) { + // Must sync before freeing — prior operations may still be using it + cudaError_t err = cudaStreamSynchronize(ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + cudaFree(ctx->staging_buffer); + ctx->staging_buffer = nullptr; + ctx->staging_count = 0; + } + cudaError_t err = cudaMalloc(&ctx->staging_buffer, min_count * 4 * sizeof(uint64_t)); + if (err != cudaSuccess) { + return check_cuda(err); + } + ctx->staging_count = min_count; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_staging_ensure(gnark_gpu_context_t ctx, size_t min_count) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + return ensure_staging(ctx, min_count); +} + +// ============================================================================= +// Data transfer with AoS↔SoA transpose (using shared staging buffer) +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_device(gnark_gpu_fr_vector_t vec, + const uint64_t *host_data, + size_t count) { + if (!vec || !host_data) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (count != vec->count) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + + GnarkGPUContext *ctx = vec->ctx; + gnark_gpu_error_t gerr = ensure_staging(ctx, count); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + cudaStream_t stream = ctx->stream; + + // Copy AoS data from host to shared staging buffer + cudaError_t err = cudaMemcpyAsync(ctx->staging_buffer, host_data, + count * 4 * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { + return check_cuda(err); + } + + // Transpose from AoS to SoA on GPU + gnark_gpu::launch_transpose_aos_to_soa_fr(vec->limbs[0], vec->limbs[1], vec->limbs[2], + vec->limbs[3], ctx->staging_buffer, count, stream); + + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_host(gnark_gpu_fr_vector_t vec, + uint64_t *host_data, + size_t count) { + if (!vec || !host_data) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (count != vec->count) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + + GnarkGPUContext *ctx = vec->ctx; + gnark_gpu_error_t gerr = ensure_staging(ctx, count); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + cudaStream_t stream = ctx->stream; + + // Transpose from SoA to AoS on GPU into shared staging buffer + gnark_gpu::launch_transpose_soa_to_aos_fr(ctx->staging_buffer, vec->limbs[0], vec->limbs[1], + vec->limbs[2], vec->limbs[3], count, stream); + + // Copy AoS data from staging buffer to host + cudaError_t err = cudaMemcpyAsync(host_data, ctx->staging_buffer, + count * 4 * sizeof(uint64_t), + cudaMemcpyDeviceToHost, stream); + if (err != cudaSuccess) { + return check_cuda(err); + } + + // Must sync to ensure data is available on host + err = cudaStreamSynchronize(stream); + return check_cuda(err); +} + +// ============================================================================= +// Arithmetic operations +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_mul(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b) { + if (!ctx || !result || !a || !b) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (result->count != a->count || a->count != b->count) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + + gnark_gpu::launch_mul_mont_fr(result->limbs[0], result->limbs[1], result->limbs[2], + result->limbs[3], a->limbs[0], a->limbs[1], a->limbs[2], + a->limbs[3], b->limbs[0], b->limbs[1], b->limbs[2], + b->limbs[3], a->count, ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_add(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b) { + if (!ctx || !result || !a || !b) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (result->count != a->count || a->count != b->count) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + + gnark_gpu::launch_add_fr(result->limbs[0], result->limbs[1], result->limbs[2], + result->limbs[3], a->limbs[0], a->limbs[1], a->limbs[2], + a->limbs[3], b->limbs[0], b->limbs[1], b->limbs[2], + b->limbs[3], a->count, ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_sub(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b) { + if (!ctx || !result || !a || !b) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (result->count != a->count || a->count != b->count) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + + gnark_gpu::launch_sub_fr(result->limbs[0], result->limbs[1], result->limbs[2], + result->limbs[3], a->limbs[0], a->limbs[1], a->limbs[2], + a->limbs[3], b->limbs[0], b->limbs[1], b->limbs[2], + b->limbs[3], a->count, ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// MSM operations (Twisted Edwards) +// ============================================================================= + +struct GnarkGPUMSM { + GnarkGPUContext *ctx; + gnark_gpu::MSMContext *msm_ctx; + size_t max_points; + size_t loaded_points; +}; + +extern "C" gnark_gpu_error_t gnark_gpu_msm_create(gnark_gpu_context_t ctx, size_t max_points, + gnark_gpu_msm_t *msm) { + if (!ctx || !msm || max_points == 0) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + cudaError_t err = cudaSetDevice(ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + gnark_gpu::MSMContext *msm_ctx = gnark_gpu::msm_create(max_points); + if (!msm_ctx) return GNARK_GPU_ERROR_OUT_OF_MEMORY; + + GnarkGPUMSM *m = new GnarkGPUMSM; + m->ctx = ctx; + m->msm_ctx = msm_ctx; + m->max_points = max_points; + m->loaded_points = 0; + + *msm = m; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_load_points(gnark_gpu_msm_t msm, + const uint64_t *points_data, + size_t count) { + if (!msm || !points_data || count == 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (count > msm->max_points) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::msm_load_points(msm->msm_ctx, points_data, count, msm->ctx->stream); + + cudaError_t err = cudaStreamSynchronize(msm->ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + + msm->loaded_points = count; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_run(gnark_gpu_msm_t msm, uint64_t *result, + const uint64_t *scalars, size_t count) { + if (!msm || !result || !scalars || count == 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (count > msm->loaded_points) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + // msm_run_full handles: lazy alloc sort buffers → upload → compute → + // sync → unregister host → free sort buffers. + cudaError_t err = gnark_gpu::msm_run_full( + msm->msm_ctx, scalars, count, + reinterpret_cast(result), msm->ctx->stream); + + return check_cuda(err); +} + +extern "C" void gnark_gpu_msm_destroy(gnark_gpu_msm_t msm) { + if (msm) { + gnark_gpu::msm_destroy(msm->msm_ctx); + delete msm; + } +} + +extern "C" void gnark_gpu_msm_get_config(gnark_gpu_msm_t msm, int *c, int *num_windows) { + if (msm && msm->msm_ctx) { + if (c) *c = gnark_gpu::msm_get_c(msm->msm_ctx); + if (num_windows) *num_windows = gnark_gpu::msm_get_num_windows(msm->msm_ctx); + } +} + +extern "C" int gnark_gpu_msm_get_phase_timings(gnark_gpu_msm_t msm, float *out) { + if (!msm || !msm->msm_ctx || !out) return 0; + return gnark_gpu::msm_get_phase_timings(msm->msm_ctx, out); +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_pin_work_buffers(gnark_gpu_msm_t msm) { + if (!msm || !msm->msm_ctx) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::msm_pin_buffers(msm->msm_ctx); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_release_work_buffers(gnark_gpu_msm_t msm) { + if (!msm || !msm->msm_ctx) return GNARK_GPU_ERROR_INVALID_ARG; + cudaError_t err = cudaSetDevice(msm->ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + gnark_gpu::msm_release_buffers(msm->msm_ctx); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_load_points_sw(gnark_gpu_msm_t msm, + const uint64_t *points_data, + size_t count) { + if (!msm || !points_data || count == 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (count > msm->max_points) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaError_t err = cudaSetDevice(msm->ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + cudaError_t cuda_err = gnark_gpu::msm_load_points_sw( + msm->msm_ctx, points_data, count, msm->ctx->stream); + if (cuda_err != cudaSuccess) return check_cuda(cuda_err); + + err = cudaStreamSynchronize(msm->ctx->stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_test_sw_pair_add( + const uint64_t *p0, const uint64_t *p1, uint64_t *out) { + if (!p0 || !p1 || !out) return GNARK_GPU_ERROR_INVALID_ARG; + return check_cuda(gnark_gpu::test_sw_pair_add_run(p0, p1, out)); +} + +extern "C" gnark_gpu_error_t gnark_gpu_test_sw_to_te( + const uint64_t *p_sw, uint64_t *out_te) { + if (!p_sw || !out_te) return GNARK_GPU_ERROR_INVALID_ARG; + return check_cuda(gnark_gpu::test_sw_to_te_run(p_sw, out_te)); +} + +extern "C" gnark_gpu_error_t gnark_gpu_test_batched_affine_reduce( + const uint64_t *points_aos, uint64_t *out_aos, int N) { + if (!points_aos || !out_aos || N <= 0) return GNARK_GPU_ERROR_INVALID_ARG; + return check_cuda(gnark_gpu::test_batched_affine_reduce_run(points_aos, out_aos, N)); +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_offload_points(gnark_gpu_msm_t msm) { + if (!msm) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaError_t err = cudaSetDevice(msm->ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + gnark_gpu::msm_offload_points(msm->msm_ctx); + msm->loaded_points = 0; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_msm_reload_points(gnark_gpu_msm_t msm, + const uint64_t *points_data, + size_t count) { + if (!msm || !points_data || count == 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (count > msm->max_points) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaError_t err = cudaSetDevice(msm->ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + cudaError_t cuda_err = gnark_gpu::msm_reload_points( + msm->msm_ctx, points_data, count, msm->ctx->stream); + if (cuda_err != cudaSuccess) return check_cuda(cuda_err); + + err = cudaStreamSynchronize(msm->ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + + msm->loaded_points = count; + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// NTT operations +// ============================================================================= + +struct GnarkGPUNTTDomain { + GnarkGPUContext *ctx; + gnark_gpu::NTTDomain *ntt_dom; + size_t size; +}; + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_domain_create(gnark_gpu_context_t ctx, size_t size, + const uint64_t *fwd_twiddles_aos, + const uint64_t *inv_twiddles_aos, + const uint64_t *inv_n, + gnark_gpu_ntt_domain_t *domain) { + if (!ctx || !fwd_twiddles_aos || !inv_twiddles_aos || !inv_n || !domain || size == 0) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + // Verify power of 2 + if ((size & (size - 1)) != 0) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + cudaError_t err = cudaSetDevice(ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + gnark_gpu::NTTDomain *ntt_dom = gnark_gpu::ntt_domain_create( + size, fwd_twiddles_aos, inv_twiddles_aos, inv_n, ctx->stream); + if (!ntt_dom) return GNARK_GPU_ERROR_OUT_OF_MEMORY; + + GnarkGPUNTTDomain *d = new GnarkGPUNTTDomain; + d->ctx = ctx; + d->ntt_dom = ntt_dom; + d->size = size; + + *domain = d; + return GNARK_GPU_SUCCESS; +} + +extern "C" void gnark_gpu_ntt_domain_destroy(gnark_gpu_ntt_domain_t domain) { + if (domain) { + gnark_gpu::ntt_domain_destroy(domain->ntt_dom); + delete domain; + } +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_forward(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_ntt_forward(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], + domain->ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +// Fused CosetFFT: ScaleByPowers + forward NTT + BitReverse +static gnark_gpu_error_t ntt_forward_coset_impl(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + const uint64_t g[4], + const uint64_t g_half[4], + cudaStream_t stream) { + if (!domain || !data || !g || !g_half) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_ntt_forward_coset(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], + g, g_half, stream); + gnark_gpu::launch_ntt_bit_reverse(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], + stream); + + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_forward_coset(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + const uint64_t g[4], + const uint64_t g_half[4]) { + if (!domain) return GNARK_GPU_ERROR_INVALID_ARG; + return ntt_forward_coset_impl(domain, data, g, g_half, domain->ctx->stream); +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_forward_coset_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + const uint64_t g[4], + const uint64_t g_half[4], + int stream_id) { + if (!domain) return GNARK_GPU_ERROR_INVALID_ARG; + cudaStream_t stream = get_stream(domain->ctx, stream_id); + return ntt_forward_coset_impl(domain, data, g, g_half, stream); +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_inverse(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_ntt_inverse(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], + domain->ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_bit_reverse(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_ntt_bit_reverse(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], + domain->ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// PlonK2 curve-generic Fr and NTT operations +// ============================================================================= + +struct GnarkGPUPlonk2FrVector { + GnarkGPUContext *ctx; + gnark_gpu_plonk2_curve_id_t curve; + int limbs; + size_t count; + uint64_t *limbs_dev[gnark_gpu::plonk2::MAX_FR_LIMBS]; +}; + +struct GnarkGPUPlonk2NTTDomain { + GnarkGPUContext *ctx; + gnark_gpu_plonk2_curve_id_t curve; + int limbs; + size_t size; + uint64_t *fwd[gnark_gpu::plonk2::MAX_FR_LIMBS]; + uint64_t *inv[gnark_gpu::plonk2::MAX_FR_LIMBS]; + uint64_t *inv_n; +}; + +struct GnarkGPUPlonk2MSM { + GnarkGPUContext *ctx; + gnark_gpu_plonk2_curve_id_t curve; + int base_limbs; + int scalar_limbs; + size_t point_count; + int window_bits; + int num_windows; + int num_buckets; + int total_buckets; + int reduce_bpw; + size_t assignments_capacity; + size_t sort_temp_bytes; + uint64_t *d_points; + uint64_t *d_scalars; + uint64_t *d_out; + uint32_t *d_keys_in; + uint32_t *d_keys_out; + uint32_t *d_vals_in; + uint32_t *d_vals_out; + uint32_t *d_bucket_offsets; + uint32_t *d_bucket_ends; + uint32_t *d_overflow_buckets; + uint32_t *d_overflow_count; + void *d_buckets; + void *d_window_results; + void *d_partial_totals; + void *d_partial_sums; + void *d_sort_temp; + cudaEvent_t phase_event[10]; + float phase_timings_ms[9]; + bool phase_events_valid; +}; + +static int plonk2_limbs(gnark_gpu_plonk2_curve_id_t curve) { + return gnark_gpu::plonk2::curve_limbs(curve); +} + +static int plonk2_base_limbs(gnark_gpu_plonk2_curve_id_t curve) { + return gnark_gpu::plonk2::curve_base_limbs(curve); +} + +static bool plonk2_valid_curve(gnark_gpu_plonk2_curve_id_t curve) { + return plonk2_limbs(curve) > 0; +} + +static bool plonk2_mul_overflows(size_t a, size_t b) { + return a != 0 && b > SIZE_MAX / a; +} + +static int plonk2_scalar_bits(gnark_gpu_plonk2_curve_id_t curve) { + switch (curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return 254; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return 253; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return 377; + default: + return 0; + } +} + +static int plonk2_signed_window_count(int scalar_bits, int window_bits) { + return (scalar_bits + 1 + window_bits - 1) / window_bits; +} + +static int plonk2_reduce_blocks_per_window(int num_windows, int num_buckets) { + constexpr int reduce_threads_per_window = 128; + constexpr int finalize_threads = 32; + int max_bpw = num_buckets / reduce_threads_per_window; + int target_bpw = 752 / num_windows; + int bpw = max_bpw < target_bpw ? max_bpw : target_bpw; + if (bpw < 1) bpw = 1; + if (bpw > finalize_threads) bpw = finalize_threads; + return bpw; +} + +static bool plonk2_msm_has_work_buffers(const GnarkGPUPlonk2MSM *msm) { + if (!(msm && msm->d_scalars && msm->d_out && msm->d_keys_in && + msm->d_keys_out && msm->d_vals_in && msm->d_vals_out && + msm->d_bucket_offsets && msm->d_bucket_ends && + msm->d_overflow_buckets && msm->d_overflow_count && + msm->d_buckets && msm->d_window_results && + msm->d_partial_totals && msm->d_partial_sums && + msm->d_sort_temp)) { + return false; + } + return true; +} + +static void plonk2_msm_free_work_buffers(GnarkGPUPlonk2MSM *msm) { + if (!msm) return; + if (msm->d_scalars) cudaFree(msm->d_scalars); + if (msm->d_out) cudaFree(msm->d_out); + if (msm->d_keys_in) cudaFree(msm->d_keys_in); + if (msm->d_keys_out) cudaFree(msm->d_keys_out); + if (msm->d_vals_in) cudaFree(msm->d_vals_in); + if (msm->d_vals_out) cudaFree(msm->d_vals_out); + if (msm->d_bucket_offsets) cudaFree(msm->d_bucket_offsets); + if (msm->d_bucket_ends) cudaFree(msm->d_bucket_ends); + if (msm->d_overflow_buckets) cudaFree(msm->d_overflow_buckets); + if (msm->d_overflow_count) cudaFree(msm->d_overflow_count); + if (msm->d_buckets) cudaFree(msm->d_buckets); + if (msm->d_window_results) cudaFree(msm->d_window_results); + if (msm->d_partial_totals) cudaFree(msm->d_partial_totals); + if (msm->d_partial_sums) cudaFree(msm->d_partial_sums); + if (msm->d_sort_temp) cudaFree(msm->d_sort_temp); + msm->d_scalars = nullptr; + msm->d_out = nullptr; + msm->d_keys_in = nullptr; + msm->d_keys_out = nullptr; + msm->d_vals_in = nullptr; + msm->d_vals_out = nullptr; + msm->d_bucket_offsets = nullptr; + msm->d_bucket_ends = nullptr; + msm->d_overflow_buckets = nullptr; + msm->d_overflow_count = nullptr; + msm->d_buckets = nullptr; + msm->d_window_results = nullptr; + msm->d_partial_totals = nullptr; + msm->d_partial_sums = nullptr; + msm->d_sort_temp = nullptr; +} + +static void plonk2_msm_free_all(GnarkGPUPlonk2MSM *msm) { + if (!msm) return; + plonk2_msm_free_work_buffers(msm); + if (msm->d_points) cudaFree(msm->d_points); + msm->d_points = nullptr; +} + +static void plonk2_msm_init_phase_events(GnarkGPUPlonk2MSM *msm) { + if (!msm) return; + msm->phase_events_valid = true; + for (int i = 0; i < 10; i++) { + msm->phase_event[i] = nullptr; + if (cudaEventCreate(&msm->phase_event[i]) != cudaSuccess) { + msm->phase_events_valid = false; + cudaGetLastError(); + break; + } + } + if (!msm->phase_events_valid) { + for (int i = 0; i < 10; i++) { + if (msm->phase_event[i]) { + cudaEventDestroy(msm->phase_event[i]); + msm->phase_event[i] = nullptr; + } + } + } + for (int i = 0; i < 9; i++) msm->phase_timings_ms[i] = 0.0f; +} + +static void plonk2_msm_destroy_phase_events(GnarkGPUPlonk2MSM *msm) { + if (!msm) return; + for (int i = 0; i < 10; i++) { + if (msm->phase_event[i]) { + cudaEventDestroy(msm->phase_event[i]); + msm->phase_event[i] = nullptr; + } + } + msm->phase_events_valid = false; +} + +static cudaError_t plonk2_msm_alloc_work_buffers(GnarkGPUPlonk2MSM *m) { + if (!m) return cudaErrorInvalidValue; + if (plonk2_msm_has_work_buffers(m)) return cudaSuccess; + plonk2_msm_free_work_buffers(m); + + size_t scalar_words = m->point_count * (size_t)m->scalar_limbs; + size_t output_words = (size_t)3 * (size_t)m->base_limbs; + size_t bucket_words = + (size_t)m->total_buckets * (size_t)3 * (size_t)m->base_limbs; + size_t window_words = + (size_t)m->num_windows * (size_t)3 * (size_t)m->base_limbs; + size_t partial_words = + (size_t)m->num_windows * (size_t)m->reduce_bpw * + (size_t)3 * (size_t)m->base_limbs; + + cudaError_t err = cudaMalloc(&m->d_scalars, scalar_words * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_out, output_words * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_keys_in, m->assignments_capacity * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_keys_out, m->assignments_capacity * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_vals_in, m->assignments_capacity * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_vals_out, m->assignments_capacity * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_bucket_offsets, + (size_t)m->total_buckets * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_bucket_ends, + (size_t)m->total_buckets * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_overflow_buckets, + (size_t)m->total_buckets * sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_overflow_count, sizeof(uint32_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_buckets, bucket_words * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_window_results, window_words * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_partial_totals, partial_words * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_partial_sums, partial_words * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&m->d_sort_temp, m->sort_temp_bytes); + if (err != cudaSuccess) goto fail; + return cudaSuccess; + +fail: + plonk2_msm_free_work_buffers(m); + return err; +} + +static bool plonk2_power_of_two(size_t n) { + return n != 0 && (n & (n - 1)) == 0; +} + +static gnark_gpu::plonk2::FrView plonk2_view(uint64_t *const limbs[]) { + gnark_gpu::plonk2::FrView out{}; + for (int i = 0; i < gnark_gpu::plonk2::MAX_FR_LIMBS; i++) out.limbs[i] = limbs[i]; + return out; +} + +static gnark_gpu::plonk2::ConstFrView plonk2_const_view(uint64_t *const limbs[]) { + gnark_gpu::plonk2::ConstFrView out{}; + for (int i = 0; i < gnark_gpu::plonk2::MAX_FR_LIMBS; i++) out.limbs[i] = limbs[i]; + return out; +} + +static gnark_gpu::plonk2::FrView plonk2_view(gnark_gpu_plonk2_fr_vector_t v) { + return plonk2_view(v->limbs_dev); +} + +static gnark_gpu::plonk2::ConstFrView plonk2_const_view(gnark_gpu_plonk2_fr_vector_t v) { + return plonk2_const_view(v->limbs_dev); +} + +static bool plonk2_same_shape(gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + return result->ctx == a->ctx && a->ctx == b->ctx && + result->curve == a->curve && a->curve == b->curve && + result->count == a->count && a->count == b->count; +} + +static bool plonk2_same_vector_shape(gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + return a->ctx == b->ctx && a->curve == b->curve && a->count == b->count; +} + +static gnark_gpu_error_t ensure_plonk2_staging_words(GnarkGPUContext *ctx, size_t words) { + if (ctx->plonk2_staging_words >= words) return GNARK_GPU_SUCCESS; + + if (ctx->plonk2_staging_buffer) { + cudaError_t err = cudaStreamSynchronize(ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + cudaFree(ctx->plonk2_staging_buffer); + ctx->plonk2_staging_buffer = nullptr; + ctx->plonk2_staging_words = 0; + } + + cudaError_t err = cudaMalloc(&ctx->plonk2_staging_buffer, words * sizeof(uint64_t)); + if (err != cudaSuccess) return check_cuda(err); + ctx->plonk2_staging_words = words; + return GNARK_GPU_SUCCESS; +} + +static gnark_gpu_error_t plonk2_upload_aos(GnarkGPUContext *ctx, + gnark_gpu_plonk2_curve_id_t curve, + uint64_t *dst_limbs[], + const uint64_t *host_data, + size_t count) { + int limbs = plonk2_limbs(curve); + if (limbs == 0 || plonk2_mul_overflows(count, (size_t)limbs)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + size_t words = count * (size_t)limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + cudaError_t err = cudaMemcpyAsync(ctx->plonk2_staging_buffer, host_data, + words * sizeof(uint64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + + gnark_gpu::plonk2::launch_copy_aos_to_soa(curve, plonk2_view(dst_limbs), + ctx->plonk2_staging_buffer, + count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_alloc( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + size_t count, + gnark_gpu_plonk2_fr_vector_t *vec) { + if (!ctx || !vec || count == 0 || !plonk2_valid_curve(curve_id)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + GnarkGPUPlonk2FrVector *v = new GnarkGPUPlonk2FrVector{}; + v->ctx = ctx; + v->curve = curve_id; + v->limbs = plonk2_limbs(curve_id); + v->count = count; + + for (int i = 0; i < v->limbs; i++) { + cudaError_t err = cudaMalloc(&v->limbs_dev[i], count * sizeof(uint64_t)); + if (err != cudaSuccess) { + for (int j = 0; j < i; j++) cudaFree(v->limbs_dev[j]); + delete v; + cudaGetLastError(); + return check_cuda(err); + } + } + + *vec = v; + return GNARK_GPU_SUCCESS; +} + +extern "C" void gnark_gpu_plonk2_fr_vector_free(gnark_gpu_plonk2_fr_vector_t vec) { + if (!vec) return; + for (int i = 0; i < vec->limbs; i++) { + if (vec->limbs_dev[i]) cudaFree(vec->limbs_dev[i]); + } + delete vec; +} + +extern "C" size_t gnark_gpu_plonk2_fr_vector_len(gnark_gpu_plonk2_fr_vector_t vec) { + return vec ? vec->count : 0; +} + +extern "C" int gnark_gpu_plonk2_fr_vector_limbs(gnark_gpu_plonk2_fr_vector_t vec) { + return vec ? vec->limbs : 0; +} + +extern "C" gnark_gpu_plonk2_curve_id_t gnark_gpu_plonk2_fr_vector_curve( + gnark_gpu_plonk2_fr_vector_t vec) { + return vec ? vec->curve : (gnark_gpu_plonk2_curve_id_t)0; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_to_device( + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *host_data, + size_t count) { + if (!vec || !host_data) return GNARK_GPU_ERROR_INVALID_ARG; + if (count != vec->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + return plonk2_upload_aos(vec->ctx, vec->curve, vec->limbs_dev, host_data, count); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_to_host( + gnark_gpu_plonk2_fr_vector_t vec, + uint64_t *host_data, + size_t count) { + if (!vec || !host_data) return GNARK_GPU_ERROR_INVALID_ARG; + if (count != vec->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + if (plonk2_mul_overflows(count, (size_t)vec->limbs)) return GNARK_GPU_ERROR_INVALID_ARG; + + size_t words = count * (size_t)vec->limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(vec->ctx, words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + gnark_gpu::plonk2::launch_copy_soa_to_aos(vec->curve, vec->ctx->plonk2_staging_buffer, + plonk2_const_view(vec), count, vec->ctx->stream); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return check_cuda(err); + + err = cudaMemcpyAsync(host_data, vec->ctx->plonk2_staging_buffer, + words * sizeof(uint64_t), + cudaMemcpyDeviceToHost, vec->ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + return check_cuda(cudaStreamSynchronize(vec->ctx->stream)); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_d2d( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t dst, + gnark_gpu_plonk2_fr_vector_t src) { + if (!ctx || !dst || !src || dst->ctx != ctx || !plonk2_same_vector_shape(dst, src)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + for (int i = 0; i < dst->limbs; i++) { + cudaError_t err = cudaMemcpyAsync(dst->limbs_dev[i], src->limbs_dev[i], + dst->count * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_set_zero( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec) { + if (!ctx || !vec || vec->ctx != ctx) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::plonk2::launch_set_zero(vec->curve, plonk2_view(vec), vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_add( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + if (!ctx || !result || !a || !b || result->ctx != ctx || !plonk2_same_shape(result, a, b)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_add(result->curve, plonk2_view(result), + plonk2_const_view(a), plonk2_const_view(b), + result->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_sub( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + if (!ctx || !result || !a || !b || result->ctx != ctx || !plonk2_same_shape(result, a, b)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_sub(result->curve, plonk2_view(result), + plonk2_const_view(a), plonk2_const_view(b), + result->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_mul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + if (!ctx || !result || !a || !b || result->ctx != ctx || !plonk2_same_shape(result, a, b)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_mul(result->curve, plonk2_view(result), + plonk2_const_view(a), plonk2_const_view(b), + result->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_addmul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + if (!ctx || !vec || !a || !b || vec->ctx != ctx || !plonk2_same_shape(vec, a, b)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_addmul(vec->curve, plonk2_view(vec), + plonk2_const_view(a), plonk2_const_view(b), + vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_scalar_mul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *scalar) { + if (!ctx || !vec || !scalar || vec->ctx != ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_scalar_mul(vec->curve, plonk2_view(vec), + scalar, vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_add_scalar_mul( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + gnark_gpu_plonk2_fr_vector_t a, + const uint64_t *scalar) { + if (!ctx || !vec || !a || !scalar || vec->ctx != ctx || + !plonk2_same_vector_shape(vec, a)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_add_scalar_mul(vec->curve, plonk2_view(vec), + plonk2_const_view(a), scalar, + vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_batch_invert( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + gnark_gpu_plonk2_fr_vector_t temp) { + if (!ctx || !vec || !temp || vec->ctx != ctx || !plonk2_same_vector_shape(vec, temp)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_batch_invert(vec->curve, plonk2_view(vec), + vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_butterfly4_inverse( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t b0, + gnark_gpu_plonk2_fr_vector_t b1, + gnark_gpu_plonk2_fr_vector_t b2, + gnark_gpu_plonk2_fr_vector_t b3, + const uint64_t *omega4_inv, + const uint64_t *quarter) { + if (!ctx || !b0 || !b1 || !b2 || !b3 || !omega4_inv || !quarter || b0->ctx != ctx || + !plonk2_same_vector_shape(b0, b1) || !plonk2_same_vector_shape(b0, b2) || + !plonk2_same_vector_shape(b0, b3)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + gnark_gpu::plonk2::launch_butterfly4_inverse( + b0->curve, plonk2_view(b0), plonk2_view(b1), plonk2_view(b2), plonk2_view(b3), + omega4_inv, quarter, b0->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_reduce_blinded_coset( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t dst, + gnark_gpu_plonk2_fr_vector_t src, + const uint64_t *tail, + size_t tail_len, + const uint64_t *coset_pow_n) { + if (!ctx || !dst || !src || !tail || !coset_pow_n || dst->ctx != ctx || + !plonk2_same_vector_shape(dst, src)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (tail_len > dst->count) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + size_t tail_words = tail_len * (size_t)dst->limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, tail_words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + cudaError_t err = cudaMemcpyAsync(ctx->plonk2_staging_buffer, tail, + tail_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + gnark_gpu::plonk2::launch_reduce_blinded_coset( + dst->curve, plonk2_view(dst), plonk2_const_view(src), + ctx->plonk2_staging_buffer, tail_len, coset_pow_n, dst->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_compute_l1_den( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t out, + const uint64_t *coset_gen) { + if (!domain || !out || !coset_gen || domain->ctx != out->ctx || + domain->curve != out->curve) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (domain->size != out->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::plonk2::launch_compute_l1_den( + domain->curve, plonk2_view(out), plonk2_const_view(domain->fwd), + coset_gen, domain->size, domain->ctx->stream); + return check_cuda(cudaGetLastError()); +} + +static bool plonk2_same_context_curve_size(gnark_gpu_plonk2_fr_vector_t a, + gnark_gpu_plonk2_fr_vector_t b) { + return a && b && a->ctx == b->ctx && a->curve == b->curve && a->count == b->count; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_gate_accum( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t ql, + gnark_gpu_plonk2_fr_vector_t qr, + gnark_gpu_plonk2_fr_vector_t qm, + gnark_gpu_plonk2_fr_vector_t qo, + gnark_gpu_plonk2_fr_vector_t qk, + gnark_gpu_plonk2_fr_vector_t l, + gnark_gpu_plonk2_fr_vector_t r, + gnark_gpu_plonk2_fr_vector_t o, + const uint64_t *zh_k_inv) { + if (!ctx || !result || !ql || !qr || !qm || !qo || !qk || !l || !r || !o || + !zh_k_inv || result->ctx != ctx || + !plonk2_same_context_curve_size(result, ql) || + !plonk2_same_context_curve_size(result, qr) || + !plonk2_same_context_curve_size(result, qm) || + !plonk2_same_context_curve_size(result, qo) || + !plonk2_same_context_curve_size(result, qk) || + !plonk2_same_context_curve_size(result, l) || + !plonk2_same_context_curve_size(result, r) || + !plonk2_same_context_curve_size(result, o)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + gnark_gpu::plonk2::launch_gate_accum( + result->curve, plonk2_view(result), + plonk2_const_view(ql), plonk2_const_view(qr), plonk2_const_view(qm), + plonk2_const_view(qo), plonk2_const_view(qk), + plonk2_const_view(l), plonk2_const_view(r), plonk2_const_view(o), + zh_k_inv, result->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_linearize_static( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t z, + gnark_gpu_plonk2_fr_vector_t s3, + gnark_gpu_plonk2_fr_vector_t ql, + gnark_gpu_plonk2_fr_vector_t qr, + gnark_gpu_plonk2_fr_vector_t qm, + gnark_gpu_plonk2_fr_vector_t qo, + gnark_gpu_plonk2_fr_vector_t qk, + const uint64_t *scalars) { + if (!ctx || !result || !z || !s3 || !ql || !qr || !qm || !qo || !qk || !scalars) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (result->ctx != ctx || z->ctx != ctx || s3->ctx != ctx || + ql->ctx != ctx || qr->ctx != ctx || qm->ctx != ctx || + qo->ctx != ctx || qk->ctx != ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (!plonk2_same_vector_shape(result, z) || + !plonk2_same_vector_shape(result, s3) || + !plonk2_same_vector_shape(result, ql) || + !plonk2_same_vector_shape(result, qr) || + !plonk2_same_vector_shape(result, qm) || + !plonk2_same_vector_shape(result, qo) || + !plonk2_same_vector_shape(result, qk)) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + gnark_gpu::plonk2::launch_linearize_static( + result->curve, plonk2_view(result), plonk2_const_view(z), + plonk2_const_view(s3), plonk2_const_view(ql), plonk2_const_view(qr), + plonk2_const_view(qm), plonk2_const_view(qo), plonk2_const_view(qk), + scalars, result->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_subtract_head( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *tail, + size_t tail_len) { + if (!ctx || !vec || vec->ctx != ctx || (!tail && tail_len != 0)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (tail_len > vec->count) { + return GNARK_GPU_ERROR_SIZE_MISMATCH; + } + size_t tail_words = tail_len * (size_t)vec->limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, tail_words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + if (tail_words != 0) { + cudaError_t err = cudaMemcpyAsync(ctx->plonk2_staging_buffer, tail, + tail_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + } + gnark_gpu::plonk2::launch_subtract_head( + vec->curve, plonk2_view(vec), ctx->plonk2_staging_buffer, + tail_len, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_perm_boundary( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t result, + gnark_gpu_plonk2_fr_vector_t l, + gnark_gpu_plonk2_fr_vector_t r, + gnark_gpu_plonk2_fr_vector_t o, + gnark_gpu_plonk2_fr_vector_t z, + gnark_gpu_plonk2_fr_vector_t s1, + gnark_gpu_plonk2_fr_vector_t s2, + gnark_gpu_plonk2_fr_vector_t s3, + gnark_gpu_plonk2_fr_vector_t l1_den_inv, + const uint64_t *params, + gnark_gpu_plonk2_ntt_domain_t domain) { + if (!ctx || !result || !l || !r || !o || !z || !s1 || !s2 || !s3 || + !l1_den_inv || !params || !domain || result->ctx != ctx || + domain->ctx != ctx || domain->curve != result->curve || + domain->size != result->count || + !plonk2_same_context_curve_size(result, l) || + !plonk2_same_context_curve_size(result, r) || + !plonk2_same_context_curve_size(result, o) || + !plonk2_same_context_curve_size(result, z) || + !plonk2_same_context_curve_size(result, s1) || + !plonk2_same_context_curve_size(result, s2) || + !plonk2_same_context_curve_size(result, s3) || + !plonk2_same_context_curve_size(result, l1_den_inv)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + gnark_gpu::plonk2::launch_perm_boundary( + result->curve, plonk2_view(result), + plonk2_const_view(l), plonk2_const_view(r), plonk2_const_view(o), + plonk2_const_view(z), plonk2_const_view(s1), plonk2_const_view(s2), + plonk2_const_view(s3), plonk2_const_view(l1_den_inv), + plonk2_const_view(domain->fwd), params, result->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_z_compute_factors( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t l_inout, + gnark_gpu_plonk2_fr_vector_t r_inout, + gnark_gpu_plonk2_fr_vector_t o_in, + const void *d_perm, + const uint64_t *params, + unsigned log2n, + gnark_gpu_plonk2_ntt_domain_t domain) { + if (!ctx || !l_inout || !r_inout || !o_in || !d_perm || !params || !domain || + l_inout->ctx != ctx || domain->ctx != ctx || domain->curve != l_inout->curve || + domain->size != l_inout->count || + !plonk2_same_context_curve_size(l_inout, r_inout) || + !plonk2_same_context_curve_size(l_inout, o_in)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + gnark_gpu::plonk2::launch_z_compute_factors( + l_inout->curve, plonk2_view(l_inout), plonk2_view(r_inout), + plonk2_const_view(o_in), static_cast(d_perm), + plonk2_const_view(domain->fwd), params, l_inout->count, log2n, + ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_z_prefix_phase1( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t z_vec, + gnark_gpu_plonk2_fr_vector_t ratio_vec, + uint64_t *chunk_products_host, + size_t *num_chunks_out) { + if (!ctx || !z_vec || !ratio_vec || !chunk_products_host || !num_chunks_out || + z_vec->ctx != ctx || !plonk2_same_vector_shape(z_vec, ratio_vec)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + size_t n = ratio_vec->count; + size_t num_chunks = (n + 1023) / 1024; + size_t words = num_chunks * (size_t)ratio_vec->limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + gnark_gpu::plonk2::launch_z_prefix_phase1( + z_vec->curve, plonk2_view(z_vec), plonk2_const_view(ratio_vec), + ctx->plonk2_staging_buffer, n, ctx->stream); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return check_cuda(err); + err = cudaStreamSynchronize(ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + err = cudaMemcpy(chunk_products_host, ctx->plonk2_staging_buffer, + words * sizeof(uint64_t), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) return check_cuda(err); + *num_chunks_out = num_chunks; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_z_prefix_phase3( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t z_vec, + gnark_gpu_plonk2_fr_vector_t temp_vec, + const uint64_t *scanned_prefixes_host, + size_t num_chunks) { + if (!ctx || !z_vec || !temp_vec || !scanned_prefixes_host || + z_vec->ctx != ctx || !plonk2_same_vector_shape(z_vec, temp_vec)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + size_t words = num_chunks * (size_t)z_vec->limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + cudaError_t err = cudaMemcpyAsync(ctx->plonk2_staging_buffer, + scanned_prefixes_host, + words * sizeof(uint64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + + gnark_gpu::plonk2::launch_z_prefix_phase3( + z_vec->curve, plonk2_view(z_vec), plonk2_view(temp_vec), + ctx->plonk2_staging_buffer, num_chunks, z_vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_poly_eval_chunks( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t coeffs, + const uint64_t *z, + uint64_t *partials_host, + size_t *num_chunks_out) { + if (!ctx || !coeffs || !z || !partials_host || !num_chunks_out || + coeffs->ctx != ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + size_t n = coeffs->count; + if (n == 0) { + *num_chunks_out = 0; + return GNARK_GPU_SUCCESS; + } + size_t num_chunks = (n + 1023) / 1024; + size_t words = num_chunks * (size_t)coeffs->limbs; + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, words); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + gnark_gpu::plonk2::launch_poly_eval_chunks( + coeffs->curve, plonk2_const_view(coeffs), z, + ctx->plonk2_staging_buffer, n, ctx->stream); + cudaError_t err = cudaGetLastError(); + if (err != cudaSuccess) return check_cuda(err); + err = cudaStreamSynchronize(ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + err = cudaMemcpy(partials_host, ctx->plonk2_staging_buffer, + words * sizeof(uint64_t), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) return check_cuda(err); + *num_chunks_out = num_chunks; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_scale_by_powers( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *generator) { + if (!ctx || !vec || !generator || vec->ctx != ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + // THREADS=256 powers stored by local_power_table_kernel in staging buffer. + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, 256ULL * (size_t)vec->limbs); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + gnark_gpu::plonk2::launch_scale_by_powers( + vec->curve, plonk2_view(vec), generator, ctx->plonk2_staging_buffer, + vec->count, ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_domain_create( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + size_t size, + const uint64_t *fwd_twiddles_aos, + const uint64_t *inv_twiddles_aos, + const uint64_t *inv_n, + gnark_gpu_plonk2_ntt_domain_t *domain) { + if (!ctx || !domain || !inv_n || !plonk2_valid_curve(curve_id) || !plonk2_power_of_two(size)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (size > 1 && (!fwd_twiddles_aos || !inv_twiddles_aos)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + cudaError_t err = cudaSetDevice(ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + GnarkGPUPlonk2NTTDomain *d = new GnarkGPUPlonk2NTTDomain{}; + d->ctx = ctx; + d->curve = curve_id; + d->limbs = plonk2_limbs(curve_id); + d->size = size; + + size_t twiddle_count = size / 2; + for (int i = 0; i < d->limbs; i++) { + if (twiddle_count > 0) { + err = cudaMalloc(&d->fwd[i], twiddle_count * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + err = cudaMalloc(&d->inv[i], twiddle_count * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + } + } + + err = cudaMalloc(&d->inv_n, (size_t)d->limbs * sizeof(uint64_t)); + if (err != cudaSuccess) goto fail; + + err = cudaMemcpyAsync(d->inv_n, inv_n, (size_t)d->limbs * sizeof(uint64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) goto fail; + + if (twiddle_count > 0) { + gnark_gpu_error_t gerr = plonk2_upload_aos(ctx, curve_id, d->fwd, + fwd_twiddles_aos, twiddle_count); + if (gerr != GNARK_GPU_SUCCESS) { + gnark_gpu_plonk2_ntt_domain_destroy(d); + return gerr; + } + gerr = plonk2_upload_aos(ctx, curve_id, d->inv, + inv_twiddles_aos, twiddle_count); + if (gerr != GNARK_GPU_SUCCESS) { + gnark_gpu_plonk2_ntt_domain_destroy(d); + return gerr; + } + } + + *domain = d; + return GNARK_GPU_SUCCESS; + +fail: + gnark_gpu_plonk2_ntt_domain_destroy(d); + cudaGetLastError(); + return check_cuda(err); +} + +extern "C" void gnark_gpu_plonk2_ntt_domain_destroy(gnark_gpu_plonk2_ntt_domain_t domain) { + if (!domain) return; + for (int i = 0; i < domain->limbs; i++) { + if (domain->fwd[i]) cudaFree(domain->fwd[i]); + if (domain->inv[i]) cudaFree(domain->inv[i]); + } + if (domain->inv_n) cudaFree(domain->inv_n); + delete domain; +} + +extern "C" size_t gnark_gpu_plonk2_ntt_domain_size(gnark_gpu_plonk2_ntt_domain_t domain) { + return domain ? domain->size : 0; +} + +extern "C" gnark_gpu_plonk2_curve_id_t gnark_gpu_plonk2_ntt_domain_curve( + gnark_gpu_plonk2_ntt_domain_t domain) { + return domain ? domain->curve : (gnark_gpu_plonk2_curve_id_t)0; +} + +static gnark_gpu_error_t plonk2_check_domain_vector( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (domain->ctx != data->ctx || domain->curve != data->curve) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (domain->size != data->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_forward( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data) { + gnark_gpu_error_t gerr = plonk2_check_domain_vector(domain, data); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + gnark_gpu::plonk2::launch_ntt_forward(domain->curve, plonk2_view(data), + plonk2_const_view(domain->fwd), + domain->size, domain->ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_inverse( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data) { + gnark_gpu_error_t gerr = plonk2_check_domain_vector(domain, data); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + gnark_gpu::plonk2::launch_ntt_inverse(domain->curve, plonk2_view(data), + plonk2_const_view(domain->inv), + domain->inv_n, domain->size, + domain->ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_bit_reverse( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data) { + gnark_gpu_error_t gerr = plonk2_check_domain_vector(domain, data); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + gnark_gpu::plonk2::launch_bit_reverse(domain->curve, plonk2_view(data), + domain->size, domain->ctx->stream); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_test_g1_affine_add( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *p, + const uint64_t *q, + uint64_t *out) { + if (!ctx || !p || !q || !out || !plonk2_valid_curve(curve_id)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + cudaError_t err = gnark_gpu::plonk2::g1_affine_add_run( + curve_id, p, q, out, ctx->stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_test_g1_affine_double( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *p, + uint64_t *out) { + if (!ctx || !p || !out || !plonk2_valid_curve(curve_id)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + cudaError_t err = gnark_gpu::plonk2::g1_affine_double_run( + curve_id, p, out, ctx->stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_test_msm_naive( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *points, + const uint64_t *scalars, + size_t count, + uint64_t *out) { + if (!ctx || !points || !scalars || !out || count == 0 || !plonk2_valid_curve(curve_id)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + cudaError_t err = gnark_gpu::plonk2::msm_naive_run( + curve_id, points, scalars, count, out, ctx->stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_pippenger( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out) { + if (!ctx || !points || !scalars || !out || count == 0 || !plonk2_valid_curve(curve_id)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (window_bits <= 1 || window_bits > 24) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + cudaError_t err = gnark_gpu::plonk2::msm_pippenger_run( + curve_id, points, scalars, count, window_bits, out, ctx->stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_create( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_curve_id_t curve_id, + const uint64_t *points, + size_t point_count, + int window_bits, + gnark_gpu_plonk2_msm_t *msm) { + if (!ctx || !points || !msm || point_count == 0 || !plonk2_valid_curve(curve_id)) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (window_bits <= 1 || window_bits > 24) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + int base_limbs = plonk2_base_limbs(curve_id); + int scalar_limbs = plonk2_limbs(curve_id); + int scalar_bits = plonk2_scalar_bits(curve_id); + if (base_limbs == 0 || scalar_limbs == 0 || + scalar_bits == 0 || + plonk2_mul_overflows(point_count, (size_t)(2 * base_limbs))) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + int num_windows = plonk2_signed_window_count(scalar_bits, window_bits); + int num_buckets = 1 << (window_bits - 1); + if (num_windows <= 0 || num_buckets <= 0 || + num_windows > INT32_MAX / num_buckets) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + int total_buckets = num_windows * num_buckets; + if (plonk2_mul_overflows(point_count, (size_t)num_windows) || + point_count * (size_t)num_windows > UINT32_MAX) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + size_t assignments_capacity = point_count * (size_t)num_windows; + + GnarkGPUPlonk2MSM *m = new GnarkGPUPlonk2MSM{}; + m->ctx = ctx; + m->curve = curve_id; + m->base_limbs = base_limbs; + m->scalar_limbs = scalar_limbs; + m->point_count = point_count; + m->window_bits = window_bits; + m->num_windows = num_windows; + m->num_buckets = num_buckets; + m->total_buckets = total_buckets; + m->reduce_bpw = plonk2_reduce_blocks_per_window(num_windows, num_buckets); + m->assignments_capacity = assignments_capacity; + plonk2_msm_init_phase_events(m); + + size_t point_words = point_count * (size_t)(2 * base_limbs); + cudaError_t err = gnark_gpu::plonk2::msm_pippenger_sort_temp_bytes( + curve_id, point_count, window_bits, &m->sort_temp_bytes); + if (err != cudaSuccess) { + plonk2_msm_destroy_phase_events(m); + delete m; + return check_cuda(err); + } + + err = cudaMalloc(&m->d_points, point_words * sizeof(uint64_t)); + if (err != cudaSuccess) { + plonk2_msm_destroy_phase_events(m); + delete m; + cudaGetLastError(); + return check_cuda(err); + } + + err = cudaMemcpyAsync(m->d_points, points, point_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err == cudaSuccess) { + err = cudaStreamSynchronize(ctx->stream); + } + if (err != cudaSuccess) goto fail; + + *msm = m; + return GNARK_GPU_SUCCESS; + +fail: + plonk2_msm_free_all(m); + plonk2_msm_destroy_phase_events(m); + delete m; + return check_cuda(err); +} + +extern "C" void gnark_gpu_plonk2_msm_destroy(gnark_gpu_plonk2_msm_t msm) { + if (!msm) return; + plonk2_msm_free_all(msm); + plonk2_msm_destroy_phase_events(msm); + delete msm; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_pin_work_buffers( + gnark_gpu_plonk2_msm_t msm) { + if (!msm) return GNARK_GPU_ERROR_INVALID_ARG; + return check_cuda(plonk2_msm_alloc_work_buffers(msm)); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_release_work_buffers( + gnark_gpu_plonk2_msm_t msm) { + if (!msm) return GNARK_GPU_ERROR_INVALID_ARG; + cudaError_t err = cudaStreamSynchronize(msm->ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + plonk2_msm_free_work_buffers(msm); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_offload_points( + gnark_gpu_plonk2_msm_t msm) { + if (!msm) return GNARK_GPU_ERROR_INVALID_ARG; + cudaError_t err = cudaStreamSynchronize(msm->ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + if (msm->d_points) { + err = cudaFree(msm->d_points); + msm->d_points = nullptr; + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_reload_points( + gnark_gpu_plonk2_msm_t msm, + const uint64_t *points, + size_t point_count) { + if (!msm || !points || point_count != msm->point_count) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (msm->d_points) return GNARK_GPU_SUCCESS; + + if (plonk2_mul_overflows(point_count, (size_t)(2 * msm->base_limbs))) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + size_t point_words = point_count * (size_t)(2 * msm->base_limbs); + cudaError_t err = cudaMalloc(&msm->d_points, point_words * sizeof(uint64_t)); + if (err != cudaSuccess) { + msm->d_points = nullptr; + cudaGetLastError(); + return check_cuda(err); + } + + err = cudaMemcpyAsync(msm->d_points, points, point_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, msm->ctx->stream); + if (err == cudaSuccess) { + err = cudaStreamSynchronize(msm->ctx->stream); + } + if (err != cudaSuccess) { + cudaFree(msm->d_points); + msm->d_points = nullptr; + return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_msm_run( + gnark_gpu_plonk2_msm_t msm, + const uint64_t *scalars, + size_t count, + uint64_t *out) { + if (!msm || !msm->d_points || !scalars || !out || + count == 0 || count > msm->point_count) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if (!plonk2_msm_has_work_buffers(msm)) { + cudaError_t alloc_err = plonk2_msm_alloc_work_buffers(msm); + if (alloc_err != cudaSuccess) return check_cuda(alloc_err); + } + cudaError_t err = gnark_gpu::plonk2::msm_pippenger_device_points_prealloc_run( + msm->curve, msm->d_points, scalars, count, msm->window_bits, out, + msm->d_scalars, msm->d_out, msm->d_keys_in, msm->d_keys_out, + msm->d_vals_in, msm->d_vals_out, msm->d_bucket_offsets, + msm->d_bucket_ends, msm->d_overflow_buckets, msm->d_overflow_count, + msm->d_buckets, msm->d_window_results, msm->d_partial_totals, + msm->d_partial_sums, msm->d_sort_temp, msm->sort_temp_bytes, + msm->phase_events_valid ? msm->phase_event : nullptr, + msm->phase_events_valid ? msm->phase_timings_ms : nullptr, + msm->ctx->stream); + return check_cuda(err); +} + +extern "C" int gnark_gpu_plonk2_msm_get_phase_timings( + gnark_gpu_plonk2_msm_t msm, + float *out) { + if (!msm || !out) return 0; + for (int i = 0; i < 9; i++) out[i] = msm->phase_timings_ms[i]; + return 9; +} + +// ============================================================================= +// Stream-aware plonk2 primitives +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_copy_d2d_stream( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t dst, + gnark_gpu_plonk2_fr_vector_t src, + int stream_id) { + if (!ctx || !dst || !src || dst->ctx != ctx || !plonk2_same_vector_shape(dst, src)) + return GNARK_GPU_ERROR_INVALID_ARG; + cudaStream_t s = get_stream(ctx, stream_id); + if (!s) return GNARK_GPU_ERROR_INVALID_ARG; + for (int i = 0; i < dst->limbs; i++) { + cudaError_t err = cudaMemcpyAsync(dst->limbs_dev[i], src->limbs_dev[i], + dst->count * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, s); + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_forward_stream( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data, + int stream_id) { + gnark_gpu_error_t gerr = plonk2_check_domain_vector(domain, data); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + cudaStream_t s = get_stream(domain->ctx, stream_id); + if (!s) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::plonk2::launch_ntt_forward(domain->curve, plonk2_view(data), + plonk2_const_view(domain->fwd), + domain->size, s); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_inverse_stream( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data, + int stream_id) { + gnark_gpu_error_t gerr = plonk2_check_domain_vector(domain, data); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + cudaStream_t s = get_stream(domain->ctx, stream_id); + if (!s) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::plonk2::launch_ntt_inverse(domain->curve, plonk2_view(data), + plonk2_const_view(domain->inv), + domain->inv_n, domain->size, s); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_ntt_bit_reverse_stream( + gnark_gpu_plonk2_ntt_domain_t domain, + gnark_gpu_plonk2_fr_vector_t data, + int stream_id) { + gnark_gpu_error_t gerr = plonk2_check_domain_vector(domain, data); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + cudaStream_t s = get_stream(domain->ctx, stream_id); + if (!s) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::plonk2::launch_bit_reverse(domain->curve, plonk2_view(data), + domain->size, s); + return check_cuda(cudaGetLastError()); +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk2_fr_vector_scale_by_powers_stream( + gnark_gpu_context_t ctx, + gnark_gpu_plonk2_fr_vector_t vec, + const uint64_t *generator, + int stream_id) { + if (!ctx || !vec || !generator || vec->ctx != ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + cudaStream_t s = get_stream(ctx, stream_id); + if (!s) return GNARK_GPU_ERROR_INVALID_ARG; + + // THREADS=256 powers stored by local_power_table_kernel in staging buffer. + gnark_gpu_error_t gerr = ensure_plonk2_staging_words(ctx, 256ULL * (size_t)vec->limbs); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + gnark_gpu::plonk2::launch_scale_by_powers( + vec->curve, plonk2_view(vec), generator, ctx->plonk2_staging_buffer, + vec->count, s); + return check_cuda(cudaGetLastError()); +} + +// ============================================================================= +// New Fr vector operations (ScaleByPowers, ScalarMul, D2D copy, SetZero, AddMul) +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_scale_by_powers(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + const uint64_t g[4]) { + if (!ctx || !v || !g) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::launch_scale_by_powers(v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + g, v->count, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_scalar_mul(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + const uint64_t c[4]) { + if (!ctx || !v || !c) return GNARK_GPU_ERROR_INVALID_ARG; + gnark_gpu::launch_scalar_mul(v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + c, v->count, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_copy_d2d(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src) { + if (!ctx || !dst || !src) return GNARK_GPU_ERROR_INVALID_ARG; + if (dst->count != src->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = ctx->stream; + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMemcpyAsync(dst->limbs[i], src->limbs[i], + dst->count * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_set_zero(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v) { + if (!ctx || !v) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = ctx->stream; + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMemsetAsync(v->limbs[i], 0, + v->count * sizeof(uint64_t), stream); + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_addmul(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, + gnark_gpu_fr_vector_t b) { + if (!ctx || !v || !a || !b) return GNARK_GPU_ERROR_INVALID_ARG; + if (v->count != a->count || a->count != b->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_addmul(v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + a->limbs[0], a->limbs[1], a->limbs[2], a->limbs[3], + b->limbs[0], b->limbs[1], b->limbs[2], b->limbs[3], + a->count, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_batch_invert(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t temp) { + if (!ctx || !v || !temp) return GNARK_GPU_ERROR_INVALID_ARG; + if (v->count != temp->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaError_t err = gnark_gpu::launch_batch_invert( + v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + temp->limbs[0], temp->limbs[1], temp->limbs[2], temp->limbs[3], + v->count, ctx->stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_butterfly4(gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t b0, + gnark_gpu_fr_vector_t b1, + gnark_gpu_fr_vector_t b2, + gnark_gpu_fr_vector_t b3, + const uint64_t omega4_inv[4], + const uint64_t quarter[4]) { + if (!ctx || !b0 || !b1 || !b2 || !b3 || !omega4_inv || !quarter) + return GNARK_GPU_ERROR_INVALID_ARG; + if (b0->count != b1->count || b1->count != b2->count || b2->count != b3->count) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_butterfly4( + b0->limbs[0], b0->limbs[1], b0->limbs[2], b0->limbs[3], + b1->limbs[0], b1->limbs[1], b1->limbs[2], b1->limbs[3], + b2->limbs[0], b2->limbs[1], b2->limbs[2], b2->limbs[3], + b3->limbs[0], b3->limbs[1], b3->limbs[2], b3->limbs[3], + omega4_inv, quarter, b0->count, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// PlonK fused constraint kernel +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_plonk_perm_boundary( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t L, gnark_gpu_fr_vector_t R, gnark_gpu_fr_vector_t O, + gnark_gpu_fr_vector_t Z, + gnark_gpu_fr_vector_t S1, gnark_gpu_fr_vector_t S2, gnark_gpu_fr_vector_t S3, + gnark_gpu_fr_vector_t L1_denInv, + const uint64_t params[28], + gnark_gpu_ntt_domain_t domain) { + if (!ctx || !result || !L || !R || !O || !Z || !S1 || !S2 || !S3 || + !L1_denInv || !params || !domain) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = result->count; + if (L->count != n || R->count != n || O->count != n || Z->count != n || + S1->count != n || S2->count != n || S3->count != n || L1_denInv->count != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + if (domain->size != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + // Pack scalar params into struct. Layout: alpha, beta, gamma, l1_scalar, + // coset_shift, coset_shift_sq, coset_gen — each 4 uint64s. + gnark_gpu::PlonkPermBoundaryParams p; + for (int j = 0; j < 4; j++) { + p.alpha[j] = params[0*4 + j]; + p.beta[j] = params[1*4 + j]; + p.gamma[j] = params[2*4 + j]; + p.l1_scalar[j] = params[3*4 + j]; + p.coset_shift[j] = params[4*4 + j]; + p.coset_shift_sq[j] = params[5*4 + j]; + p.coset_gen[j] = params[6*4 + j]; + } + + // Get forward twiddle pointers from NTT domain via accessor + const uint64_t *tw[4]; + gnark_gpu::ntt_get_fwd_twiddles(domain->ntt_dom, tw); + + gnark_gpu::launch_plonk_perm_boundary( + result->limbs[0], result->limbs[1], result->limbs[2], result->limbs[3], + L->limbs[0], L->limbs[1], L->limbs[2], L->limbs[3], + R->limbs[0], R->limbs[1], R->limbs[2], R->limbs[3], + O->limbs[0], O->limbs[1], O->limbs[2], O->limbs[3], + Z->limbs[0], Z->limbs[1], Z->limbs[2], Z->limbs[3], + S1->limbs[0], S1->limbs[1], S1->limbs[2], S1->limbs[3], + S2->limbs[0], S2->limbs[1], S2->limbs[2], S2->limbs[3], + S3->limbs[0], S3->limbs[1], S3->limbs[2], S3->limbs[3], + L1_denInv->limbs[0], L1_denInv->limbs[1], L1_denInv->limbs[2], L1_denInv->limbs[3], + p, tw[0], tw[1], tw[2], tw[3], + n, ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Device memory helpers +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_device_alloc_copy_int64(gnark_gpu_context_t ctx, + const int64_t *host_data, size_t count, + void **d_ptr) { + if (!ctx || !host_data || count == 0 || !d_ptr) return GNARK_GPU_ERROR_INVALID_ARG; + + int64_t *dev_buf = nullptr; + cudaError_t err = cudaMalloc(&dev_buf, count * sizeof(int64_t)); + if (err != cudaSuccess) return check_cuda(err); + + err = cudaMemcpyAsync(dev_buf, host_data, count * sizeof(int64_t), + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) { + cudaFree(dev_buf); + return check_cuda(err); + } + + err = cudaStreamSynchronize(ctx->stream); + if (err != cudaSuccess) { + cudaFree(dev_buf); + return check_cuda(err); + } + + *d_ptr = dev_buf; + return GNARK_GPU_SUCCESS; +} + +extern "C" void gnark_gpu_device_free_ptr(void *d_ptr) { + if (d_ptr) { + cudaFree(d_ptr); + } +} + +// ============================================================================= +// PlonK Z-polynomial ratio computation +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_plonk_z_compute_factors( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t L_inout, + gnark_gpu_fr_vector_t R_inout, + gnark_gpu_fr_vector_t O_in, + const void *d_perm, + const uint64_t params[16], + unsigned log2n, + gnark_gpu_ntt_domain_t domain) { + if (!ctx || !L_inout || !R_inout || !O_in || !d_perm || !params || !domain) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = L_inout->count; + if (R_inout->count != n || O_in->count != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + if (domain->size != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::PlonkZRatioParams p; + for (int j = 0; j < 4; j++) { + p.beta[j] = params[0*4 + j]; + p.gamma[j] = params[1*4 + j]; + p.g_mul[j] = params[2*4 + j]; + p.g_sq[j] = params[3*4 + j]; + } + + const uint64_t *tw[4]; + gnark_gpu::ntt_get_fwd_twiddles(domain->ntt_dom, tw); + + gnark_gpu::launch_plonk_z_ratio( + L_inout->limbs[0], L_inout->limbs[1], L_inout->limbs[2], L_inout->limbs[3], + R_inout->limbs[0], R_inout->limbs[1], R_inout->limbs[2], R_inout->limbs[3], + O_in->limbs[0], O_in->limbs[1], O_in->limbs[2], O_in->limbs[3], + static_cast(d_perm), + p, tw[0], tw[1], tw[2], tw[3], + n, log2n, ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Pinned memory management +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_alloc_pinned(void **ptr, size_t bytes) { + if (!ptr || bytes == 0) return GNARK_GPU_ERROR_INVALID_ARG; + return check_cuda(cudaHostAlloc(ptr, bytes, cudaHostAllocDefault)); +} + +extern "C" void gnark_gpu_free_pinned(void *ptr) { + if (ptr) { + cudaFreeHost(ptr); + } +} + +// ============================================================================= +// GPU L1 denominator computation +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_compute_l1_den( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t out, + const uint64_t coset_gen[4], + gnark_gpu_ntt_domain_t domain) { + if (!ctx || !out || !coset_gen || !domain) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = out->count; + if (domain->size != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + const uint64_t *tw[4]; + gnark_gpu::ntt_get_fwd_twiddles(domain->ntt_dom, tw); + + gnark_gpu::launch_compute_l1_den( + out->limbs[0], out->limbs[1], out->limbs[2], out->limbs[3], + tw[0], tw[1], tw[2], tw[3], + coset_gen, n, ctx->stream); + + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Patch elements (write a few AoS host elements into SoA GPU vector) +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_patch( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t vec, + size_t offset, + const uint64_t *host_data_aos, + size_t count) { + if (!ctx || !vec || !host_data_aos || count == 0) + return GNARK_GPU_ERROR_INVALID_ARG; + if (offset + count > vec->count) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = ctx->stream; + + // For each element, copy its 4 limbs to the correct SoA positions. + // Each element in AoS is [limb0, limb1, limb2, limb3]. + for (size_t i = 0; i < count; i++) { + for (int limb = 0; limb < 4; limb++) { + cudaError_t err = cudaMemcpyAsync( + vec->limbs[limb] + offset + i, + host_data_aos + i * 4 + limb, + sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) return check_cuda(err); + } + } + + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Synchronization +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_sync(gnark_gpu_context_t ctx) { + if (!ctx) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + cudaError_t err = cudaStreamSynchronize(ctx->stream); + return check_cuda(err); +} + +// ============================================================================= +// Multi-stream API +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_create_stream(gnark_gpu_context_t ctx, int stream_id) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id < 0 || stream_id >= GNARK_GPU_MAX_STREAMS) return GNARK_GPU_ERROR_INVALID_ARG; + if (ctx->stream_created[stream_id]) return GNARK_GPU_SUCCESS; // already created + + cudaError_t err = cudaSetDevice(ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + err = cudaStreamCreate(&ctx->streams[stream_id]); + if (err != cudaSuccess) return check_cuda(err); + + ctx->stream_created[stream_id] = true; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_record_event(gnark_gpu_context_t ctx, + int stream_id, int event_id) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + if (event_id < 0 || event_id >= GNARK_GPU_MAX_EVENTS) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + // Lazily create the event + if (!ctx->event_created[event_id]) { + cudaError_t err = cudaEventCreateWithFlags(&ctx->events[event_id], cudaEventDisableTiming); + if (err != cudaSuccess) return check_cuda(err); + ctx->event_created[event_id] = true; + } + + cudaError_t err = cudaEventRecord(ctx->events[event_id], stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_wait_event(gnark_gpu_context_t ctx, + int stream_id, int event_id) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + if (event_id < 0 || event_id >= GNARK_GPU_MAX_EVENTS) return GNARK_GPU_ERROR_INVALID_ARG; + if (!ctx->event_created[event_id]) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + cudaError_t err = cudaStreamWaitEvent(stream, ctx->events[event_id], 0); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_sync_stream(gnark_gpu_context_t ctx, int stream_id) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + cudaError_t err = cudaStreamSynchronize(stream); + return check_cuda(err); +} + +// ============================================================================= +// Stream-aware data transfer +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_device_stream( + gnark_gpu_fr_vector_t vec, const uint64_t *host_data, + size_t count, int stream_id) { + if (!vec || !host_data) return GNARK_GPU_ERROR_INVALID_ARG; + if (count != vec->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + GnarkGPUContext *ctx = vec->ctx; + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu_error_t gerr = ensure_staging(ctx, count); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + cudaError_t err = cudaMemcpyAsync(ctx->staging_buffer, host_data, + count * 4 * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) return check_cuda(err); + + gnark_gpu::launch_transpose_aos_to_soa_fr(vec->limbs[0], vec->limbs[1], vec->limbs[2], + vec->limbs[3], ctx->staging_buffer, count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_copy_to_host_stream( + gnark_gpu_fr_vector_t vec, uint64_t *host_data, + size_t count, int stream_id) { + if (!vec || !host_data) return GNARK_GPU_ERROR_INVALID_ARG; + if (count != vec->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + GnarkGPUContext *ctx = vec->ctx; + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu_error_t gerr = ensure_staging(ctx, count); + if (gerr != GNARK_GPU_SUCCESS) return gerr; + + gnark_gpu::launch_transpose_soa_to_aos_fr(ctx->staging_buffer, vec->limbs[0], vec->limbs[1], + vec->limbs[2], vec->limbs[3], count, stream); + + cudaError_t err = cudaMemcpyAsync(host_data, ctx->staging_buffer, + count * 4 * sizeof(uint64_t), + cudaMemcpyDeviceToHost, stream); + if (err != cudaSuccess) return check_cuda(err); + + err = cudaStreamSynchronize(stream); + return check_cuda(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_copy_d2d_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src, int stream_id) { + if (!ctx || !dst || !src) return GNARK_GPU_ERROR_INVALID_ARG; + if (dst->count != src->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMemcpyAsync(dst->limbs[i], src->limbs[i], + dst->count * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Stream-aware NTT operations +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_forward_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + int stream_id) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(domain->ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = domain->ctx->streams[0]; + + gnark_gpu::launch_ntt_forward(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_inverse_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + int stream_id) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(domain->ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = domain->ctx->streams[0]; + + gnark_gpu::launch_ntt_inverse(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_ntt_bit_reverse_stream(gnark_gpu_ntt_domain_t domain, + gnark_gpu_fr_vector_t data, + int stream_id) { + if (!domain || !data) return GNARK_GPU_ERROR_INVALID_ARG; + if (data->count != domain->size) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(domain->ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = domain->ctx->streams[0]; + + gnark_gpu::launch_ntt_bit_reverse(domain->ntt_dom, + data->limbs[0], data->limbs[1], data->limbs[2], data->limbs[3], stream); + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Stream-aware arithmetic operations +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_scale_by_powers_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + const uint64_t g[4], int stream_id) { + if (!ctx || !v || !g) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_scale_by_powers(v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + g, v->count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_scalar_mul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + const uint64_t c[4], int stream_id) { + if (!ctx || !v || !c) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_scalar_mul(v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + c, v->count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_set_zero_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, int stream_id) { + if (!ctx || !v) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMemsetAsync(v->limbs[i], 0, + v->count * sizeof(uint64_t), stream); + if (err != cudaSuccess) return check_cuda(err); + } + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_add_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id) { + if (!ctx || !result || !a || !b) return GNARK_GPU_ERROR_INVALID_ARG; + if (result->count != a->count || a->count != b->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_add_fr(result->limbs[0], result->limbs[1], result->limbs[2], + result->limbs[3], a->limbs[0], a->limbs[1], a->limbs[2], + a->limbs[3], b->limbs[0], b->limbs[1], b->limbs[2], + b->limbs[3], a->count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_sub_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id) { + if (!ctx || !result || !a || !b) return GNARK_GPU_ERROR_INVALID_ARG; + if (result->count != a->count || a->count != b->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_sub_fr(result->limbs[0], result->limbs[1], result->limbs[2], + result->limbs[3], a->limbs[0], a->limbs[1], a->limbs[2], + a->limbs[3], b->limbs[0], b->limbs[1], b->limbs[2], + b->limbs[3], a->count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_mul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id) { + if (!ctx || !result || !a || !b) return GNARK_GPU_ERROR_INVALID_ARG; + if (result->count != a->count || a->count != b->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_mul_mont_fr(result->limbs[0], result->limbs[1], result->limbs[2], + result->limbs[3], a->limbs[0], a->limbs[1], a->limbs[2], + a->limbs[3], b->limbs[0], b->limbs[1], b->limbs[2], + b->limbs[3], a->count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_addmul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, gnark_gpu_fr_vector_t b, int stream_id) { + if (!ctx || !v || !a || !b) return GNARK_GPU_ERROR_INVALID_ARG; + if (v->count != a->count || a->count != b->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_addmul(v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + a->limbs[0], a->limbs[1], a->limbs[2], a->limbs[3], + b->limbs[0], b->limbs[1], b->limbs[2], b->limbs[3], + a->count, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_batch_invert_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t temp, int stream_id) { + if (!ctx || !v || !temp) return GNARK_GPU_ERROR_INVALID_ARG; + if (v->count != temp->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + cudaError_t err = gnark_gpu::launch_batch_invert( + v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + temp->limbs[0], temp->limbs[1], temp->limbs[2], temp->limbs[3], + v->count, stream); + return check_cuda(err); +} + +// ============================================================================= +// AddScalarMul: v[i] += a[i] * scalar +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_add_scalar_mul( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, const uint64_t scalar[4]) { + if (!ctx || !v || !a || !scalar) return GNARK_GPU_ERROR_INVALID_ARG; + if (v->count != a->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_add_scalar_mul( + v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + a->limbs[0], a->limbs[1], a->limbs[2], a->limbs[3], + scalar, a->count, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_fr_vector_add_scalar_mul_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t v, + gnark_gpu_fr_vector_t a, const uint64_t scalar[4], int stream_id) { + if (!ctx || !v || !a || !scalar) return GNARK_GPU_ERROR_INVALID_ARG; + if (v->count != a->count) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::launch_add_scalar_mul( + v->limbs[0], v->limbs[1], v->limbs[2], v->limbs[3], + a->limbs[0], a->limbs[1], a->limbs[2], a->limbs[3], + scalar, a->count, stream); + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Stream-aware PlonK operations +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_compute_l1_den_stream( + gnark_gpu_context_t ctx, gnark_gpu_fr_vector_t out, + const uint64_t coset_gen[4], gnark_gpu_ntt_domain_t domain, + int stream_id) { + if (!ctx || !out || !coset_gen || !domain) return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = out->count; + if (domain->size != n) return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + const uint64_t *tw[4]; + gnark_gpu::ntt_get_fwd_twiddles(domain->ntt_dom, tw); + + gnark_gpu::launch_compute_l1_den( + out->limbs[0], out->limbs[1], out->limbs[2], out->limbs[3], + tw[0], tw[1], tw[2], tw[3], + coset_gen, n, stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_plonk_perm_boundary_stream( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t L, gnark_gpu_fr_vector_t R, gnark_gpu_fr_vector_t O, + gnark_gpu_fr_vector_t Z, + gnark_gpu_fr_vector_t S1, gnark_gpu_fr_vector_t S2, gnark_gpu_fr_vector_t S3, + gnark_gpu_fr_vector_t L1_denInv, + const uint64_t params[28], + gnark_gpu_ntt_domain_t domain, int stream_id) { + if (!ctx || !result || !L || !R || !O || !Z || !S1 || !S2 || !S3 || + !L1_denInv || !params || !domain) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = result->count; + if (L->count != n || R->count != n || O->count != n || Z->count != n || + S1->count != n || S2->count != n || S3->count != n || L1_denInv->count != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + if (domain->size != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + cudaStream_t stream = get_stream(ctx, stream_id); + if (!stream && stream_id != 0) return GNARK_GPU_ERROR_INVALID_ARG; + if (stream_id == 0) stream = ctx->streams[0]; + + gnark_gpu::PlonkPermBoundaryParams p; + for (int j = 0; j < 4; j++) { + p.alpha[j] = params[0*4 + j]; + p.beta[j] = params[1*4 + j]; + p.gamma[j] = params[2*4 + j]; + p.l1_scalar[j] = params[3*4 + j]; + p.coset_shift[j] = params[4*4 + j]; + p.coset_shift_sq[j] = params[5*4 + j]; + p.coset_gen[j] = params[6*4 + j]; + } + + const uint64_t *tw[4]; + gnark_gpu::ntt_get_fwd_twiddles(domain->ntt_dom, tw); + + gnark_gpu::launch_plonk_perm_boundary( + result->limbs[0], result->limbs[1], result->limbs[2], result->limbs[3], + L->limbs[0], L->limbs[1], L->limbs[2], L->limbs[3], + R->limbs[0], R->limbs[1], R->limbs[2], R->limbs[3], + O->limbs[0], O->limbs[1], O->limbs[2], O->limbs[3], + Z->limbs[0], Z->limbs[1], Z->limbs[2], Z->limbs[3], + S1->limbs[0], S1->limbs[1], S1->limbs[2], S1->limbs[3], + S2->limbs[0], S2->limbs[1], S2->limbs[2], S2->limbs[3], + S3->limbs[0], S3->limbs[1], S3->limbs[2], S3->limbs[3], + L1_denInv->limbs[0], L1_denInv->limbs[1], L1_denInv->limbs[2], L1_denInv->limbs[3], + p, tw[0], tw[1], tw[2], tw[3], + n, stream); + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// GPU Z prefix product +// ============================================================================= + +// Helper: ensure Z prefix scratch buffers are large enough. +static cudaError_t z_prefix_scratch_ensure(ZPrefixScratch &s, size_t num_chunks) { + if (num_chunks <= s.capacity) return cudaSuccess; + + for (int i = 0; i < 4; i++) { + if (s.cp[i]) { cudaFree(s.cp[i]); s.cp[i] = nullptr; } + if (s.sp[i]) { cudaFree(s.sp[i]); s.sp[i] = nullptr; } + } + s.capacity = 0; + + size_t alloc = num_chunks < 64 ? 64 : num_chunks; + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMalloc(&s.cp[i], alloc * sizeof(uint64_t)); + if (err != cudaSuccess) return err; + err = cudaMalloc(&s.sp[i], alloc * sizeof(uint64_t)); + if (err != cudaSuccess) return err; + } + s.capacity = alloc; + return cudaSuccess; +} + +extern "C" gnark_gpu_error_t gnark_gpu_z_prefix_phase1( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t z_vec, + gnark_gpu_fr_vector_t ratio_vec, + uint64_t *chunk_products_host, + size_t *num_chunks_out) { + if (!ctx || !z_vec || !ratio_vec || !chunk_products_host || !num_chunks_out) + return GNARK_GPU_ERROR_INVALID_ARG; + if (z_vec->count != ratio_vec->count) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + size_t n = ratio_vec->count; + size_t num_chunks = (n + 1023) / 1024; + + // Ensure context-owned scratch buffers are large enough. + cudaError_t err = z_prefix_scratch_ensure(ctx->z_prefix_scratch, num_chunks); + if (err != cudaSuccess) return check_cuda(err); + + err = gnark_gpu::launch_z_prefix_phase1( + z_vec->limbs[0], z_vec->limbs[1], z_vec->limbs[2], z_vec->limbs[3], + ratio_vec->limbs[0], ratio_vec->limbs[1], ratio_vec->limbs[2], ratio_vec->limbs[3], + ctx->z_prefix_scratch.cp, n, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + + // Sync to ensure kernel is done before downloading chunk products. + err = cudaStreamSynchronize(ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + + // Bulk download: 4 per-limb cudaMemcpy + host SoA→AoS transpose. + // Reuse a temporary host buffer for per-limb contiguous data. + std::vector limb_buf(num_chunks); + for (int limb = 0; limb < 4; limb++) { + err = cudaMemcpy(limb_buf.data(), ctx->z_prefix_scratch.cp[limb], + num_chunks * sizeof(uint64_t), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) return check_cuda(err); + // Scatter into AoS host layout: cpHost[c*4 + limb] = limb_buf[c] + for (size_t c = 0; c < num_chunks; c++) { + chunk_products_host[c * 4 + limb] = limb_buf[c]; + } + } + + *num_chunks_out = num_chunks; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_z_prefix_phase3( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t z_vec, + gnark_gpu_fr_vector_t temp_vec, + const uint64_t *scanned_prefixes_host, + size_t num_chunks) { + if (!ctx || !z_vec || !temp_vec || !scanned_prefixes_host) + return GNARK_GPU_ERROR_INVALID_ARG; + if (z_vec->count != temp_vec->count) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + size_t n = z_vec->count; + + // Ensure scratch is available (should already be from phase1, but be safe). + cudaError_t err = z_prefix_scratch_ensure(ctx->z_prefix_scratch, num_chunks); + if (err != cudaSuccess) return check_cuda(err); + + // Bulk upload: host AoS → gather per-limb → 4 cudaMemcpy. + std::vector limb_buf(num_chunks); + for (int limb = 0; limb < 4; limb++) { + for (size_t c = 0; c < num_chunks; c++) { + limb_buf[c] = scanned_prefixes_host[c * 4 + limb]; + } + err = cudaMemcpy(ctx->z_prefix_scratch.sp[limb], limb_buf.data(), + num_chunks * sizeof(uint64_t), cudaMemcpyHostToDevice); + if (err != cudaSuccess) return check_cuda(err); + } + + err = gnark_gpu::launch_z_prefix_phase3( + z_vec->limbs[0], z_vec->limbs[1], z_vec->limbs[2], z_vec->limbs[3], + temp_vec->limbs[0], temp_vec->limbs[1], temp_vec->limbs[2], temp_vec->limbs[3], + ctx->z_prefix_scratch.sp, num_chunks, n, ctx->stream); + return check_cuda(err); +} + +// ============================================================================= +// GPU polynomial evaluation (chunked Horner) +// ============================================================================= + +// Helper: ensure poly eval scratch buffers are large enough. +static cudaError_t poly_eval_scratch_ensure(PolyEvalScratch &s, size_t num_chunks) { + if (num_chunks <= s.capacity) return cudaSuccess; + + for (int i = 0; i < 4; i++) { + if (s.out[i]) { cudaFree(s.out[i]); s.out[i] = nullptr; } + } + s.capacity = 0; + + size_t alloc = num_chunks < 64 ? 64 : num_chunks; + for (int i = 0; i < 4; i++) { + cudaError_t err = cudaMalloc(&s.out[i], alloc * sizeof(uint64_t)); + if (err != cudaSuccess) return err; + } + s.capacity = alloc; + return cudaSuccess; +} + +static gnark_gpu_error_t poly_eval_chunks_impl( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t coeffs, + const uint64_t z[4], + uint64_t *partials_host, + size_t *num_chunks_out, + cudaStream_t stream) { + if (!ctx || !coeffs || !z || !partials_host || !num_chunks_out) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = coeffs->count; + if (n == 0) { + *num_chunks_out = 0; + return GNARK_GPU_SUCCESS; + } + + size_t nc = (n + 1023) / 1024; + + // Ensure context-owned scratch buffers are large enough. + cudaError_t err = poly_eval_scratch_ensure(ctx->poly_eval_scratch, nc); + if (err != cudaSuccess) return check_cuda(err); + + uint64_t *d_out[4] = {ctx->poly_eval_scratch.out[0], ctx->poly_eval_scratch.out[1], + ctx->poly_eval_scratch.out[2], ctx->poly_eval_scratch.out[3]}; + + size_t nc_out; + gnark_gpu::launch_poly_eval_chunks( + coeffs->limbs[0], coeffs->limbs[1], coeffs->limbs[2], coeffs->limbs[3], + z, d_out[0], d_out[1], d_out[2], d_out[3], + n, &nc_out, stream); + + // Synchronize to ensure kernel is done before downloading + err = cudaStreamSynchronize(stream); + if (err != cudaSuccess) return check_cuda(err); + + // Bulk download: 4 per-limb cudaMemcpy + host SoA→AoS transpose. + std::vector limb_buf(nc_out); + for (int limb = 0; limb < 4; limb++) { + err = cudaMemcpy(limb_buf.data(), d_out[limb], + nc_out * sizeof(uint64_t), cudaMemcpyDeviceToHost); + if (err != cudaSuccess) return check_cuda(err); + for (size_t c = 0; c < nc_out; c++) { + partials_host[c * 4 + limb] = limb_buf[c]; + } + } + + *num_chunks_out = nc_out; + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_poly_eval_chunks( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t coeffs, + const uint64_t z[4], + uint64_t *partials_host, + size_t *num_chunks_out) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + return poly_eval_chunks_impl(ctx, coeffs, z, partials_host, num_chunks_out, ctx->stream); +} + +extern "C" gnark_gpu_error_t gnark_gpu_poly_eval_chunks_stream( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t coeffs, + const uint64_t z[4], + uint64_t *partials_host, + size_t *num_chunks_out, + int stream_id) { + if (!ctx) return GNARK_GPU_ERROR_INVALID_ARG; + cudaStream_t stream = get_stream(ctx, stream_id); + return poly_eval_chunks_impl(ctx, coeffs, z, partials_host, num_chunks_out, stream); +} + +// ============================================================================= +// Fused gate constraint accumulation for PlonK quotient +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_plonk_gate_accum( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t result, + gnark_gpu_fr_vector_t Ql, gnark_gpu_fr_vector_t Qr, + gnark_gpu_fr_vector_t Qm, gnark_gpu_fr_vector_t Qo, + gnark_gpu_fr_vector_t Qk, + gnark_gpu_fr_vector_t L, gnark_gpu_fr_vector_t R, gnark_gpu_fr_vector_t O, + const uint64_t zhKInv[4]) { + if (!ctx || !result || !Ql || !Qr || !Qm || !Qo || !Qk || + !L || !R || !O || !zhKInv) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = result->count; + if (Ql->count != n || Qr->count != n || Qm->count != n || Qo->count != n || + Qk->count != n || L->count != n || R->count != n || O->count != n) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + gnark_gpu::launch_plonk_gate_accum( + result->limbs[0], result->limbs[1], result->limbs[2], result->limbs[3], + Ql->limbs[0], Ql->limbs[1], Ql->limbs[2], Ql->limbs[3], + Qr->limbs[0], Qr->limbs[1], Qr->limbs[2], Qr->limbs[3], + Qm->limbs[0], Qm->limbs[1], Qm->limbs[2], Qm->limbs[3], + Qo->limbs[0], Qo->limbs[1], Qo->limbs[2], Qo->limbs[3], + Qk->limbs[0], Qk->limbs[1], Qk->limbs[2], Qk->limbs[3], + L->limbs[0], L->limbs[1], L->limbs[2], L->limbs[3], + R->limbs[0], R->limbs[1], R->limbs[2], R->limbs[3], + O->limbs[0], O->limbs[1], O->limbs[2], O->limbs[3], + zhKInv, n, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// Reduce blinded polynomial for coset evaluation +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_reduce_blinded_coset( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src, + const uint64_t *blinding_tail_host, + size_t tail_len, + const uint64_t cosetPowN[4]) { + if (!ctx || !dst || !src || !cosetPowN) + return GNARK_GPU_ERROR_INVALID_ARG; + if (dst->count != src->count) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + + size_t n = src->count; + + // Upload tiny tail to device (typically 2-3 elements = 64-96 bytes) + uint64_t *d_tail = nullptr; + if (tail_len > 0 && blinding_tail_host) { + size_t tail_bytes = tail_len * 4 * sizeof(uint64_t); + auto err = cudaMallocAsync(&d_tail, tail_bytes, ctx->stream); + if (err != cudaSuccess) return check_cuda(err); + err = cudaMemcpyAsync(d_tail, blinding_tail_host, tail_bytes, + cudaMemcpyHostToDevice, ctx->stream); + if (err != cudaSuccess) { cudaFreeAsync(d_tail, ctx->stream); return check_cuda(err); } + } + + gnark_gpu::launch_reduce_blinded_coset( + dst->limbs[0], dst->limbs[1], dst->limbs[2], dst->limbs[3], + src->limbs[0], src->limbs[1], src->limbs[2], src->limbs[3], + cosetPowN, d_tail, (uint32_t)tail_len, (uint32_t)n, ctx->stream); + + if (d_tail) cudaFreeAsync(d_tail, ctx->stream); + return GNARK_GPU_SUCCESS; +} + +extern "C" gnark_gpu_error_t gnark_gpu_reduce_blinded_coset_stream( + gnark_gpu_context_t ctx, + gnark_gpu_fr_vector_t dst, + gnark_gpu_fr_vector_t src, + const uint64_t *blinding_tail_host, + size_t tail_len, + const uint64_t cosetPowN[4], + int stream_id) { + if (!ctx || !dst || !src || !cosetPowN) + return GNARK_GPU_ERROR_INVALID_ARG; + if (dst->count != src->count) + return GNARK_GPU_ERROR_SIZE_MISMATCH; + if (stream_id < 0 || stream_id >= GNARK_GPU_MAX_STREAMS || !ctx->stream_created[stream_id]) + return GNARK_GPU_ERROR_INVALID_ARG; + + size_t n = src->count; + cudaStream_t stream = ctx->streams[stream_id]; + + uint64_t *d_tail = nullptr; + if (tail_len > 0 && blinding_tail_host) { + size_t tail_bytes = tail_len * 4 * sizeof(uint64_t); + auto err = cudaMallocAsync(&d_tail, tail_bytes, stream); + if (err != cudaSuccess) return check_cuda(err); + err = cudaMemcpyAsync(d_tail, blinding_tail_host, tail_bytes, + cudaMemcpyHostToDevice, stream); + if (err != cudaSuccess) { cudaFreeAsync(d_tail, stream); return check_cuda(err); } + } + + gnark_gpu::launch_reduce_blinded_coset( + dst->limbs[0], dst->limbs[1], dst->limbs[2], dst->limbs[3], + src->limbs[0], src->limbs[1], src->limbs[2], src->limbs[3], + cosetPowN, d_tail, (uint32_t)tail_len, (uint32_t)n, stream); + + if (d_tail) cudaFreeAsync(d_tail, stream); + return GNARK_GPU_SUCCESS; +} + +// ============================================================================= +// GPU memory info +// ============================================================================= + +extern "C" gnark_gpu_error_t gnark_gpu_mem_get_info(gnark_gpu_context_t ctx, + size_t *free_bytes, size_t *total_bytes) { + if (!ctx || !free_bytes || !total_bytes) return GNARK_GPU_ERROR_INVALID_ARG; + + cudaError_t err = cudaSetDevice(ctx->device_id); + if (err != cudaSuccess) return check_cuda(err); + + err = cudaMemGetInfo(free_bytes, total_bytes); + return check_cuda(err); +} diff --git a/prover/gpu/cuda/src/plonk/ec.cuh b/prover/gpu/cuda/src/plonk/ec.cuh new file mode 100644 index 00000000000..effcf058057 --- /dev/null +++ b/prover/gpu/cuda/src/plonk/ec.cuh @@ -0,0 +1,539 @@ +#pragma once + +// ───────────────────────────────────────────────────────────────────────────── +// Twisted Edwards elliptic curve arithmetic for BLS12-377 G1 +// +// BLS12-377 G1 in Short Weierstrass form: y² = x³ + 1 +// Maps birationally to Twisted Edwards: -x² + y² = 1 + d·x²y² (a = -1) +// +// Why Twisted Edwards for GPU MSM? +// 1. Unified addition formula: works for any two points (no special cases +// for P+P, P+O, P+(-P)), avoiding warp divergence +// 2. No inversions in projective coordinates +// 3. Compact mixed-add: 9M for accumulator(extended) + point(affine) +// +// Two point representations: +// +// G1EdExtended (192 bytes, accumulator): +// (X, Y, T, Z) where x = X/Z, y = Y/Z, T = X·Y/Z +// Identity: (0, R, 0, R) in Montgomery form +// Coordinates live in [0, 2p) during computation (lazy reduction). +// Must call ec_te_reduce() before exporting to host. +// +// G1EdXY (96 bytes, compact input): +// (x_te, y_te) affine TE coordinates only +// The mixed-add formula computes T = 2d·x·y on the fly (2 extra fp_mul) +// 33% less memory than precomputed (y-x, y+x, 2dxy) format (144 bytes) +// At large sizes (64M+ points), memory bandwidth dominates → 21-23% faster +// Coordinates in [0, p) (fully reduced, loaded from host). +// +// Lazy reduction strategy: +// All EC formulas use fp_mul_nr, fp_add_nr, fp_sub_nr internally. +// Coordinates stay in [0, 2p) across chained additions — see bound table in fp.cuh. +// This saves 9×12 = 108 instructions per EC add from skipped fp_mul reductions, +// plus 4×12 = 48 from skipped fp_add reductions. Total ~156 instr/add saved. +// +// Point addition cost: +// Mixed add (Extended += XY): 9M (2M for T_q, 3M for A/B, 1M for C, 3M for X3/Y3/T3/Z3) +// General add (Extended += Ext): 9M (1M+1M for C with 2d, 1M for D, rest same) +// Doubling (host only): 4S + 4M (dbl-2008-hwcd formula) +// ───────────────────────────────────────────────────────────────────────────── + +#include "fp.cuh" + +namespace gnark_gpu { + +// ============================================================================= +// Twisted Edwards types and constants +// ============================================================================= + +// 2d coefficient for the Twisted Edwards curve (Montgomery form, from gbotrel/zprize-mobile-harness) +__device__ __constant__ const uint64_t TE_D_COEFF_DOUBLE[6] = { + 0xf24b7e8444a706c6ULL, 0xeae0237580faa8faULL, 0x0f4d7cf27ef38fa5ULL, + 0x5597097dc5f2bb26ULL, 0x8bf6c1dd0d95a93eULL, 0x01784602fbff628aULL, +}; + +// Extended Twisted Edwards point: (X, Y, T, Z) — 192 bytes, accumulator +// Represents affine (X/Z, Y/Z) with T = X*Y/Z +// Identity: X=0, Y=R, T=0, Z=R (Montgomery one) +struct G1EdExtended { + uint64_t x[6]; + uint64_t y[6]; + uint64_t t[6]; + uint64_t z[6]; +}; + +// Compact Twisted Edwards MSM point format: 96 bytes per point +// Stores only (x_te, y_te). The mixed add formula computes T = 2d*x*y on the fly. +// 33% smaller than the precomputed (y-x, y+x, 2dxy) format at the cost of 2 extra fp_mul. +struct G1EdXY { + uint64_t x[6]; // x_te (affine TE x-coordinate) + uint64_t y[6]; // y_te (affine TE y-coordinate) +}; + +// Set TE extended point to identity: (0, R, 0, R) +__device__ __forceinline__ void ec_te_set_identity(G1EdExtended &p) { + fp_set_zero(p.x); + fp_set_one(p.y); + fp_set_zero(p.t); + fp_set_one(p.z); +} + +// Branchless conditional negate of TE XY point x-coordinate. +// If negate==true: p.x = p - p.x. If negate==false: p.x unchanged. +// Avoids warp divergence in MSM accumulate (signs are random ~50/50). +// Input p.x must be in [0, p) (loaded from device memory, fully reduced). +__device__ __forceinline__ void ec_te_cnegate_xy(G1EdXY &p, bool negate) { + const uint64_t *mod = FP_MODULUS; + uint64_t neg[6]; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(neg[0]) : "l"(mod[0]), "l"(p.x[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[1]) : "l"(mod[1]), "l"(p.x[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[2]) : "l"(mod[2]), "l"(p.x[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[3]) : "l"(mod[3]), "l"(p.x[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[4]) : "l"(mod[4]), "l"(p.x[4])); + asm volatile("subc.u64 %0, %1, %2;" : "=l"(neg[5]) : "l"(mod[5]), "l"(p.x[5])); + fp_ccopy(p.x, neg, negate); +} + +// Reduce all coordinates of a TE extended point from [0, 2p) to [0, p). +// Call before exporting to host (e.g., MSM window results for Horner combination). +__device__ __forceinline__ void ec_te_reduce(G1EdExtended &p) { + fp_reduce(p.x); + fp_reduce(p.y); + fp_reduce(p.t); + fp_reduce(p.z); +} + +// ============================================================================= +// Unified mixed addition: G1EdExtended += G1EdXY (9M, strongly unified) +// +// Uses lazy reduction: all fp_mul_nr, fp_add_nr, fp_sub_nr. +// Accumulator coordinates in [0, 2p), point coordinates in [0, p). +// Output coordinates in [0, 2p) — invariant maintained across chained adds. +// +// q is in compact XY form: (x_te, y_te). We compute T = 2d*xq*yq on the fly. +// +// Formula (EFD madd-2008-hwcd-2, adapted for a=-1, on-the-fly T): +// T_q = 2d * q.x * q.y [1M+2M] +// A = (Y1-X1) * (Y_q - X_q) [3M] A ∈ [0, 2p) +// B = (Y1+X1) * (Y_q + X_q) [4M] B ∈ [0, 2p) +// C = T1 * T_q [5M] C ∈ [0, 2p) +// D = 2 * Z1 D ∈ [0, 4p) +// E = B - A E ∈ [0, 4p) +// H = B + A H ∈ [0, 4p) +// F = D - C F ∈ [0, 6p) +// G = D + C G ∈ [0, 6p) +// X3 = E * F → [0, 2p) [6M] +// Y3 = G * H → [0, 2p) [7M] +// T3 = E * H → [0, 2p) [8M] +// Z3 = F * G → [0, 2p) [9M] +// ============================================================================= + +__device__ __forceinline__ void ec_te_unified_mixed_add_xy(G1EdExtended &p, const G1EdXY &q) { + // T_q = 2d * xq * yq (on the fly, q coords in [0, p)) + uint64_t T_q[6]; + fp_mul_nr(T_q, q.x, q.y); // [0, 2p) + fp_mul_nr(T_q, T_q, TE_D_COEFF_DOUBLE); // [0, 2p) + + // A = (Y1-X1) * (Yq - Xq) + uint64_t A[6], t1[6]; + fp_sub_nr(A, p.y, p.x); // p coords [0, 2p) → A ∈ [0, 4p) + fp_sub(t1, q.y, q.x); // q coords [0, p) → t1 ∈ [0, p) + fp_mul_nr(A, A, t1); // [0, 2p) + + // B = (Y1+X1) * (Yq + Xq) + uint64_t B[6]; + fp_add_nr(B, p.y, p.x); // [0, 4p) + fp_add_nr(t1, q.y, q.x); // [0, 2p) + fp_mul_nr(B, B, t1); // [0, 2p) + + // C = T1 * T_q + uint64_t C[6]; + fp_mul_nr(C, p.t, T_q); // [0, 2p) + + // D = 2 * Z1 + uint64_t D[6]; + fp_add_nr(D, p.z, p.z); // [0, 4p) + + // E = B - A, H = B + A (both [0, 2p) inputs) + uint64_t E[6], H[6]; + fp_sub_nr(E, B, A); // [0, 4p) + fp_add_nr(H, B, A); // [0, 4p) + + // F = D - C, G = D + C (D [0, 4p), C [0, 2p)) + uint64_t F[6], G[6]; + fp_sub_nr(F, D, C); // [0, 6p) + fp_add_nr(G, D, C); // [0, 6p) + + // Final products: all outputs ∈ [0, 2p) since max input 6p < R + fp_mul_nr(p.x, E, F); // [0, 2p) + fp_mul_nr(p.y, G, H); // [0, 2p) + fp_mul_nr(p.t, E, H); // [0, 2p) + fp_mul_nr(p.z, F, G); // [0, 2p) +} + +// ============================================================================= +// Unified general addition: G1EdExtended += G1EdExtended (9M, strongly unified) +// +// Both operands have coordinates in [0, 2p). Same lazy reduction strategy. +// +// Formula (EFD add-2008-hwcd, for a=-1): +// A = (Y1-X1) * (Y2-X2) [1M] A ∈ [0, 2p) +// B = (Y1+X1) * (Y2+X2) [2M] B ∈ [0, 2p) +// C = T1 * 2d * T2 [3M+4M] C ∈ [0, 2p) +// D = 2 * Z1 * Z2 [5M] D ∈ [0, 4p) +// E = B - A E ∈ [0, 4p) +// H = B + A H ∈ [0, 4p) +// F = D - C F ∈ [0, 6p) +// G = D + C G ∈ [0, 6p) +// X3 = E * F → [0, 2p) [6M] +// Y3 = G * H → [0, 2p) [7M] +// T3 = E * H → [0, 2p) [8M] +// Z3 = F * G → [0, 2p) [9M] +// ============================================================================= + +__device__ __forceinline__ void ec_te_unified_add(G1EdExtended &p, const G1EdExtended &q) { + // A = (Y1-X1) * (Y2-X2) + uint64_t A[6], t1[6]; + fp_sub_nr(A, p.y, p.x); // [0, 4p) + fp_sub_nr(t1, q.y, q.x); // [0, 4p) + fp_mul_nr(A, A, t1); // [0, 2p) + + // B = (Y1+X1) * (Y2+X2) + uint64_t B[6]; + fp_add_nr(B, p.y, p.x); // [0, 4p) + fp_add_nr(t1, q.y, q.x); // [0, 4p) + fp_mul_nr(B, B, t1); // [0, 2p) + + // C = T1 * T2 * 2d + uint64_t C[6]; + fp_mul_nr(C, p.t, q.t); // [0, 2p) + fp_mul_nr(C, C, TE_D_COEFF_DOUBLE); // [0, 2p) + + // D = 2 * Z1 * Z2 + uint64_t D[6]; + fp_mul_nr(D, p.z, q.z); // [0, 2p) + fp_add_nr(D, D, D); // [0, 4p) + + // E = B - A, H = B + A + uint64_t E[6], H[6]; + fp_sub_nr(E, B, A); // [0, 4p) + fp_add_nr(H, B, A); // [0, 4p) + + // F = D - C, G = D + C + uint64_t F[6], G[6]; + fp_sub_nr(F, D, C); // [0, 6p) + fp_add_nr(G, D, C); // [0, 6p) + + // Final products + fp_mul_nr(p.x, E, F); // [0, 2p) + fp_mul_nr(p.y, G, H); // [0, 2p) + fp_mul_nr(p.t, E, H); // [0, 2p) + fp_mul_nr(p.z, F, G); // [0, 2p) +} + +// ============================================================================= +// Precomputed Twisted Edwards mixed-add input format (G1EdYZD) +// +// Stores per-point precomputed (Y-X, Y+X, 2d·X·Y) — three Fp coords (144 B +// total, vs 96 B for compact G1EdXY). The mixed-add formula then drops the +// on-the-fly T_q = 2d·X·Y computation, saving 2 fp_mul per add (9M → 7M). +// +// Tradeoff: 50% larger point memory; but for compute-bound accumulate phases +// at moderate n (≲ 2²⁵ on Blackwell), the saved muls dominate the extra +// bandwidth. See WORKLOG.md for measurements. +// +// Coordinates are loaded from device memory in [0, p) (fully reduced by the +// host conversion). Output of mixed-add stays in [0, 2p) — same lazy +// reduction discipline as G1EdXY. +// ============================================================================= + +struct G1EdYZD { + uint64_t y_minus_x[6]; // (Y_te - X_te) mod p + uint64_t y_plus_x[6]; // (Y_te + X_te) mod p + uint64_t two_d_xy[6]; // (2d * X_te * Y_te) mod p +}; + +// Branchless conditional negate of a precomputed point. +// +// Negating a TE point (X, Y) → (-X, Y) corresponds in precomputed format to: +// y_minus_x ↔ y_plus_x (swap), two_d_xy → -two_d_xy (mod p). +// +// We swap by computing both candidate values (no-op or negate) and picking +// branchlessly with fp_ccopy. Saves warp divergence in MSM accumulate. +__device__ __forceinline__ void ec_te_cnegate_yzd(G1EdYZD &p, bool negate) { + // Snapshot y_minus_x and y_plus_x before swap. + uint64_t orig_minus[6]; +#pragma unroll + for(int i = 0; i < 6; i++) orig_minus[i] = p.y_minus_x[i]; + + // If negate: y_minus_x = y_plus_x_old; y_plus_x = y_minus_x_old. + fp_ccopy(p.y_minus_x, p.y_plus_x, negate); + fp_ccopy(p.y_plus_x, orig_minus, negate); + + // Negate two_d_xy: candidate = p - two_d_xy (mod p). + const uint64_t *mod = FP_MODULUS; + uint64_t neg[6]; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(neg[0]) : "l"(mod[0]), "l"(p.two_d_xy[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[1]) : "l"(mod[1]), "l"(p.two_d_xy[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[2]) : "l"(mod[2]), "l"(p.two_d_xy[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[3]) : "l"(mod[3]), "l"(p.two_d_xy[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(neg[4]) : "l"(mod[4]), "l"(p.two_d_xy[4])); + asm volatile("subc.u64 %0, %1, %2;" : "=l"(neg[5]) : "l"(mod[5]), "l"(p.two_d_xy[5])); + fp_ccopy(p.two_d_xy, neg, negate); +} + +// ============================================================================= +// Unified mixed addition with precomputed input: G1EdExtended += G1EdYZD (7M) +// +// Same lazy-reduction strategy as ec_te_unified_mixed_add_xy, two muls +// cheaper because T_q is precomputed. +// +// Formula (madd-2008-hwcd, a=-1, with precomputed q): +// A = (Y1-X1) * y_minus_x [1M] A ∈ [0, 2p) +// B = (Y1+X1) * y_plus_x [2M] B ∈ [0, 2p) +// C = T1 * two_d_xy [3M] C ∈ [0, 2p) +// D = 2 * Z1 D ∈ [0, 4p) +// E = B - A; H = B + A +// F = D - C; G = D + C +// X3 = E * F → [0, 2p) [4M] +// Y3 = G * H → [0, 2p) [5M] +// T3 = E * H → [0, 2p) [6M] +// Z3 = F * G → [0, 2p) [7M] +// ============================================================================= +__device__ __forceinline__ void ec_te_unified_mixed_add_yzd(G1EdExtended &p, const G1EdYZD &q) { + // A = (Y1 - X1) * (Y_q - X_q) + uint64_t A[6]; + fp_sub_nr(A, p.y, p.x); // [0, 4p) + fp_mul_nr(A, A, q.y_minus_x); // [0, 2p) + + // B = (Y1 + X1) * (Y_q + X_q) + uint64_t B[6]; + fp_add_nr(B, p.y, p.x); // [0, 4p) + fp_mul_nr(B, B, q.y_plus_x); // [0, 2p) + + // C = T1 * (2d * X_q * Y_q) + uint64_t C[6]; + fp_mul_nr(C, p.t, q.two_d_xy); // [0, 2p) + + // D = 2 * Z1 + uint64_t D[6]; + fp_add_nr(D, p.z, p.z); // [0, 4p) + + // E = B - A, H = B + A (both [0, 2p) inputs) + uint64_t E[6], H[6]; + fp_sub_nr(E, B, A); // [0, 4p) + fp_add_nr(H, B, A); // [0, 4p) + + // F = D - C, G = D + C (D [0, 4p), C [0, 2p)) + uint64_t F[6], G[6]; + fp_sub_nr(F, D, C); // [0, 6p) + fp_add_nr(G, D, C); // [0, 6p) + + // Final products: outputs ∈ [0, 2p) since max input 6p < R. + fp_mul_nr(p.x, E, F); // [0, 2p) + fp_mul_nr(p.y, G, H); // [0, 2p) + fp_mul_nr(p.t, E, H); // [0, 2p) + fp_mul_nr(p.z, F, G); // [0, 2p) +} + +// ============================================================================= +// Short Weierstrass G1 affine arithmetic for batched-affine MSM. +// +// BLS12-377 G1 in SW form: y² = x³ + 1 (a=0, b=1). +// +// We use SW affine for the batched bucket-accumulation phase (see msm.cu). +// Each pair-add (P0 + P1) costs 1S + 3M *given* a precomputed 1/(x1-x0). +// Across N pairs in a batch, Montgomery's trick amortizes a single inversion +// with 3N field multiplications, so per-pair effective cost is 1S + 6M. +// +// The compact format matches gnark-crypto's bls12377.G1Affine memory layout +// (12 limbs in Montgomery form). Identity is encoded as (0, 0) — distinct +// from any on-curve point since 0² = 0 ≠ 1 = 0³ + 1. +// +// Layout invariants: +// p.x, p.y are fully reduced (in [0, p)). Outputs of g1sw_pair_add likewise. +// ============================================================================= + +struct G1AffineSW { + uint64_t x[6]; + uint64_t y[6]; +}; + +__device__ __forceinline__ void g1sw_set_identity(G1AffineSW &p) { + fp_set_zero(p.x); + fp_set_zero(p.y); +} + +__device__ __forceinline__ bool g1sw_is_identity(const G1AffineSW &p) { + return fp_is_zero(p.x) && fp_is_zero(p.y); +} + +__device__ __forceinline__ void g1sw_neg(G1AffineSW &out, const G1AffineSW &p) { + fp_copy(out.x, p.x); + fp_negate(out.y, p.y); +} + +__device__ __forceinline__ void g1sw_cnegate(G1AffineSW &p, bool negate) { + uint64_t neg_y[6]; + fp_negate(neg_y, p.y); + fp_ccopy(p.y, neg_y, negate); +} + +// ============================================================================= +// SW affine point doubling (a=0, b=1). +// +// λ = (3·x²) / (2·y) +// x3 = λ² - 2x +// y3 = λ(x - x3) - y +// +// Caller passes the precomputed inv2y = 1/(2y). Cost: 1S + 3M (after λ). +// Used rarely in batched-affine since random-scalar MSM almost never sees +// repeated points; included for completeness. +// ============================================================================= +__device__ __forceinline__ void g1sw_double_with_inv2y( + G1AffineSW &out, const G1AffineSW &p, const uint64_t inv2y[6]) { + + uint64_t three_x_sq[6], x_sq[6], two_x_sq[6]; + fp_sqr(x_sq, p.x); + fp_add(two_x_sq, x_sq, x_sq); + fp_add(three_x_sq, two_x_sq, x_sq); + + uint64_t lambda[6]; + fp_mul(lambda, three_x_sq, inv2y); + + uint64_t lam_sq[6], two_x[6]; + fp_sqr(lam_sq, lambda); + fp_add(two_x, p.x, p.x); + + uint64_t x3[6]; + fp_sub(x3, lam_sq, two_x); + + uint64_t x_minus_x3[6], lam_dx[6], y3[6]; + fp_sub(x_minus_x3, p.x, x3); + fp_mul(lam_dx, lambda, x_minus_x3); + fp_sub(y3, lam_dx, p.y); + + fp_copy(out.x, x3); + fp_copy(out.y, y3); +} + +// ============================================================================= +// SW affine pair add given precomputed 1/(x1-x0). +// +// λ = (y1 - y0) · inv_dx +// x3 = λ² - x0 - x1 +// y3 = λ(x0 - x3) - y0 +// +// Cost: 1S + 3M (the λ multiply is one of the 3M). +// +// Special cases: +// - Either operand is identity: result = the non-identity operand. +// - x0 == x1, y0 == y1 (P + P): would need doubling; the precomputed +// inv_dx is undefined. Caller must detect ahead and dispatch to double. +// - x0 == x1, y0 != y1 (P + (-P)): result = identity. +// For random-scalar MSM with sorted-by-bucket pairs, neither degenerate case +// occurs (each pair is two distinct point indices contributing to the same +// bucket — almost surely different x's). +// ============================================================================= +__device__ __forceinline__ void g1sw_pair_add_with_inv_dx( + G1AffineSW &out, const G1AffineSW &p0, const G1AffineSW &p1, + const uint64_t inv_dx[6]) { + + // λ = (y1 - y0) · inv_dx + uint64_t dy[6]; + fp_sub(dy, p1.y, p0.y); + uint64_t lambda[6]; + fp_mul(lambda, dy, inv_dx); + + // x3 = λ² - x0 - x1 + uint64_t lam_sq[6], x3[6]; + fp_sqr(lam_sq, lambda); + uint64_t sum_x[6]; + fp_add(sum_x, p0.x, p1.x); + fp_sub(x3, lam_sq, sum_x); + + // y3 = λ(x0 - x3) - y0 + uint64_t x0_minus_x3[6], lam_dx[6], y3[6]; + fp_sub(x0_minus_x3, p0.x, x3); + fp_mul(lam_dx, lambda, x0_minus_x3); + fp_sub(y3, lam_dx, p0.y); + + fp_copy(out.x, x3); + fp_copy(out.y, y3); +} + +// ============================================================================= +// SW affine → TE extended conversion. +// +// Used at the boundary between the new batched-affine accumulator (output: +// per-bucket SW affine point) and the existing reduce phase (input: per-bucket +// G1EdExtended). Mirrors `convertToEdMSM` from g1_te.go but for a single point +// at a time (no batched inversion — the single fp_inv is amortized over the +// many adds whose sum produced this point). +// +// Mapping (gbotrel/zprize-mobile-harness): +// x_te = (x_sw + 1) / (y_sw · invSqrtMinusA) +// y_te = (x_sw + 1 - √3) / (x_sw + 1 + √3) +// +// Identity SW (0, 0) → identity TE extended (0, 1, 0, 1). +// ============================================================================= + +// 1/√(-a) where a = -2√3 + 3, in Montgomery form (matches teInvSqrtMinusA in g1_te.go). +__device__ __constant__ const uint64_t TE_INV_SQRT_MINUS_A[6] = { + 0x3b092ce1fd76a6bdULL, 0x925230d9bba32683ULL, + 0x872d5d2fe991a197ULL, 0x8367c527a82b2ab0ULL, + 0xe285bbb3ef662a15ULL, 0x0160527a9283e729ULL, +}; + +// √3 in Montgomery form (matches teSqrtThree in g1_te.go). +__device__ __constant__ const uint64_t TE_SQRT_THREE[6] = { + 0x3fabdfd08894e1e4ULL, 0xcbf921ddcc1f55aaULL, + 0xd17deff1460edc0cULL, 0xd394e81e7897028dULL, + 0xc29c995d0912681aULL, 0x01515e6caff9d568ULL, +}; + +__device__ __forceinline__ void g1sw_to_te_extended( + G1EdExtended &out, const G1AffineSW &p) { + + if(g1sw_is_identity(p)) { + ec_te_set_identity(out); + return; + } + + uint64_t one[6]; + fp_set_one(one); + + uint64_t x_plus_one[6]; + fp_add(x_plus_one, p.x, one); + + // Denominator 1: y_sw · invSqrtMinusA + // Denominator 2: x_sw + 1 + √3 + uint64_t d1[6], d2[6]; + fp_mul(d1, p.y, TE_INV_SQRT_MINUS_A); + fp_add(d2, x_plus_one, TE_SQRT_THREE); + + // Single batched inverse via the well-known: invert(d1*d2) once, recover + // inv_d1 = d2 * inv(d1*d2), inv_d2 = d1 * inv(d1*d2). + uint64_t prod[6], inv_prod[6]; + fp_mul(prod, d1, d2); + fp_inv(inv_prod, prod); + + uint64_t inv_d1[6], inv_d2[6]; + fp_mul(inv_d1, d2, inv_prod); + fp_mul(inv_d2, d1, inv_prod); + + // x_te = x_plus_one * inv_d1 ; y_te = (x_plus_one - √3) * inv_d2 + uint64_t x_te[6], y_te[6], x_minus_sqrt3[6]; + fp_mul(x_te, x_plus_one, inv_d1); + fp_sub(x_minus_sqrt3, x_plus_one, TE_SQRT_THREE); + fp_mul(y_te, x_minus_sqrt3, inv_d2); + + // Pack into extended TE: (X=x_te, Y=y_te, T=x_te*y_te, Z=1). + fp_copy(out.x, x_te); + fp_copy(out.y, y_te); + fp_mul(out.t, x_te, y_te); + fp_set_one(out.z); +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/field.cuh b/prover/gpu/cuda/src/plonk/field.cuh new file mode 100644 index 00000000000..982ea9505fc --- /dev/null +++ b/prover/gpu/cuda/src/plonk/field.cuh @@ -0,0 +1,283 @@ +#pragma once + +// ───────────────────────────────────────────────────────────────────────────── +// Field type definitions and SoA storage for BLS12-377 Fp and Fr +// +// Two field types are used throughout the GPU library: +// +// Fp (base field, 377 bits, 6 × 64-bit limbs): +// Used for elliptic curve point coordinates (x, y, t, z). +// Arithmetic in fp.cuh. +// +// Fr (scalar field, 253 bits, 4 × 64-bit limbs): +// Used for polynomial coefficients, NTT elements, and MSM scalars. +// Arithmetic in fr_arith.cuh. +// +// GPU memory layout — Structure of Arrays (SoA): +// +// AoS (Array of Structs) — how CPU stores field elements: +// [a₀[0] a₀[1] a₀[2] a₀[3]] [a₁[0] a₁[1] a₁[2] a₁[3]] ... +// └──── element 0 ────────┘ └──── element 1 ────────┘ +// +// SoA (Structure of Arrays) — how GPU stores for coalesced access: +// limb0: [a₀[0], a₁[0], a₂[0], ..., aₙ₋₁[0]] ← one 256-bit load +// limb1: [a₀[1], a₁[1], a₂[1], ..., aₙ₋₁[1]] per warp covers +// limb2: [a₀[2], a₁[2], a₂[2], ..., aₙ₋₁[2]] 32 consecutive +// limb3: [a₀[3], a₁[3], a₂[3], ..., aₙ₋₁[3]] elements +// +// When a warp of 32 threads accesses consecutive elements, SoA ensures +// each limb array is accessed contiguously → coalesced 256-byte transactions +// instead of strided access with 4× the memory transactions. +// +// Exception: MSM points use AoS (G1EdXY, 96 bytes per point) because the +// accumulate kernel accesses points by random index from radix sort output. +// SoA would require 2 separate random accesses and double TLB misses. +// ───────────────────────────────────────────────────────────────────────────── + +#include +#include +#include +#include + +#ifdef __CUDACC__ +#include +#else +#define __host__ +#define __device__ +#define __forceinline__ +#endif + +namespace gnark_gpu { + +// ============================================================================= +// Field parameters for BLS12-377 +// ============================================================================= + +// BLS12-377 base field Fp (377 bits, 6 limbs) +struct Fp_params { + static constexpr size_t LIMBS = 6; + static constexpr uint64_t MODULUS[6] = { + 0x8508c00000000001ULL, 0x170b5d4430000000ULL, 0x1ef3622fba094800ULL, + 0x1a22d9f300f5138fULL, 0xc63b05c06ca1493bULL, 0x01ae3a4617c510eaULL, + }; + static constexpr uint64_t INV = 0x8508bfffffffffffULL; // -p^{-1} mod 2^64 +}; + +// BLS12-377 scalar field Fr (253 bits, 4 limbs) +struct Fr_params { + static constexpr size_t LIMBS = 4; + static constexpr uint64_t MODULUS[4] = { + 0x0a11800000000001ULL, + 0x59aa76fed0000001ULL, + 0x60b44d1e5c37b001ULL, + 0x12ab655e9a2ca556ULL, + }; + static constexpr uint64_t INV = 0x0a117fffffffffffULL; + // Montgomery R = 2^256 mod q (i.e., "one" in Montgomery form) + static constexpr uint64_t ONE[4] = { + 0x7d1c7ffffffffff3ULL, + 0x7257f50f6ffffff2ULL, + 0x16d81575512c0feeULL, + 0x0d4bda322bbb9a9dULL, + }; +}; + +// ============================================================================= +// Field element (single element, for host-side or AoS usage) +// ============================================================================= + +template struct Field { + uint64_t limbs[Params::LIMBS]; + + __host__ __device__ constexpr Field() : limbs{} {} + + __host__ __device__ constexpr Field(uint64_t v) : limbs{} { limbs[0] = v; } + + __host__ __device__ bool operator==(const Field &other) const { + for(size_t i = 0; i < Params::LIMBS; ++i) { + if(limbs[i] != other.limbs[i]) return false; + } + return true; + } + + __host__ __device__ bool operator!=(const Field &other) const { return !(*this == other); } +}; + +using Fr = Field; +using Fp = Field; + +// ============================================================================= +// HostFieldVector: Host-side SoA storage (mirrors FieldVector layout) +// ============================================================================= + +template class HostFieldVector { + size_t count_ = 0; + std::array, Params::LIMBS> data_ = {}; + + public: + HostFieldVector() = default; + + explicit HostFieldVector(size_t n) : count_(n) { + for(size_t i = 0; i < Params::LIMBS; ++i) { + data_[i] = std::make_unique(n); + } + } + + size_t size() const { return count_; } + static constexpr size_t limbs() { return Params::LIMBS; } + + // Access limb array + uint64_t *limb(size_t i) { return data_[i].get(); } + const uint64_t *limb(size_t i) const { return data_[i].get(); } + + // Get raw pointers for copy operations + auto raw_ptrs() { + std::array ptrs; + for(size_t i = 0; i < Params::LIMBS; ++i) { + ptrs[i] = data_[i].get(); + } + return ptrs; + } + + auto raw_ptrs() const { + std::array ptrs; + for(size_t i = 0; i < Params::LIMBS; ++i) { + ptrs[i] = const_cast(data_[i].get()); + } + return ptrs; + } + + // Set element at index (from a Field) + void set(size_t idx, const Field &f) { + for(size_t i = 0; i < Params::LIMBS; ++i) { + data_[i][idx] = f.limbs[i]; + } + } + + // Get element at index (as a Field) + Field get(size_t idx) const { + Field f; + for(size_t i = 0; i < Params::LIMBS; ++i) { + f.limbs[i] = data_[i][idx]; + } + return f; + } +}; + +using HostFrVector = HostFieldVector; +using HostFpVector = HostFieldVector; + +// ============================================================================= +// CUDA-only: FieldVector and PTX intrinsics +// ============================================================================= + +#ifdef __CUDACC__ + +// ============================================================================= +// FieldVector: GPU-friendly SoA (Structure of Arrays) for field elements +// Stores N field elements as LIMBS separate arrays for coalesced memory access +// ============================================================================= + +template class FieldVector { + size_t count_ = 0; + std::array device_ = {}; + + public: + FieldVector() = default; + + explicit FieldVector(size_t n) : count_(n) { allocate(); } + + ~FieldVector() { free(); } + + // Non-copyable + FieldVector(const FieldVector &) = delete; + FieldVector &operator=(const FieldVector &) = delete; + + // Movable + FieldVector(FieldVector &&other) noexcept : count_(other.count_), device_(other.device_) { + other.count_ = 0; + other.device_ = {}; + } + + FieldVector &operator=(FieldVector &&other) noexcept { + if(this != &other) { + free(); + count_ = other.count_; + device_ = other.device_; + other.count_ = 0; + other.device_ = {}; + } + return *this; + } + + void resize(size_t n) { + if(n == count_) return; + free(); + count_ = n; + allocate(); + } + + // Copy from host arrays (one per limb) + void copy_host_to_device(const std::array &host) { + for(size_t i = 0; i < Params::LIMBS; ++i) { + cudaMemcpy(device_[i], host[i], count_ * sizeof(uint64_t), cudaMemcpyHostToDevice); + } + } + + // Copy to host arrays (one per limb) + void copy_device_to_host(const std::array &host) const { + for(size_t i = 0; i < Params::LIMBS; ++i) { + cudaMemcpy(host[i], device_[i], count_ * sizeof(uint64_t), cudaMemcpyDeviceToHost); + } + } + + // Accessors + size_t size() const { return count_; } + size_t bytes() const { return count_ * Params::LIMBS * sizeof(uint64_t); } + static constexpr size_t limbs() { return Params::LIMBS; } + + // Get device pointer for limb i + uint64_t *limb(size_t i) { return device_[i]; } + const uint64_t *limb(size_t i) const { return device_[i]; } + + // Get all device pointers (for kernel launches) + auto &device_ptrs() { return device_; } + const auto &device_ptrs() const { return device_; } + + private: + void allocate() { + for(size_t i = 0; i < Params::LIMBS; ++i) { + cudaMalloc(&device_[i], count_ * sizeof(uint64_t)); + } + } + + void free() { + for(size_t i = 0; i < Params::LIMBS; ++i) { + if(device_[i]) { + cudaFree(device_[i]); + device_[i] = nullptr; + } + } + } +}; + +using FrVector = FieldVector; +using FpVector = FieldVector; + +// ============================================================================= +// PTX intrinsics for Montgomery arithmetic +// ============================================================================= + +// PTX multiply-add with carry: lo = (a*b + c + carry_in) mod 2^64 +__device__ __forceinline__ void madc_u64(uint64_t &lo, uint64_t &carry, uint64_t a, uint64_t b, uint64_t c, + uint64_t carry_in) { + asm volatile("add.cc.u64 %0, %4, %5;\n\t" + "madc.lo.cc.u64 %0, %2, %3, %0;\n\t" + "madc.hi.u64 %1, %2, %3, 0;" + : "=&l"(lo), "=l"(carry) + : "l"(a), "l"(b), "l"(c), "l"(carry_in)); +} + +#endif // __CUDACC__ + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/fp.cuh b/prover/gpu/cuda/src/plonk/fp.cuh new file mode 100644 index 00000000000..f038f3714bc --- /dev/null +++ b/prover/gpu/cuda/src/plonk/fp.cuh @@ -0,0 +1,543 @@ +#pragma once + +// ───────────────────────────────────────────────────────────────────────────── +// BLS12-377 base field Fp arithmetic (377 bits, 6 × 64-bit limbs) +// +// All elements are in Montgomery form: ā = a · R mod p, where R = 2³⁸⁴. +// This avoids a modular reduction after every multiply: instead of computing +// (a · b) mod p directly, we compute ā · b̄ · R⁻¹ mod p which equals (a·b)·R mod p. +// +// Operations are branchless (no warp divergence on GPU): +// fp_add: PTX carry chain + conditional select (12 asm instructions) +// fp_sub: PTX borrow chain + masked correction (12 asm instructions) +// fp_mul: CIOS Montgomery multiply (__noinline__) (72 mul + 72 add) +// fp_sqr: Delegates to fp_mul(a, a) (same cost) +// +// p = 0x01ae3a4617c510ea_c63b05c06ca1493b_1a22d9f300f5138f_ +// 1ef3622fba094800_170b5d4430000000_8508c00000000001 +// = 258664426012969094010652733694893533536393512754914660539884262666720468348340822774968888139573360124440321458177 +// ───────────────────────────────────────────────────────────────────────────── + +#include "field.cuh" + +namespace gnark_gpu { + +// ============================================================================= +// Fp Montgomery constants (BLS12-377 base field, 377 bits, 6 limbs) +// ============================================================================= + +// Fp modulus as device constant (mirrors Fp_params::MODULUS) +__device__ __constant__ const uint64_t FP_MODULUS[6] = { + 0x8508c00000000001ULL, 0x170b5d4430000000ULL, 0x1ef3622fba094800ULL, + 0x1a22d9f300f5138fULL, 0xc63b05c06ca1493bULL, 0x01ae3a4617c510eaULL, +}; + +// 2p — used by fp_sub_nr for lazy-reduction subtraction correction. +// Since p ≈ 2^377 and container is 384 bits, 2p ≈ 2^378 fits comfortably. +__device__ __constant__ const uint64_t FP_MODULUS_2X[6] = { + 0x0a11800000000002ULL, 0x2e16ba8860000001ULL, 0x3de6c45f74129000ULL, + 0x3445b3e601ea271eULL, 0x8c760b80d9429276ULL, 0x035c748c2f8a21d5ULL, +}; + +// -p^{-1} mod 2^64 +__device__ __constant__ const uint64_t FP_INV = 0x8508bfffffffffffULL; + +// R = 2^384 mod p (Montgomery one) +__device__ __constant__ const uint64_t FP_R[6] = { + 0x02cdffffffffff68ULL, 0x51409f837fffffb1ULL, 0x9f7db3a98a7d3ff2ULL, + 0x7b4e97b76e7c6305ULL, 0x4cf495bf803c84e8ULL, 0x008d6661e2fdf49aULL, +}; + +// R^2 = 2^768 mod p (for converting to Montgomery form) +__device__ __constant__ const uint64_t FP_R_SQUARED[6] = { + 0xb786686c9400cd22ULL, 0x0329fcaab00431b1ULL, 0x22a5f11162d6b46dULL, + 0xbfdf7d03827dc3acULL, 0x837e92f041790bf9ULL, 0x006dfccb1e914b88ULL, +}; + +// ============================================================================= +// Fp arithmetic: add, sub, mul, sqr for 6-limb Montgomery form +// All operations are branchless to avoid warp divergence on GPU. +// ============================================================================= + +// fp_add: r = (a + b) mod p — branchless using PTX carry chains + conditional select +__device__ __forceinline__ void fp_add(uint64_t r[6], const uint64_t a[6], const uint64_t b[6]) { + const uint64_t *p = FP_MODULUS; + + uint64_t s[6], carry; + + // Add with carry chain + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(s[0]) : "l"(a[0]), "l"(b[0])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s[1]) : "l"(a[1]), "l"(b[1])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s[2]) : "l"(a[2]), "l"(b[2])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s[3]) : "l"(a[3]), "l"(b[3])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s[4]) : "l"(a[4]), "l"(b[4])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s[5]) : "l"(a[5]), "l"(b[5])); + asm volatile("addc.u64 %0, 0, 0;" : "=l"(carry)); + + // Subtract modulus + uint64_t t[6], borrow; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(t[0]) : "l"(s[0]), "l"(p[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[1]) : "l"(s[1]), "l"(p[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[2]) : "l"(s[2]), "l"(p[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[3]) : "l"(s[3]), "l"(p[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[4]) : "l"(s[4]), "l"(p[4])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[5]) : "l"(s[5]), "l"(p[5])); + asm volatile("subc.u64 %0, %1, 0;" : "=l"(borrow) : "l"(carry)); + + // Branchless select: if borrow, use unreduced sum; else use reduced + // borrow == 0 means s >= p, use t (reduced) + // borrow != 0 means s < p, use s (unreduced) + uint64_t mask = -(borrow != 0); // 0xFFF...F if borrow, 0 if no borrow + r[0] = (s[0] & mask) | (t[0] & ~mask); + r[1] = (s[1] & mask) | (t[1] & ~mask); + r[2] = (s[2] & mask) | (t[2] & ~mask); + r[3] = (s[3] & mask) | (t[3] & ~mask); + r[4] = (s[4] & mask) | (t[4] & ~mask); + r[5] = (s[5] & mask) | (t[5] & ~mask); +} + +// fp_sub: r = (a - b) mod p — branchless +__device__ __forceinline__ void fp_sub(uint64_t r[6], const uint64_t a[6], const uint64_t b[6]) { + const uint64_t *p = FP_MODULUS; + + uint64_t s[6], borrow; + + // Subtract with borrow chain + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(s[0]) : "l"(a[0]), "l"(b[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[1]) : "l"(a[1]), "l"(b[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[2]) : "l"(a[2]), "l"(b[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[3]) : "l"(a[3]), "l"(b[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[4]) : "l"(a[4]), "l"(b[4])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[5]) : "l"(a[5]), "l"(b[5])); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + // Branchless correction: mask = all-ones if borrow, zero if no borrow + // If borrow: add p back (s + p). If no borrow: keep s. + uint64_t mask = -(borrow != 0); // 0xFFF...F if underflow + uint64_t corr[6]; + corr[0] = p[0] & mask; + corr[1] = p[1] & mask; + corr[2] = p[2] & mask; + corr[3] = p[3] & mask; + corr[4] = p[4] & mask; + corr[5] = p[5] & mask; + + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(r[0]) : "l"(s[0]), "l"(corr[0])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[1]) : "l"(s[1]), "l"(corr[1])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[2]) : "l"(s[2]), "l"(corr[2])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[3]) : "l"(s[3]), "l"(corr[3])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[4]) : "l"(s[4]), "l"(corr[4])); + asm volatile("addc.u64 %0, %1, %2;" : "=l"(r[5]) : "l"(s[5]), "l"(corr[5])); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Lazy-reduction ("_nr" = no reduce) variants for EC formulas +// +// These skip the final conditional subtraction, keeping results in [0, 2p) +// instead of [0, p). This is safe because: +// +// 1. p ≈ 2^377 fits in 384-bit container with 7 bits headroom +// 2. 2p, 4p, 6p all fit in 6 × 64-bit limbs (max intermediate ≈ 2^380) +// 3. CIOS with inputs < R = 2^384 produces output < 2p (since 4p < R) +// +// Bound tracking through TE mixed-add formula (worst-case per variable): +// +// ┌──────────────────┬─────────────────────────────┬──────────────┐ +// │ Variable │ Expression │ Bound │ +// ├──────────────────┼─────────────────────────────┼──────────────┤ +// │ T_q │ mul_nr(mul_nr(x,y), 2d) │ [0, 2p) │ +// │ A = (Y-X)(Yq-Xq)│ mul_nr(sub_nr, sub) │ [0, 2p) │ +// │ B = (Y+X)(Yq+Xq)│ mul_nr(add_nr, add_nr) │ [0, 2p) │ +// │ C = T1 · T_q │ mul_nr(2p, 2p) │ [0, 2p) │ +// │ D = 2·Z1 │ add_nr(2p, 2p) │ [0, 4p) │ +// │ E = B - A │ sub_nr(2p, 2p) │ [0, 4p) │ +// │ H = B + A │ add_nr(2p, 2p) │ [0, 4p) │ +// │ F = D - C │ sub_nr(4p, 2p) │ [0, 6p) │ +// │ G = D + C │ add_nr(4p, 2p) │ [0, 6p) │ +// │ X3= E·F │ mul_nr(4p, 6p) → < 2p │ [0, 2p) │ +// │ Y3= G·H │ mul_nr(6p, 4p) → < 2p │ [0, 2p) │ +// │ T3= E·H │ mul_nr(4p, 4p) → < 2p │ [0, 2p) │ +// │ Z3= F·G │ mul_nr(6p, 6p) → < 2p │ [0, 2p) │ +// └──────────────────┴─────────────────────────────┴──────────────┘ +// +// All outputs ∈ [0, 2p): invariant is maintained across chained EC adds. +// +// Max mul input 6p ≈ 2^380 < R = 2^384 ✓ +// CIOS output bound: (6p·6p)/R + p = 36p²/R + p < 2p ✓ (since 36p < R) +// ═════════════════════════════════════════════════════════════════════════════ + +// fp_reduce: conditional subtract p. Brings [0, 2p) → [0, p). Branchless. +__device__ __forceinline__ void fp_reduce(uint64_t r[6]) { + const uint64_t *p = FP_MODULUS; + uint64_t s[6], borrow; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(s[0]) : "l"(r[0]), "l"(p[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[1]) : "l"(r[1]), "l"(p[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[2]) : "l"(r[2]), "l"(p[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[3]) : "l"(r[3]), "l"(p[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[4]) : "l"(r[4]), "l"(p[4])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[5]) : "l"(r[5]), "l"(p[5])); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + uint64_t mask = -(borrow != 0); + r[0] = (r[0] & mask) | (s[0] & ~mask); + r[1] = (r[1] & mask) | (s[1] & ~mask); + r[2] = (r[2] & mask) | (s[2] & ~mask); + r[3] = (r[3] & mask) | (s[3] & ~mask); + r[4] = (r[4] & mask) | (s[4] & ~mask); + r[5] = (r[5] & mask) | (s[5] & ~mask); +} + +// fp_add_nr: r = a + b (no modular reduction) +// Inputs must satisfy a + b < 2^384 (guaranteed when a,b < 6p ≈ 2^380). +// Saves 12 instructions vs fp_add by skipping the conditional subtract. +__device__ __forceinline__ void fp_add_nr(uint64_t r[6], const uint64_t a[6], const uint64_t b[6]) { + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(r[0]) : "l"(a[0]), "l"(b[0])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[1]) : "l"(a[1]), "l"(b[1])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[2]) : "l"(a[2]), "l"(b[2])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[3]) : "l"(a[3]), "l"(b[3])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[4]) : "l"(a[4]), "l"(b[4])); + asm volatile("addc.u64 %0, %1, %2;" : "=l"(r[5]) : "l"(a[5]), "l"(b[5])); +} + +// fp_sub_nr: r = (a - b), add 2p on borrow. Branchless. +// For unreduced inputs in [0, 2p): result in [0, 4p). +// For mixed inputs (one [0, 4p), one [0, 2p)): result in [0, 6p). +// Same instruction count as fp_sub, but uses 2p correction for wider inputs. +__device__ __forceinline__ void fp_sub_nr(uint64_t r[6], const uint64_t a[6], const uint64_t b[6]) { + const uint64_t *p2 = FP_MODULUS_2X; + + uint64_t s[6], borrow; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(s[0]) : "l"(a[0]), "l"(b[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[1]) : "l"(a[1]), "l"(b[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[2]) : "l"(a[2]), "l"(b[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[3]) : "l"(a[3]), "l"(b[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[4]) : "l"(a[4]), "l"(b[4])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[5]) : "l"(a[5]), "l"(b[5])); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + uint64_t mask = -(borrow != 0); + uint64_t corr[6]; + corr[0] = p2[0] & mask; + corr[1] = p2[1] & mask; + corr[2] = p2[2] & mask; + corr[3] = p2[3] & mask; + corr[4] = p2[4] & mask; + corr[5] = p2[5] & mask; + + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(r[0]) : "l"(s[0]), "l"(corr[0])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[1]) : "l"(s[1]), "l"(corr[1])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[2]) : "l"(s[2]), "l"(corr[2])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[3]) : "l"(s[3]), "l"(corr[3])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[4]) : "l"(s[4]), "l"(corr[4])); + asm volatile("addc.u64 %0, %1, %2;" : "=l"(r[5]) : "l"(s[5]), "l"(corr[5])); +} + +// ───────────────────────────────────────────────────────────────────────────── +// fp_mul: r = a · b mod p (even/odd split CIOS, 12×32-bit limbs) +// +// Uses the ARITH23 even/odd split technique (adapted from sppark/yrrid-msm): +// http://www.acsel-lab.com/arithmetic/arith23/data/1616a047.pdf +// +// Key idea: split the 12-limb accumulator into even[0,2,4,...] and odd[1,3,5,...] +// positions. Products a[even]*bi flow into the even array, a[odd]*bi into odd. +// Within each array, carries chain via PTX mad.lo.cc → madc.hi.cc (unbroken). +// After 12 iterations, merge even+odd for the final result. +// +// Each mad/madc pair is 2 native 32-bit instructions (1 cycle each, 128 ops/SM). +// vs old 64-bit CIOS: each mul64 decomposes to 4× mul32 + carry management. +// +// Register pressure: __noinline__ is CRITICAL for MSM performance. +// EC add functions must stay __forceinline__ (see ec.cuh header comment). +// ───────────────────────────────────────────────────────────────────────────── + +// ── 32-bit PTX intrinsics ─────────────────────────────────────────────────── + +__device__ __forceinline__ uint32_t ptx_mul_lo(uint32_t x, uint32_t y) { + uint32_t r; asm("mul.lo.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +__device__ __forceinline__ uint32_t ptx_mul_hi(uint32_t x, uint32_t y) { + uint32_t r; asm("mul.hi.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +__device__ __forceinline__ uint32_t ptx_mad_lo_cc(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +__device__ __forceinline__ uint32_t ptx_madc_lo_cc(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +__device__ __forceinline__ uint32_t ptx_madc_hi_cc(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +__device__ __forceinline__ uint32_t ptx_madc_hi(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +__device__ __forceinline__ uint32_t ptx_add_cc(uint32_t x, uint32_t y) { + uint32_t r; asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +__device__ __forceinline__ uint32_t ptx_addc_cc(uint32_t x, uint32_t y) { + uint32_t r; asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +__device__ __forceinline__ uint32_t ptx_addc(uint32_t x, uint32_t y) { + uint32_t r; asm volatile("addc.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} + +// ── Multiply helpers ──────────────────────────────────────────────────────── + +// Initial multiply: acc[i]=lo(a[i]*bi), acc[i+1]=hi(a[i]*bi) for even i +static __device__ __forceinline__ void fp32_mul_n(uint32_t *acc, const uint32_t *a, uint32_t bi) { + #pragma unroll + for (int i = 0; i < 12; i += 2) { + acc[i] = ptx_mul_lo(a[i], bi); + acc[i + 1] = ptx_mul_hi(a[i], bi); + } +} + +// Chained multiply-accumulate with unbroken carry chain (12 instructions) +static __device__ __forceinline__ void fp32_cmad_n(uint32_t *acc, const uint32_t *a, uint32_t bi) { + acc[0] = ptx_mad_lo_cc(a[0], bi, acc[0]); + acc[1] = ptx_madc_hi_cc(a[0], bi, acc[1]); + acc[2] = ptx_madc_lo_cc(a[2], bi, acc[2]); + acc[3] = ptx_madc_hi_cc(a[2], bi, acc[3]); + acc[4] = ptx_madc_lo_cc(a[4], bi, acc[4]); + acc[5] = ptx_madc_hi_cc(a[4], bi, acc[5]); + acc[6] = ptx_madc_lo_cc(a[6], bi, acc[6]); + acc[7] = ptx_madc_hi_cc(a[6], bi, acc[7]); + acc[8] = ptx_madc_lo_cc(a[8], bi, acc[8]); + acc[9] = ptx_madc_hi_cc(a[8], bi, acc[9]); + acc[10] = ptx_madc_lo_cc(a[10], bi, acc[10]); + acc[11] = ptx_madc_hi_cc(a[10], bi, acc[11]); +} + +// Right-shifted multiply-accumulate (consumes carry from previous op) +static __device__ __forceinline__ void fp32_madc_n_rshift(uint32_t *odd, const uint32_t *a, uint32_t bi) { + odd[0] = ptx_madc_lo_cc(a[0], bi, odd[2]); + odd[1] = ptx_madc_hi_cc(a[0], bi, odd[3]); + odd[2] = ptx_madc_lo_cc(a[2], bi, odd[4]); + odd[3] = ptx_madc_hi_cc(a[2], bi, odd[5]); + odd[4] = ptx_madc_lo_cc(a[4], bi, odd[6]); + odd[5] = ptx_madc_hi_cc(a[4], bi, odd[7]); + odd[6] = ptx_madc_lo_cc(a[6], bi, odd[8]); + odd[7] = ptx_madc_hi_cc(a[6], bi, odd[9]); + odd[8] = ptx_madc_lo_cc(a[8], bi, odd[10]); + odd[9] = ptx_madc_hi_cc(a[8], bi, odd[11]); + odd[10] = ptx_madc_lo_cc(a[10], bi, 0); + odd[11] = ptx_madc_hi(a[10], bi, 0); +} + +// One fused multiply + Montgomery reduction step +static __device__ __forceinline__ void fp32_mad_n_redc( + uint32_t *even, uint32_t *odd, const uint32_t *a, uint32_t bi, + const uint32_t *MOD, bool first) { + if (first) { + fp32_mul_n(odd, a + 1, bi); + fp32_mul_n(even, a, bi); + } else { + even[0] = ptx_add_cc(even[0], odd[1]); + fp32_madc_n_rshift(odd, a + 1, bi); + fp32_cmad_n(even, a, bi); + odd[11] = ptx_addc(odd[11], 0); + } + uint32_t mi = even[0] * 0xFFFFFFFFu; // -p⁻¹ mod 2³² (p ≡ 1 mod 2³²) + fp32_cmad_n(odd, MOD + 1, mi); + fp32_cmad_n(even, MOD, mi); + odd[11] = ptx_addc(odd[11], 0); +} + +// ── fp_mul and fp_mul_nr ──────────────────────────────────────────────────── + +static __device__ __noinline__ void fp_mul(uint64_t r[6], const uint64_t a[6], const uint64_t b[6]) { + const uint32_t *a32 = (const uint32_t *)a; + const uint32_t *b32 = (const uint32_t *)b; + const uint32_t *MOD = (const uint32_t *)FP_MODULUS; + + __align__(8) uint32_t even[12]; + __align__(8) uint32_t odd[12]; + + #pragma unroll + for (int i = 0; i < 12; i += 2) { + fp32_mad_n_redc(even, odd, a32, b32[i], MOD, i == 0); + fp32_mad_n_redc(odd, even, a32, b32[i + 1], MOD, false); + } + + // Merge even and odd arrays + even[0] = ptx_add_cc(even[0], odd[1]); + even[1] = ptx_addc_cc(even[1], odd[2]); + even[2] = ptx_addc_cc(even[2], odd[3]); + even[3] = ptx_addc_cc(even[3], odd[4]); + even[4] = ptx_addc_cc(even[4], odd[5]); + even[5] = ptx_addc_cc(even[5], odd[6]); + even[6] = ptx_addc_cc(even[6], odd[7]); + even[7] = ptx_addc_cc(even[7], odd[8]); + even[8] = ptx_addc_cc(even[8], odd[9]); + even[9] = ptx_addc_cc(even[9], odd[10]); + even[10] = ptx_addc_cc(even[10], odd[11]); + even[11] = ptx_addc(even[11], 0); + + // Final reduction: branchless conditional subtract p + const uint64_t *e64 = (const uint64_t *)even; + const uint64_t *q = FP_MODULUS; + uint64_t s[6], borrow; + + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(s[0]) : "l"(e64[0]), "l"(q[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[1]) : "l"(e64[1]), "l"(q[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[2]) : "l"(e64[2]), "l"(q[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[3]) : "l"(e64[3]), "l"(q[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[4]) : "l"(e64[4]), "l"(q[4])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[5]) : "l"(e64[5]), "l"(q[5])); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + uint64_t mask = -(borrow != 0); + r[0] = (e64[0] & mask) | (s[0] & ~mask); + r[1] = (e64[1] & mask) | (s[1] & ~mask); + r[2] = (e64[2] & mask) | (s[2] & ~mask); + r[3] = (e64[3] & mask) | (s[3] & ~mask); + r[4] = (e64[4] & mask) | (s[4] & ~mask); + r[5] = (e64[5] & mask) | (s[5] & ~mask); +} + +// fp_mul_nr: result in [0, 2p), no final reduction +static __device__ __noinline__ void fp_mul_nr(uint64_t r[6], const uint64_t a[6], const uint64_t b[6]) { + const uint32_t *a32 = (const uint32_t *)a; + const uint32_t *b32 = (const uint32_t *)b; + const uint32_t *MOD = (const uint32_t *)FP_MODULUS; + + __align__(8) uint32_t even[12]; + __align__(8) uint32_t odd[12]; + + #pragma unroll + for (int i = 0; i < 12; i += 2) { + fp32_mad_n_redc(even, odd, a32, b32[i], MOD, i == 0); + fp32_mad_n_redc(odd, even, a32, b32[i + 1], MOD, false); + } + + // Merge even and odd arrays + even[0] = ptx_add_cc(even[0], odd[1]); + even[1] = ptx_addc_cc(even[1], odd[2]); + even[2] = ptx_addc_cc(even[2], odd[3]); + even[3] = ptx_addc_cc(even[3], odd[4]); + even[4] = ptx_addc_cc(even[4], odd[5]); + even[5] = ptx_addc_cc(even[5], odd[6]); + even[6] = ptx_addc_cc(even[6], odd[7]); + even[7] = ptx_addc_cc(even[7], odd[8]); + even[8] = ptx_addc_cc(even[8], odd[9]); + even[9] = ptx_addc_cc(even[9], odd[10]); + even[10] = ptx_addc_cc(even[10], odd[11]); + even[11] = ptx_addc(even[11], 0); + + // No final reduction — copy to output + uint32_t *r32 = (uint32_t *)r; + #pragma unroll + for (int i = 0; i < 12; i++) r32[i] = even[i]; +} + +// fp_sqr: r = a^2 mod p (uses mul for simplicity; same correctness) +__device__ __forceinline__ void fp_sqr(uint64_t r[6], const uint64_t a[6]) { + fp_mul(r, a, a); +} + +// fp_is_zero: check if all limbs are zero +__device__ __forceinline__ bool fp_is_zero(const uint64_t a[6]) { + return (a[0] | a[1] | a[2] | a[3] | a[4] | a[5]) == 0; +} + +// fp_eq: check if a == b +__device__ __forceinline__ bool fp_eq(const uint64_t a[6], const uint64_t b[6]) { + return ((a[0] ^ b[0]) | (a[1] ^ b[1]) | (a[2] ^ b[2]) | + (a[3] ^ b[3]) | (a[4] ^ b[4]) | (a[5] ^ b[5])) == 0; +} + +// fp_copy: r = a +__device__ __forceinline__ void fp_copy(uint64_t r[6], const uint64_t a[6]) { + r[0] = a[0]; r[1] = a[1]; r[2] = a[2]; + r[3] = a[3]; r[4] = a[4]; r[5] = a[5]; +} + +// fp_set_zero: r = 0 +__device__ __forceinline__ void fp_set_zero(uint64_t r[6]) { + r[0] = 0; r[1] = 0; r[2] = 0; + r[3] = 0; r[4] = 0; r[5] = 0; +} + +// fp_set_one: r = R (Montgomery form of 1) +__device__ __forceinline__ void fp_set_one(uint64_t r[6]) { + r[0] = FP_R[0]; r[1] = FP_R[1]; r[2] = FP_R[2]; + r[3] = FP_R[3]; r[4] = FP_R[4]; r[5] = FP_R[5]; +} + +// fp_conditional_copy: r = condition ? src : r (branchless) +__device__ __forceinline__ void fp_ccopy(uint64_t r[6], const uint64_t src[6], bool condition) { + uint64_t mask = -(uint64_t)condition; // all 1s if true, all 0s if false + r[0] = (src[0] & mask) | (r[0] & ~mask); + r[1] = (src[1] & mask) | (r[1] & ~mask); + r[2] = (src[2] & mask) | (r[2] & ~mask); + r[3] = (src[3] & mask) | (r[3] & ~mask); + r[4] = (src[4] & mask) | (r[4] & ~mask); + r[5] = (src[5] & mask) | (r[5] & ~mask); +} + +// fp_inv: r = a^(p-2) mod p (Fermat's little theorem inversion). +// +// Uses square-and-multiply over the 377-bit exponent p-2. Cost on the order of +// 377 fp_sqr + popcount(p-2) fp_mul ≈ 565 fp_mul calls (one inversion takes +// roughly the cost of 565 multiplications). +// +// Use sparingly: in batched-inversion contexts call this only on the single +// global product. For block-local batched invert we still pay this cost once +// per block per wave; the fp_mul calls in the prefix-product/back-scan +// dominate over many waves. +__device__ __noinline__ void fp_inv(uint64_t r[6], const uint64_t a[6]) { + // p - 2, little-endian limbs (BLS12-377 base field). + // p[0] = 0x8508c00000000001 → p[0]-2 = 0x8508bfffffffffff (no borrow propagation). + static constexpr uint64_t P_MINUS_2[6] = { + 0x8508bfffffffffffULL, 0x170b5d4430000000ULL, + 0x1ef3622fba094800ULL, 0x1a22d9f300f5138fULL, + 0xc63b05c06ca1493bULL, 0x01ae3a4617c510eaULL, + }; + + uint64_t result[6]; + fp_set_one(result); + + // Square-and-multiply, MSB first. The leading zeros above bit 376 cost a few + // no-op squarings of 1 (negligible). + #pragma unroll 1 + for(int limb = 5; limb >= 0; limb--) { + uint64_t e = P_MINUS_2[limb]; + #pragma unroll 1 + for(int bit = 63; bit >= 0; bit--) { + uint64_t sq[6]; + fp_sqr(sq, result); + fp_copy(result, sq); + if((e >> bit) & 1ULL) { + uint64_t prod[6]; + fp_mul(prod, result, a); + fp_copy(result, prod); + } + } + } + fp_copy(r, result); +} + +// fp_negate: r = -a mod p (branchless: 0 stays 0) +__device__ __forceinline__ void fp_negate(uint64_t r[6], const uint64_t a[6]) { + const uint64_t *p = FP_MODULUS; + bool is_zero = fp_is_zero(a); + + // Compute p - a + uint64_t t[6]; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(t[0]) : "l"(p[0]), "l"(a[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[1]) : "l"(p[1]), "l"(a[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[2]) : "l"(p[2]), "l"(a[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[3]) : "l"(p[3]), "l"(a[3])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t[4]) : "l"(p[4]), "l"(a[4])); + asm volatile("subc.u64 %0, %1, %2;" : "=l"(t[5]) : "l"(p[5]), "l"(a[5])); + + // If a was zero, result should be zero (not p) + uint64_t zero_mask = -(uint64_t)is_zero; + r[0] = t[0] & ~zero_mask; + r[1] = t[1] & ~zero_mask; + r[2] = t[2] & ~zero_mask; + r[3] = t[3] & ~zero_mask; + r[4] = t[4] & ~zero_mask; + r[5] = t[5] & ~zero_mask; +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/fr_arith.cuh b/prover/gpu/cuda/src/plonk/fr_arith.cuh new file mode 100644 index 00000000000..60472ca1b24 --- /dev/null +++ b/prover/gpu/cuda/src/plonk/fr_arith.cuh @@ -0,0 +1,225 @@ +#pragma once + +// ───────────────────────────────────────────────────────────────────────────── +// BLS12-377 scalar field Fr arithmetic (253 bits, 4 × 64-bit limbs) +// +// All elements are in Montgomery form: ā = a · R mod q, where R = 2²⁵⁶. +// +// q = 0x12ab655e9a2ca556_60b44d1e5c37b001_59aa76fed0000001_0a11800000000001 +// = 8444461749428370424248824938781546531375899335154063827935233455917409239041 +// +// All functions are __forceinline__: NTT butterfly kernels use only ~20 regs/thread +// (1 fr_mul + 2 fr_add/sub per butterfly), unlike MSM's EC ops which inline 7-9 +// fp_mul calls and blow up to 166+ regs. No register pressure concern here. +// +// Operations: +// fr_add: (a + b) mod q PTX carry chain + conditional subtract (8 asm) +// fr_sub: (a - b) mod q PTX borrow chain + conditional add-back (8 asm) +// fr_mul: a · b · R⁻¹ mod q CIOS Montgomery multiply (32 mul + 32 add) +// ───────────────────────────────────────────────────────────────────────────── + +#include "field.cuh" +#include + +namespace gnark_gpu { + +// Fr modulus as device constant (mirrors Fr_params::MODULUS) +__device__ __constant__ const uint64_t FR_MODULUS[4] = { + 0x0a11800000000001ULL, + 0x59aa76fed0000001ULL, + 0x60b44d1e5c37b001ULL, + 0x12ab655e9a2ca556ULL, +}; + +// ============================================================================= +// Fr modular addition: r = (a + b) mod q +// ============================================================================= + +__device__ __forceinline__ void fr_add(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]) { + constexpr uint64_t q0 = Fr_params::MODULUS[0], q1 = Fr_params::MODULUS[1]; + constexpr uint64_t q2 = Fr_params::MODULUS[2], q3 = Fr_params::MODULUS[3]; + + uint64_t s0, s1, s2, s3, carry; + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(s0) : "l"(a[0]), "l"(b[0])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s1) : "l"(a[1]), "l"(b[1])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s2) : "l"(a[2]), "l"(b[2])); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(s3) : "l"(a[3]), "l"(b[3])); + asm volatile("addc.u64 %0, 0, 0;" : "=l"(carry)); + + uint64_t t0, t1, t2, t3, borrow; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(t0) : "l"(s0), "l"(q0)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t1) : "l"(s1), "l"(q1)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t2) : "l"(s2), "l"(q2)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t3) : "l"(s3), "l"(q3)); + asm volatile("subc.u64 %0, %1, 0;" : "=l"(borrow) : "l"(carry)); + + bool use_reduced = (borrow == 0); + r[0] = use_reduced ? t0 : s0; + r[1] = use_reduced ? t1 : s1; + r[2] = use_reduced ? t2 : s2; + r[3] = use_reduced ? t3 : s3; +} + +// ============================================================================= +// Fr modular subtraction: r = (a - b) mod q +// ============================================================================= + +__device__ __forceinline__ void fr_sub(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]) { + constexpr uint64_t q0 = Fr_params::MODULUS[0], q1 = Fr_params::MODULUS[1]; + constexpr uint64_t q2 = Fr_params::MODULUS[2], q3 = Fr_params::MODULUS[3]; + + uint64_t s0, s1, s2, s3, borrow; + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(s0) : "l"(a[0]), "l"(b[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s1) : "l"(a[1]), "l"(b[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s2) : "l"(a[2]), "l"(b[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s3) : "l"(a[3]), "l"(b[3])); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + // Branchless correction: add q if borrow (same pattern as fp_sub) + uint64_t mask = -(borrow != 0); + uint64_t c0 = q0 & mask, c1 = q1 & mask, c2 = q2 & mask, c3 = q3 & mask; + + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(r[0]) : "l"(s0), "l"(c0)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[1]) : "l"(s1), "l"(c1)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r[2]) : "l"(s2), "l"(c2)); + asm volatile("addc.u64 %0, %1, %2;" : "=l"(r[3]) : "l"(s3), "l"(c3)); +} + +// ============================================================================= +// Fr CIOS Montgomery multiplication: r = a · b · R⁻¹ mod q +// +// Even/odd split CIOS with 8×32-bit limbs (ARITH23 technique, from sppark/yrrid). +// Splits accumulator into even[0,2,4,6] and odd[1,3,5,7] for unbroken PTX +// carry chains. Each chain is 8 instructions (4 limb-pairs). +// +// q mod 2^32 = 1, so -q^{-1} mod 2^32 = 0xFFFFFFFF. +// ============================================================================= + +// ── 32-bit PTX intrinsics for Fr ───────────────────────────────────────────── + +static __device__ __forceinline__ uint32_t fr_ptx_mul_lo(uint32_t x, uint32_t y) { + uint32_t r; asm("mul.lo.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_mul_hi(uint32_t x, uint32_t y) { + uint32_t r; asm("mul.hi.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_mad_lo_cc(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("mad.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_madc_lo_cc(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("madc.lo.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_madc_hi_cc(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("madc.hi.cc.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_madc_hi(uint32_t x, uint32_t y, uint32_t z) { + uint32_t r; asm volatile("madc.hi.u32 %0, %1, %2, %3;" : "=r"(r) : "r"(x), "r"(y), "r"(z)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_add_cc(uint32_t x, uint32_t y) { + uint32_t r; asm volatile("add.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_addc_cc(uint32_t x, uint32_t y) { + uint32_t r; asm volatile("addc.cc.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} +static __device__ __forceinline__ uint32_t fr_ptx_addc(uint32_t x, uint32_t y) { + uint32_t r; asm volatile("addc.u32 %0, %1, %2;" : "=r"(r) : "r"(x), "r"(y)); return r; +} + +// ── Fr multiply helpers (8×32-bit) ─────────────────────────────────────────── + +// Initial multiply: acc[i]=lo(a[i]*bi), acc[i+1]=hi(a[i]*bi) for even i +static __device__ __forceinline__ void fr32_mul_n(uint32_t *acc, const uint32_t *a, uint32_t bi) { + #pragma unroll + for (int i = 0; i < 8; i += 2) { + acc[i] = fr_ptx_mul_lo(a[i], bi); + acc[i + 1] = fr_ptx_mul_hi(a[i], bi); + } +} + +// Chained multiply-accumulate with unbroken carry chain (8 instructions) +static __device__ __forceinline__ void fr32_cmad_n(uint32_t *acc, const uint32_t *a, uint32_t bi) { + acc[0] = fr_ptx_mad_lo_cc(a[0], bi, acc[0]); + acc[1] = fr_ptx_madc_hi_cc(a[0], bi, acc[1]); + acc[2] = fr_ptx_madc_lo_cc(a[2], bi, acc[2]); + acc[3] = fr_ptx_madc_hi_cc(a[2], bi, acc[3]); + acc[4] = fr_ptx_madc_lo_cc(a[4], bi, acc[4]); + acc[5] = fr_ptx_madc_hi_cc(a[4], bi, acc[5]); + acc[6] = fr_ptx_madc_lo_cc(a[6], bi, acc[6]); + acc[7] = fr_ptx_madc_hi_cc(a[6], bi, acc[7]); +} + +// Right-shifted multiply-accumulate (consumes carry from previous op) +static __device__ __forceinline__ void fr32_madc_n_rshift(uint32_t *odd, const uint32_t *a, uint32_t bi) { + odd[0] = fr_ptx_madc_lo_cc(a[0], bi, odd[2]); + odd[1] = fr_ptx_madc_hi_cc(a[0], bi, odd[3]); + odd[2] = fr_ptx_madc_lo_cc(a[2], bi, odd[4]); + odd[3] = fr_ptx_madc_hi_cc(a[2], bi, odd[5]); + odd[4] = fr_ptx_madc_lo_cc(a[4], bi, odd[6]); + odd[5] = fr_ptx_madc_hi_cc(a[4], bi, odd[7]); + odd[6] = fr_ptx_madc_lo_cc(a[6], bi, 0); + odd[7] = fr_ptx_madc_hi(a[6], bi, 0); +} + +// One fused multiply + Montgomery reduction step +static __device__ __forceinline__ void fr32_mad_n_redc( + uint32_t *even, uint32_t *odd, const uint32_t *a, uint32_t bi, + const uint32_t *MOD, bool first) { + if (first) { + fr32_mul_n(odd, a + 1, bi); + fr32_mul_n(even, a, bi); + } else { + even[0] = fr_ptx_add_cc(even[0], odd[1]); + fr32_madc_n_rshift(odd, a + 1, bi); + fr32_cmad_n(even, a, bi); + odd[7] = fr_ptx_addc(odd[7], 0); + } + uint32_t mi = even[0] * 0xFFFFFFFFu; // -q⁻¹ mod 2³² (q ≡ 1 mod 2³²) + fr32_cmad_n(odd, MOD + 1, mi); + fr32_cmad_n(even, MOD, mi); + odd[7] = fr_ptx_addc(odd[7], 0); +} + +// Fr Montgomery multiplication using 8×32-bit even/odd split CIOS +__device__ __forceinline__ void fr_mul(uint64_t r[4], const uint64_t a[4], const uint64_t b[4]) { + const uint32_t *a32 = (const uint32_t *)a; + const uint32_t *b32 = (const uint32_t *)b; + const uint32_t *MOD = (const uint32_t *)FR_MODULUS; + + __align__(8) uint32_t even[8]; + __align__(8) uint32_t odd[8]; + + #pragma unroll + for (int i = 0; i < 8; i += 2) { + fr32_mad_n_redc(even, odd, a32, b32[i], MOD, i == 0); + fr32_mad_n_redc(odd, even, a32, b32[i + 1], MOD, false); + } + + // Merge even and odd arrays + even[0] = fr_ptx_add_cc(even[0], odd[1]); + even[1] = fr_ptx_addc_cc(even[1], odd[2]); + even[2] = fr_ptx_addc_cc(even[2], odd[3]); + even[3] = fr_ptx_addc_cc(even[3], odd[4]); + even[4] = fr_ptx_addc_cc(even[4], odd[5]); + even[5] = fr_ptx_addc_cc(even[5], odd[6]); + even[6] = fr_ptx_addc_cc(even[6], odd[7]); + even[7] = fr_ptx_addc(even[7], 0); + + // Final reduction: branchless conditional subtract q + const uint64_t *e64 = (const uint64_t *)even; + const uint64_t *q = FR_MODULUS; + uint64_t s[4], borrow; + + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(s[0]) : "l"(e64[0]), "l"(q[0])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[1]) : "l"(e64[1]), "l"(q[1])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[2]) : "l"(e64[2]), "l"(q[2])); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(s[3]) : "l"(e64[3]), "l"(q[3])); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + uint64_t mask = -(borrow != 0); + r[0] = (e64[0] & mask) | (s[0] & ~mask); + r[1] = (e64[1] & mask) | (s[1] & ~mask); + r[2] = (e64[2] & mask) | (s[2] & ~mask); + r[3] = (e64[3] & mask) | (s[3] & ~mask); +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/fr_ops.cu b/prover/gpu/cuda/src/plonk/fr_ops.cu new file mode 100644 index 00000000000..9a15c960d21 --- /dev/null +++ b/prover/gpu/cuda/src/plonk/fr_ops.cu @@ -0,0 +1,1395 @@ +// ============================================================================= +// Extra Fr kernels used by PlonK prover hot paths. +// +// Implemented ops: +// - ScaleByPowers: v[i] *= g^i +// - ScalarMul: v[i] *= c +// - AddMul: v[i] += a[i] * b[i] +// - AddScalarMul: v[i] += a[i] * scalar +// - BatchInvert: v[i] <- 1/v[i] (parallel prefix-product method) +// +// Batch inversion strategy: +// +// Given x[0..n-1], compute inv(x[i]) with 1 inversion + O(n) muls. +// +// prefix[i] = x[0] * ... * x[i] +// inv_all = 1 / prefix[n-1] +// backward pass recovers each inv(x[i]). +// +// GPU implementation is chunked to keep work parallel and memory-friendly. +// ============================================================================= + +#include "fr_arith.cuh" +#include + +namespace gnark_gpu { + +// ============================================================================= +// ScaleByPowers: v[i] *= g^i +// Each block computes a chunk of consecutive elements. +// Thread 0 computes g^block_start and a small table {g^(2^k)} in shared memory. +// Threads reconstruct g^threadIdx from that table, then process several +// coalesced elements separated by blockDim.x using a g^blockDim stride. +// ============================================================================= + +__global__ void scale_by_powers_kernel(uint64_t *__restrict__ v0, + uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, + uint64_t *__restrict__ v3, + const uint64_t g0, const uint64_t g1, + const uint64_t g2, const uint64_t g3, + size_t n) { + constexpr unsigned ITEMS_PER_THREAD = 4; + size_t block_start = (size_t)blockIdx.x * blockDim.x * ITEMS_PER_THREAD; + size_t idx = block_start + threadIdx.x; + + __shared__ uint64_t sh_power[4]; // g^block_start + __shared__ uint64_t sh_pow2[9][4]; // g^(2^k), k in [0,8]; k=8 is g^256 + + if (threadIdx.x == 0) { + // Precompute g^(2^k) once per block. + uint64_t pow2[4] = {g0, g1, g2, g3}; + #pragma unroll + for (int k = 0; k < 9; k++) { + sh_pow2[k][0] = pow2[0]; + sh_pow2[k][1] = pow2[1]; + sh_pow2[k][2] = pow2[2]; + sh_pow2[k][3] = pow2[3]; + uint64_t sq[4]; + fr_mul(sq, pow2, pow2); + pow2[0] = sq[0]; pow2[1] = sq[1]; + pow2[2] = sq[2]; pow2[3] = sq[3]; + } + + // Compute g^block_start via repeated squaring. + uint64_t base[4] = {g0, g1, g2, g3}; + uint64_t result[4] = { + Fr_params::ONE[0], Fr_params::ONE[1], + Fr_params::ONE[2], Fr_params::ONE[3] + }; + size_t exp = block_start; + while (exp > 0) { + if (exp & 1) { + uint64_t tmp[4]; + fr_mul(tmp, result, base); + result[0] = tmp[0]; result[1] = tmp[1]; + result[2] = tmp[2]; result[3] = tmp[3]; + } + uint64_t tmp[4]; + fr_mul(tmp, base, base); + base[0] = tmp[0]; base[1] = tmp[1]; + base[2] = tmp[2]; base[3] = tmp[3]; + exp >>= 1; + } + sh_power[0] = result[0]; sh_power[1] = result[1]; + sh_power[2] = result[2]; sh_power[3] = result[3]; + } + __syncthreads(); + + if (idx >= n) return; + + // Reconstruct g^threadIdx from the shared binary-power table. + uint64_t my_power[4] = { + Fr_params::ONE[0], Fr_params::ONE[1], + Fr_params::ONE[2], Fr_params::ONE[3] + }; + unsigned t = threadIdx.x; + #pragma unroll + for (int bit = 0; bit < 8; bit++) { + if ((t >> bit) & 1u) { + uint64_t pow2[4] = { + sh_pow2[bit][0], sh_pow2[bit][1], + sh_pow2[bit][2], sh_pow2[bit][3], + }; + uint64_t tmp[4]; + fr_mul(tmp, my_power, pow2); + my_power[0] = tmp[0]; my_power[1] = tmp[1]; + my_power[2] = tmp[2]; my_power[3] = tmp[3]; + } + } + + uint64_t power[4]; + uint64_t block_pow[4] = {sh_power[0], sh_power[1], sh_power[2], sh_power[3]}; + fr_mul(power, block_pow, my_power); + + uint64_t stride[4] = { + sh_pow2[8][0], sh_pow2[8][1], sh_pow2[8][2], sh_pow2[8][3], + }; + + #pragma unroll + for (unsigned item = 0; item < ITEMS_PER_THREAD; item++) { + size_t cur = idx + (size_t)item * blockDim.x; + if (cur < n) { + uint64_t val[4] = {v0[cur], v1[cur], v2[cur], v3[cur]}; + uint64_t result[4]; + fr_mul(result, val, power); + v0[cur] = result[0]; + v1[cur] = result[1]; + v2[cur] = result[2]; + v3[cur] = result[3]; + } + if constexpr (ITEMS_PER_THREAD > 1) { + if (item + 1 < ITEMS_PER_THREAD) { + uint64_t next[4]; + fr_mul(next, power, stride); + power[0] = next[0]; power[1] = next[1]; + power[2] = next[2]; power[3] = next[3]; + } + } + } +} + +void launch_scale_by_powers(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t g[4], size_t n, cudaStream_t stream) { + constexpr unsigned threads = 256; + constexpr unsigned items_per_thread = 4; + unsigned blocks = (n + threads * items_per_thread - 1) / (threads * items_per_thread); + scale_by_powers_kernel<<>>( + v0, v1, v2, v3, g[0], g[1], g[2], g[3], n); +} + +// ============================================================================= +// ScalarMul: v[i] *= c for all i +// ============================================================================= + +__global__ void scalar_mul_kernel(uint64_t *__restrict__ v0, + uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, + uint64_t *__restrict__ v3, + const uint64_t c0, const uint64_t c1, + const uint64_t c2, const uint64_t c3, + size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + uint64_t val[4] = {v0[idx], v1[idx], v2[idx], v3[idx]}; + uint64_t c[4] = {c0, c1, c2, c3}; + uint64_t result[4]; + fr_mul(result, val, c); + v0[idx] = result[0]; + v1[idx] = result[1]; + v2[idx] = result[2]; + v3[idx] = result[3]; +} + +void launch_scalar_mul(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t c[4], size_t n, cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + scalar_mul_kernel<<>>( + v0, v1, v2, v3, c[0], c[1], c[2], c[3], n); +} + +// ============================================================================= +// AddMul: v[i] += a[i] * b[i] (fused multiply-add) +// ============================================================================= + +__global__ void addmul_kernel(uint64_t *__restrict__ v0, + uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, + uint64_t *__restrict__ v3, + const uint64_t *__restrict__ a0, + const uint64_t *__restrict__ a1, + const uint64_t *__restrict__ a2, + const uint64_t *__restrict__ a3, + const uint64_t *__restrict__ b0, + const uint64_t *__restrict__ b1, + const uint64_t *__restrict__ b2, + const uint64_t *__restrict__ b3, + size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + uint64_t a[4] = {a0[idx], a1[idx], a2[idx], a3[idx]}; + uint64_t b[4] = {b0[idx], b1[idx], b2[idx], b3[idx]}; + uint64_t prod[4]; + fr_mul(prod, a, b); + + uint64_t v[4] = {v0[idx], v1[idx], v2[idx], v3[idx]}; + uint64_t result[4]; + fr_add(result, v, prod); + v0[idx] = result[0]; + v1[idx] = result[1]; + v2[idx] = result[2]; + v3[idx] = result[3]; +} + +void launch_addmul(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, const uint64_t *a3, + const uint64_t *b0, const uint64_t *b1, const uint64_t *b2, const uint64_t *b3, + size_t n, cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + addmul_kernel<<>>( + v0, v1, v2, v3, a0, a1, a2, a3, b0, b1, b2, b3, n); +} + +// ============================================================================= +// AddScalarMul: v[i] += a[i] * scalar (broadcast scalar multiply-add) +// ============================================================================= + +__global__ void add_scalar_mul_kernel(uint64_t *__restrict__ v0, + uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, + uint64_t *__restrict__ v3, + const uint64_t *__restrict__ a0, + const uint64_t *__restrict__ a1, + const uint64_t *__restrict__ a2, + const uint64_t *__restrict__ a3, + const uint64_t s0, const uint64_t s1, + const uint64_t s2, const uint64_t s3, + size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) return; + + uint64_t a[4] = {a0[idx], a1[idx], a2[idx], a3[idx]}; + uint64_t s[4] = {s0, s1, s2, s3}; + uint64_t prod[4]; + fr_mul(prod, a, s); + + uint64_t v[4] = {v0[idx], v1[idx], v2[idx], v3[idx]}; + uint64_t result[4]; + fr_add(result, v, prod); + v0[idx] = result[0]; + v1[idx] = result[1]; + v2[idx] = result[2]; + v3[idx] = result[3]; +} + +void launch_add_scalar_mul(uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, const uint64_t *a3, + const uint64_t scalar[4], size_t n, cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + add_scalar_mul_kernel<<>>( + v0, v1, v2, v3, a0, a1, a2, a3, + scalar[0], scalar[1], scalar[2], scalar[3], n); +} + +// ============================================================================= +// Fused gate constraint accumulation for PlonK quotient computation. +// result[i] = (result[i] + Ql[i]*L[i] + Qr[i]*R[i] + Qm[i]*L[i]*R[i] +// + Qo[i]*O[i] + Qk[i]) * zhKInv +// Single pass replaces 6 separate kernel launches per coset. +// ============================================================================= + +__global__ void plonk_gate_accum_kernel( + uint64_t *__restrict__ res0, uint64_t *__restrict__ res1, + uint64_t *__restrict__ res2, uint64_t *__restrict__ res3, + const uint64_t *__restrict__ Ql0, const uint64_t *__restrict__ Ql1, + const uint64_t *__restrict__ Ql2, const uint64_t *__restrict__ Ql3, + const uint64_t *__restrict__ Qr0, const uint64_t *__restrict__ Qr1, + const uint64_t *__restrict__ Qr2, const uint64_t *__restrict__ Qr3, + const uint64_t *__restrict__ Qm0, const uint64_t *__restrict__ Qm1, + const uint64_t *__restrict__ Qm2, const uint64_t *__restrict__ Qm3, + const uint64_t *__restrict__ Qo0, const uint64_t *__restrict__ Qo1, + const uint64_t *__restrict__ Qo2, const uint64_t *__restrict__ Qo3, + const uint64_t *__restrict__ Qk0, const uint64_t *__restrict__ Qk1, + const uint64_t *__restrict__ Qk2, const uint64_t *__restrict__ Qk3, + const uint64_t *__restrict__ L0, const uint64_t *__restrict__ L1_, + const uint64_t *__restrict__ L2, const uint64_t *__restrict__ L3, + const uint64_t *__restrict__ R0, const uint64_t *__restrict__ R1, + const uint64_t *__restrict__ R2, const uint64_t *__restrict__ R3, + const uint64_t *__restrict__ O0, const uint64_t *__restrict__ O1, + const uint64_t *__restrict__ O2, const uint64_t *__restrict__ O3, + const uint64_t zh0, const uint64_t zh1, + const uint64_t zh2, const uint64_t zh3, + size_t n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + uint64_t zhKInv[4] = {zh0, zh1, zh2, zh3}; + + // Load result (perm+boundary already accumulated) + uint64_t r[4] = {res0[i], res1[i], res2[i], res3[i]}; + + // Load wires + uint64_t l[4] = {L0[i], L1_[i], L2[i], L3[i]}; + uint64_t rv[4] = {R0[i], R1[i], R2[i], R3[i]}; + uint64_t o[4] = {O0[i], O1[i], O2[i], O3[i]}; + + // r += Ql * L + uint64_t ql[4] = {Ql0[i], Ql1[i], Ql2[i], Ql3[i]}; + uint64_t tmp[4]; + fr_mul(tmp, ql, l); + fr_add(r, r, tmp); + + // r += Qr * R + uint64_t qr[4] = {Qr0[i], Qr1[i], Qr2[i], Qr3[i]}; + fr_mul(tmp, qr, rv); + fr_add(r, r, tmp); + + // r += Qm * L * R + uint64_t qm[4] = {Qm0[i], Qm1[i], Qm2[i], Qm3[i]}; + uint64_t lr[4]; + fr_mul(lr, l, rv); + fr_mul(tmp, qm, lr); + fr_add(r, r, tmp); + + // r += Qo * O + uint64_t qo[4] = {Qo0[i], Qo1[i], Qo2[i], Qo3[i]}; + fr_mul(tmp, qo, o); + fr_add(r, r, tmp); + + // r += Qk + uint64_t qk[4] = {Qk0[i], Qk1[i], Qk2[i], Qk3[i]}; + fr_add(r, r, qk); + + // r *= zhKInv + uint64_t out[4]; + fr_mul(out, r, zhKInv); + + res0[i] = out[0]; res1[i] = out[1]; res2[i] = out[2]; res3[i] = out[3]; +} + +void launch_plonk_gate_accum( + uint64_t *res0, uint64_t *res1, uint64_t *res2, uint64_t *res3, + const uint64_t *Ql0, const uint64_t *Ql1, const uint64_t *Ql2, const uint64_t *Ql3, + const uint64_t *Qr0, const uint64_t *Qr1, const uint64_t *Qr2, const uint64_t *Qr3, + const uint64_t *Qm0, const uint64_t *Qm1, const uint64_t *Qm2, const uint64_t *Qm3, + const uint64_t *Qo0, const uint64_t *Qo1, const uint64_t *Qo2, const uint64_t *Qo3, + const uint64_t *Qk0, const uint64_t *Qk1, const uint64_t *Qk2, const uint64_t *Qk3, + const uint64_t *L0, const uint64_t *L1, const uint64_t *L2, const uint64_t *L3, + const uint64_t *R0, const uint64_t *R1, const uint64_t *R2, const uint64_t *R3, + const uint64_t *O0, const uint64_t *O1, const uint64_t *O2, const uint64_t *O3, + const uint64_t zhKInv[4], size_t n, cudaStream_t stream) +{ + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + plonk_gate_accum_kernel<<>>( + res0, res1, res2, res3, + Ql0, Ql1, Ql2, Ql3, + Qr0, Qr1, Qr2, Qr3, + Qm0, Qm1, Qm2, Qm3, + Qo0, Qo1, Qo2, Qo3, + Qk0, Qk1, Qk2, Qk3, + L0, L1, L2, L3, + R0, R1, R2, R3, + O0, O1, O2, O3, + zhKInv[0], zhKInv[1], zhKInv[2], zhKInv[3], + n); +} + +// ============================================================================= +// BatchInvert: v[i] = 1/v[i] using Montgomery batch inversion +// Two-level parallel prefix scan: +// 1. Forward: per-chunk prefix products (parallel), then fixup +// 2. Invert total product via Fermat's little theorem +// 3. Backward: per-chunk sweep to recover individual inverses (parallel) +// ============================================================================= + +// VRAM guardrail: each BatchInvert scratch arena stores bp/sa/tmp for four +// limbs (12 uint64 arrays). The large-vector path keeps two arenas, so retained +// scratch is roughly: +// +// 2 * 12 * 8 * ceil(n / BATCH_INV_CHUNK) bytes +// +// At n=2^27: chunk=256 => 96 MiB, 128 => 192 MiB, 64 => 384 MiB. +// Chunk 64 was faster in isolation but made repeated 2^27 PlonK proofs OOM. +// Do not lower this without adding an explicit scratch release/shrink path and +// validating repeated BenchmarkPlonkECMul750 runs. +constexpr size_t BATCH_INV_CHUNK = 256; + +// Device function: field inversion via Fermat's little theorem (a^(q-2) mod q) +__device__ void fr_invert(uint64_t result[4], const uint64_t a[4]) { + // q-2 for BLS12-377 Fr field + static constexpr uint64_t EXP[4] = { + 0x0a117fffffffffffULL, + 0x59aa76fed0000001ULL, + 0x60b44d1e5c37b001ULL, + 0x12ab655e9a2ca556ULL, + }; + + uint64_t base[4] = {a[0], a[1], a[2], a[3]}; + result[0] = Fr_params::ONE[0]; result[1] = Fr_params::ONE[1]; + result[2] = Fr_params::ONE[2]; result[3] = Fr_params::ONE[3]; + + for (int word = 0; word < 4; word++) { + uint64_t bits = EXP[word]; + for (int bit = 0; bit < 64; bit++) { + if (bits & 1) { + uint64_t tmp[4]; + fr_mul(tmp, result, base); + result[0] = tmp[0]; result[1] = tmp[1]; + result[2] = tmp[2]; result[3] = tmp[3]; + } + uint64_t tmp[4]; + fr_mul(tmp, base, base); + base[0] = tmp[0]; base[1] = tmp[1]; + base[2] = tmp[2]; base[3] = tmp[3]; + bits >>= 1; + } + } +} + +// Kernel 1: Local prefix products per chunk + store chunk product. +// Each thread handles one chunk of BATCH_INV_CHUNK elements sequentially. +__global__ void batch_invert_prefix_local( + uint64_t *__restrict__ v0, uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, uint64_t *__restrict__ v3, + uint64_t *__restrict__ bp0, uint64_t *__restrict__ bp1, + uint64_t *__restrict__ bp2, uint64_t *__restrict__ bp3, + size_t n) +{ + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + BATCH_INV_CHUNK - 1) / BATCH_INV_CHUNK; + if (chunk_id >= num_chunks) return; + + size_t start = chunk_id * BATCH_INV_CHUNK; + size_t end = start + BATCH_INV_CHUNK; + if (end > n) end = n; + + uint64_t acc[4] = {v0[start], v1[start], v2[start], v3[start]}; + for (size_t i = start + 1; i < end; i++) { + uint64_t elem[4] = {v0[i], v1[i], v2[i], v3[i]}; + uint64_t prod[4]; + fr_mul(prod, acc, elem); + acc[0] = prod[0]; acc[1] = prod[1]; + acc[2] = prod[2]; acc[3] = prod[3]; + v0[i] = acc[0]; v1[i] = acc[1]; + v2[i] = acc[2]; v3[i] = acc[3]; + } + + bp0[chunk_id] = acc[0]; bp1[chunk_id] = acc[1]; + bp2[chunk_id] = acc[2]; bp3[chunk_id] = acc[3]; +} + +// Kernel 2: Serial fixup (single thread). +// - Prefix product of block products +// - Invert total +// - Compute backward starting accs (suffix product * inv_total) +__global__ void batch_invert_serial_fixup( + uint64_t *__restrict__ bp0, uint64_t *__restrict__ bp1, + uint64_t *__restrict__ bp2, uint64_t *__restrict__ bp3, + uint64_t *__restrict__ sa0, uint64_t *__restrict__ sa1, + uint64_t *__restrict__ sa2, uint64_t *__restrict__ sa3, + size_t num_chunks) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) return; + + // Save original block products to sa[] temporarily + for (size_t i = 0; i < num_chunks; i++) { + sa0[i] = bp0[i]; sa1[i] = bp1[i]; + sa2[i] = bp2[i]; sa3[i] = bp3[i]; + } + + // Forward prefix product of block products (in-place in bp[]) + for (size_t i = 1; i < num_chunks; i++) { + uint64_t prev[4] = {bp0[i-1], bp1[i-1], bp2[i-1], bp3[i-1]}; + uint64_t curr[4] = {bp0[i], bp1[i], bp2[i], bp3[i]}; + uint64_t prod[4]; + fr_mul(prod, prev, curr); + bp0[i] = prod[0]; bp1[i] = prod[1]; + bp2[i] = prod[2]; bp3[i] = prod[3]; + } + + // Invert total product + uint64_t total[4] = { + bp0[num_chunks-1], bp1[num_chunks-1], + bp2[num_chunks-1], bp3[num_chunks-1] + }; + uint64_t inv[4]; + fr_invert(inv, total); + + // Compute backward starting accs: start_acc[k] = inv * prod(orig_bp[k+1..last]) + // Process right-to-left, saving next_orig before overwriting. + uint64_t acc[4] = {inv[0], inv[1], inv[2], inv[3]}; + uint64_t next_orig[4] = { + sa0[num_chunks-1], sa1[num_chunks-1], + sa2[num_chunks-1], sa3[num_chunks-1] + }; + sa0[num_chunks-1] = acc[0]; sa1[num_chunks-1] = acc[1]; + sa2[num_chunks-1] = acc[2]; sa3[num_chunks-1] = acc[3]; + + for (int k = (int)num_chunks - 2; k >= 0; k--) { + uint64_t tmp[4]; + fr_mul(tmp, acc, next_orig); + acc[0] = tmp[0]; acc[1] = tmp[1]; + acc[2] = tmp[2]; acc[3] = tmp[3]; + next_orig[0] = sa0[k]; next_orig[1] = sa1[k]; + next_orig[2] = sa2[k]; next_orig[3] = sa3[k]; + sa0[k] = acc[0]; sa1[k] = acc[1]; + sa2[k] = acc[2]; sa3[k] = acc[3]; + } +} + +// Prefix-only serial fixup (single thread). +// Computes inclusive prefix products of bp[] in-place. +// Used as the second level of a hierarchical scan to avoid long serial loops. +__global__ void batch_prefix_serial_fixup( + uint64_t *__restrict__ bp0, uint64_t *__restrict__ bp1, + uint64_t *__restrict__ bp2, uint64_t *__restrict__ bp3, + size_t num_chunks) +{ + if (threadIdx.x != 0 || blockIdx.x != 0) return; + for (size_t i = 1; i < num_chunks; i++) { + uint64_t prev[4] = {bp0[i-1], bp1[i-1], bp2[i-1], bp3[i-1]}; + uint64_t curr[4] = {bp0[i], bp1[i], bp2[i], bp3[i]}; + uint64_t prod[4]; + fr_mul(prod, prev, curr); + bp0[i] = prod[0]; bp1[i] = prod[1]; + bp2[i] = prod[2]; bp3[i] = prod[3]; + } +} + +// Kernel 3: Apply forward fixup — multiply each element in chunk k (for k>=1) +// by bp[k-1] (prefix of all prior block products). +__global__ void batch_invert_apply_fixup( + uint64_t *__restrict__ v0, uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, uint64_t *__restrict__ v3, + const uint64_t *__restrict__ bp0, const uint64_t *__restrict__ bp1, + const uint64_t *__restrict__ bp2, const uint64_t *__restrict__ bp3, + size_t n) +{ + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + BATCH_INV_CHUNK - 1) / BATCH_INV_CHUNK; + if (chunk_id == 0 || chunk_id >= num_chunks) return; + + size_t start = chunk_id * BATCH_INV_CHUNK; + size_t end = start + BATCH_INV_CHUNK; + if (end > n) end = n; + + uint64_t prefix[4] = {bp0[chunk_id-1], bp1[chunk_id-1], + bp2[chunk_id-1], bp3[chunk_id-1]}; + for (size_t i = start; i < end; i++) { + uint64_t elem[4] = {v0[i], v1[i], v2[i], v3[i]}; + uint64_t prod[4]; + fr_mul(prod, prefix, elem); + v0[i] = prod[0]; v1[i] = prod[1]; + v2[i] = prod[2]; v3[i] = prod[3]; + } +} + +// Kernel 4: Backward sweep — compute inverses in-place in v[]. +// v[] holds corrected prefix products (read within own chunk + bp for boundary). +// orig[] holds original values (read-only). bp[] holds prefixed block products. +// Results are written back to v[] (safe: only write to own chunk, and within a +// chunk we process backward so v[i-1] is read before v[i] is overwritten). +// Cross-chunk boundary: v[start-1] is read via bp[chunk_id-1] to avoid races. +__global__ void batch_invert_backward( + uint64_t *__restrict__ v0, uint64_t *__restrict__ v1, + uint64_t *__restrict__ v2, uint64_t *__restrict__ v3, + const uint64_t *__restrict__ orig0, const uint64_t *__restrict__ orig1, + const uint64_t *__restrict__ orig2, const uint64_t *__restrict__ orig3, + const uint64_t *__restrict__ bp0, const uint64_t *__restrict__ bp1, + const uint64_t *__restrict__ bp2, const uint64_t *__restrict__ bp3, + const uint64_t *__restrict__ sa0, const uint64_t *__restrict__ sa1, + const uint64_t *__restrict__ sa2, const uint64_t *__restrict__ sa3, + size_t n) +{ + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + BATCH_INV_CHUNK - 1) / BATCH_INV_CHUNK; + if (chunk_id >= num_chunks) return; + + size_t start = chunk_id * BATCH_INV_CHUNK; + size_t end = start + BATCH_INV_CHUNK; + if (end > n) end = n; + + uint64_t acc[4] = {sa0[chunk_id], sa1[chunk_id], + sa2[chunk_id], sa3[chunk_id]}; + + for (size_t i = end; i > start; ) { + i--; + uint64_t orig_val[4] = {orig0[i], orig1[i], orig2[i], orig3[i]}; + if (i > 0) { + uint64_t prefix_prev[4]; + if (i == start && chunk_id > 0) { + // Cross-chunk boundary: use bp[chunk_id-1] instead of v[start-1] + // bp[k] = prefix product of elements 0..(k+1)*CHUNK-1 after fixup + prefix_prev[0] = bp0[chunk_id-1]; prefix_prev[1] = bp1[chunk_id-1]; + prefix_prev[2] = bp2[chunk_id-1]; prefix_prev[3] = bp3[chunk_id-1]; + } else { + // Within-chunk: safe to read v[i-1] (not yet overwritten) + prefix_prev[0] = v0[i-1]; prefix_prev[1] = v1[i-1]; + prefix_prev[2] = v2[i-1]; prefix_prev[3] = v3[i-1]; + } + uint64_t result[4]; + fr_mul(result, acc, prefix_prev); + v0[i] = result[0]; v1[i] = result[1]; + v2[i] = result[2]; v3[i] = result[3]; + } else { + // i == 0: result = acc (no prefix before element 0) + v0[0] = acc[0]; v1[0] = acc[1]; + v2[0] = acc[2]; v3[0] = acc[3]; + } + uint64_t tmp[4]; + fr_mul(tmp, acc, orig_val); + acc[0] = tmp[0]; acc[1] = tmp[1]; + acc[2] = tmp[2]; acc[3] = tmp[3]; + } +} + +// BatchInvert scratch memory for block-level arrays (bp and sa). +// Pre-allocated to avoid per-call cudaMalloc/cudaFree overhead. +struct BatchInvertScratch { + uint64_t *bp[4] = {}; // block products (num_chunks per limb) + uint64_t *sa[4] = {}; // starting accs (num_chunks per limb) + uint64_t *tmp[4] = {}; // extra workspace (num_chunks per limb) + size_t capacity = 0; // number of chunks allocated +}; + +// Ensure scratch has capacity for at least num_chunks. +static cudaError_t batch_invert_scratch_ensure(BatchInvertScratch &s, size_t num_chunks) { + if (num_chunks <= s.capacity) return cudaSuccess; + + // Free old + for (int i = 0; i < 4; i++) { + if (s.bp[i]) { cudaFree(s.bp[i]); s.bp[i] = nullptr; } + if (s.sa[i]) { cudaFree(s.sa[i]); s.sa[i] = nullptr; } + if (s.tmp[i]) { cudaFree(s.tmp[i]); s.tmp[i] = nullptr; } + } + s.capacity = 0; + + // Allocate new (with 2x growth factor) + size_t alloc_chunks = num_chunks < 64 ? 64 : num_chunks; + cudaError_t err; + for (int i = 0; i < 4; i++) { + err = cudaMalloc(&s.bp[i], alloc_chunks * sizeof(uint64_t)); + if (err != cudaSuccess) return err; + err = cudaMalloc(&s.sa[i], alloc_chunks * sizeof(uint64_t)); + if (err != cudaSuccess) return err; + err = cudaMalloc(&s.tmp[i], alloc_chunks * sizeof(uint64_t)); + if (err != cudaSuccess) return err; + } + s.capacity = alloc_chunks; + return cudaSuccess; +} + +// Per-context scratch (thread-local would be needed for multi-context; for now, stored in context). +// The context struct will hold this. +static thread_local BatchInvertScratch g_batch_inv_scratch; +static thread_local BatchInvertScratch g_batch_inv_aux_scratch; + +// Baseline two-level batch inversion implementation using a single scratch arena. +// Good for small/medium vectors where num_chunks is not huge. +static cudaError_t launch_batch_invert_baseline( + uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + uint64_t *orig0, uint64_t *orig1, uint64_t *orig2, uint64_t *orig3, + size_t n, cudaStream_t stream, + BatchInvertScratch &scratch) +{ + if (n == 0) return cudaSuccess; + + size_t num_chunks = (n + BATCH_INV_CHUNK - 1) / BATCH_INV_CHUNK; + + cudaError_t err = batch_invert_scratch_ensure(scratch, num_chunks); + if (err != cudaSuccess) return err; + + auto &s = scratch; + + err = cudaMemcpyAsync(orig0, v0, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(orig1, v1, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(orig2, v2, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(orig3, v3, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + + constexpr unsigned threads = 256; + unsigned blocks = (num_chunks + threads - 1) / threads; + + batch_invert_prefix_local<<>>( + v0, v1, v2, v3, s.bp[0], s.bp[1], s.bp[2], s.bp[3], n); + + batch_invert_serial_fixup<<<1, 1, 0, stream>>>( + s.bp[0], s.bp[1], s.bp[2], s.bp[3], + s.sa[0], s.sa[1], s.sa[2], s.sa[3], num_chunks); + + batch_invert_apply_fixup<<>>( + v0, v1, v2, v3, s.bp[0], s.bp[1], s.bp[2], s.bp[3], n); + + batch_invert_backward<<>>( + v0, v1, v2, v3, orig0, orig1, orig2, orig3, + s.bp[0], s.bp[1], s.bp[2], s.bp[3], + s.sa[0], s.sa[1], s.sa[2], s.sa[3], n); + + return cudaSuccess; +} + +// Launch function: orchestrates the full batch inversion pipeline. +// v is modified in-place to contain 1/v[i]. +// orig[] is a temp buffer (same size as v) for original values. +// Fully async — no internal synchronization. +cudaError_t launch_batch_invert( + uint64_t *v0, uint64_t *v1, uint64_t *v2, uint64_t *v3, + uint64_t *orig0, uint64_t *orig1, uint64_t *orig2, uint64_t *orig3, + size_t n, cudaStream_t stream) +{ + if (n == 0) return cudaSuccess; + + size_t num_chunks = (n + BATCH_INV_CHUNK - 1) / BATCH_INV_CHUNK; + if (num_chunks <= BATCH_INV_CHUNK) { + return launch_batch_invert_baseline( + v0, v1, v2, v3, + orig0, orig1, orig2, orig3, + n, stream, g_batch_inv_scratch); + } + + // Hierarchical path for large vectors: + // 1) parallel local scan on v + // 2) hierarchical scan of chunk products (avoids long serial prefix loops) + // 3) build chunk start accumulators from scanned chunk prefixes + original chunk products + // 4) backward sweep on v + cudaError_t err = batch_invert_scratch_ensure(g_batch_inv_scratch, num_chunks); + if (err != cudaSuccess) return err; + err = batch_invert_scratch_ensure(g_batch_inv_aux_scratch, num_chunks); + if (err != cudaSuccess) return err; + + auto &s = g_batch_inv_scratch; + auto &aux = g_batch_inv_aux_scratch; + + err = cudaMemcpyAsync(orig0, v0, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(orig1, v1, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(orig2, v2, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(orig3, v3, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + + constexpr unsigned threads = 256; + unsigned blocks = (num_chunks + threads - 1) / threads; + + // Step 1: local scan within each primary chunk + primary chunk products in s.bp. + batch_invert_prefix_local<<>>( + v0, v1, v2, v3, s.bp[0], s.bp[1], s.bp[2], s.bp[3], n); + + // Step 2: hierarchical scan of primary chunk products. + size_t super_chunks = (num_chunks + BATCH_INV_CHUNK - 1) / BATCH_INV_CHUNK; + unsigned super_blocks = (super_chunks + threads - 1) / threads; + + batch_invert_prefix_local<<>>( + s.bp[0], s.bp[1], s.bp[2], s.bp[3], + aux.bp[0], aux.bp[1], aux.bp[2], aux.bp[3], + num_chunks); + + batch_prefix_serial_fixup<<<1, 1, 0, stream>>>( + aux.bp[0], aux.bp[1], aux.bp[2], aux.bp[3], super_chunks); + + batch_invert_apply_fixup<<>>( + s.bp[0], s.bp[1], s.bp[2], s.bp[3], + aux.bp[0], aux.bp[1], aux.bp[2], aux.bp[3], + num_chunks); + + // Apply global chunk-prefix fixup to the original vector. + batch_invert_apply_fixup<<>>( + v0, v1, v2, v3, s.bp[0], s.bp[1], s.bp[2], s.bp[3], n); + + // Step 3: start_acc[k] = 1/prefix[k]. Keep scanned prefixes in s.bp for + // boundary reads in backward sweep, and invert a copy in aux.tmp. + for (int i = 0; i < 4; i++) { + err = cudaMemcpyAsync( + aux.tmp[i], s.bp[i], num_chunks * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + } + + err = launch_batch_invert_baseline( + aux.tmp[0], aux.tmp[1], aux.tmp[2], aux.tmp[3], + s.tmp[0], s.tmp[1], s.tmp[2], s.tmp[3], + num_chunks, stream, aux); + if (err != cudaSuccess) return err; + + // Step 4: backward sweep with hierarchical start accumulators. + batch_invert_backward<<>>( + v0, v1, v2, v3, orig0, orig1, orig2, orig3, + s.bp[0], s.bp[1], s.bp[2], s.bp[3], + aux.tmp[0], aux.tmp[1], aux.tmp[2], aux.tmp[3], n); + + return cudaSuccess; +} + +// ============================================================================= +// Butterfly4: Size-4 inverse DFT butterfly for decomposed iFFT(4n). +// For each j in [0, n): +// (a0,a1,a2,a3) = (b0[j], b1[j], b2[j], b3[j]) +// b0[j] = (a0 + a1 + a2 + a3) * quarter +// b1[j] = ((a0 - a2) + omega4_inv*(a1 - a3)) * quarter +// b2[j] = (a0 - a1 + a2 - a3) * quarter +// b3[j] = ((a0 - a2) - omega4_inv*(a1 - a3)) * quarter +// ============================================================================= + +__global__ void butterfly4_kernel( + uint64_t *__restrict__ b0_0, uint64_t *__restrict__ b0_1, + uint64_t *__restrict__ b0_2, uint64_t *__restrict__ b0_3, + uint64_t *__restrict__ b1_0, uint64_t *__restrict__ b1_1, + uint64_t *__restrict__ b1_2, uint64_t *__restrict__ b1_3, + uint64_t *__restrict__ b2_0, uint64_t *__restrict__ b2_1, + uint64_t *__restrict__ b2_2, uint64_t *__restrict__ b2_3, + uint64_t *__restrict__ b3_0, uint64_t *__restrict__ b3_1, + uint64_t *__restrict__ b3_2, uint64_t *__restrict__ b3_3, + const uint64_t w0, const uint64_t w1, + const uint64_t w2, const uint64_t w3, + const uint64_t q0, const uint64_t q1, + const uint64_t q2, const uint64_t q3, + size_t n) +{ + size_t j = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n) return; + + uint64_t a0[4] = {b0_0[j], b0_1[j], b0_2[j], b0_3[j]}; + uint64_t a1[4] = {b1_0[j], b1_1[j], b1_2[j], b1_3[j]}; + uint64_t a2[4] = {b2_0[j], b2_1[j], b2_2[j], b2_3[j]}; + uint64_t a3[4] = {b3_0[j], b3_1[j], b3_2[j], b3_3[j]}; + + uint64_t omega4_inv[4] = {w0, w1, w2, w3}; + uint64_t quarter[4] = {q0, q1, q2, q3}; + + // t0 = a0 + a2, t1 = a0 - a2 + uint64_t t0[4], t1[4]; + fr_add(t0, a0, a2); + fr_sub(t1, a0, a2); + + // t2 = a1 + a3, t3 = omega4_inv * (a1 - a3) + uint64_t t2[4], t3[4], diff13[4]; + fr_add(t2, a1, a3); + fr_sub(diff13, a1, a3); + fr_mul(t3, omega4_inv, diff13); + + // r0 = (t0 + t2) * quarter = (a0+a1+a2+a3)/4 + uint64_t sum02[4], r0[4]; + fr_add(sum02, t0, t2); + fr_mul(r0, sum02, quarter); + + // r1 = (t1 + t3) * quarter = ((a0-a2) + w*(a1-a3))/4 + uint64_t sum13[4], r1[4]; + fr_add(sum13, t1, t3); + fr_mul(r1, sum13, quarter); + + // r2 = (t0 - t2) * quarter = (a0-a1+a2-a3)/4 + uint64_t sub02[4], r2[4]; + fr_sub(sub02, t0, t2); + fr_mul(r2, sub02, quarter); + + // r3 = (t1 - t3) * quarter = ((a0-a2) - w*(a1-a3))/4 + uint64_t sub13[4], r3[4]; + fr_sub(sub13, t1, t3); + fr_mul(r3, sub13, quarter); + + b0_0[j] = r0[0]; b0_1[j] = r0[1]; b0_2[j] = r0[2]; b0_3[j] = r0[3]; + b1_0[j] = r1[0]; b1_1[j] = r1[1]; b1_2[j] = r1[2]; b1_3[j] = r1[3]; + b2_0[j] = r2[0]; b2_1[j] = r2[1]; b2_2[j] = r2[2]; b2_3[j] = r2[3]; + b3_0[j] = r3[0]; b3_1[j] = r3[1]; b3_2[j] = r3[2]; b3_3[j] = r3[3]; +} + +void launch_butterfly4( + uint64_t *b0_0, uint64_t *b0_1, uint64_t *b0_2, uint64_t *b0_3, + uint64_t *b1_0, uint64_t *b1_1, uint64_t *b1_2, uint64_t *b1_3, + uint64_t *b2_0, uint64_t *b2_1, uint64_t *b2_2, uint64_t *b2_3, + uint64_t *b3_0, uint64_t *b3_1, uint64_t *b3_2, uint64_t *b3_3, + const uint64_t omega4_inv[4], const uint64_t quarter[4], + size_t n, cudaStream_t stream) +{ + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + butterfly4_kernel<<>>( + b0_0, b0_1, b0_2, b0_3, + b1_0, b1_1, b1_2, b1_3, + b2_0, b2_1, b2_2, b2_3, + b3_0, b3_1, b3_2, b3_3, + omega4_inv[0], omega4_inv[1], omega4_inv[2], omega4_inv[3], + quarter[0], quarter[1], quarter[2], quarter[3], + n); +} + +// ============================================================================= +// Fused permutation + boundary constraint kernel for PlonK. +// For each thread i: +// x_i = coset_gen * omega_powers[i] (identity point on coset) +// id1 = beta * x_i +// id2 = id1 * u (u = coset_shift) +// id3 = id1 * u_sq (u² = coset_shift²) +// num = Z[i] * (L[i]+id1+gamma) * (R[i]+id2+gamma) * (O[i]+id3+gamma) +// ZS = Z[(i+1) % n] +// den = ZS * (L[i]+beta*S1[i]+gamma) * (R[i]+beta*S2[i]+gamma) * (O[i]+beta*S3[i]+gamma) +// L1 = L1_scalar * L1_denInv[i] +// loc = (Z[i] - 1) * L1 +// result[i] = alpha * ((den - num) + alpha * loc) +// ============================================================================= + +__global__ void plonk_perm_boundary_kernel( + // Output + uint64_t *__restrict__ res0, uint64_t *__restrict__ res1, + uint64_t *__restrict__ res2, uint64_t *__restrict__ res3, + // Wire evaluations (SoA, natural order) + const uint64_t *__restrict__ L0, const uint64_t *__restrict__ L1_, + const uint64_t *__restrict__ L2, const uint64_t *__restrict__ L3, + const uint64_t *__restrict__ R0, const uint64_t *__restrict__ R1, + const uint64_t *__restrict__ R2, const uint64_t *__restrict__ R3, + const uint64_t *__restrict__ O0, const uint64_t *__restrict__ O1, + const uint64_t *__restrict__ O2, const uint64_t *__restrict__ O3, + // Z polynomial (SoA, natural order) + const uint64_t *__restrict__ Z0, const uint64_t *__restrict__ Z1_, + const uint64_t *__restrict__ Z2, const uint64_t *__restrict__ Z3, + // Permutation polynomials (SoA, natural order) + const uint64_t *__restrict__ S1_0, const uint64_t *__restrict__ S1_1, + const uint64_t *__restrict__ S1_2, const uint64_t *__restrict__ S1_3, + const uint64_t *__restrict__ S2_0, const uint64_t *__restrict__ S2_1, + const uint64_t *__restrict__ S2_2, const uint64_t *__restrict__ S2_3, + const uint64_t *__restrict__ S3_0, const uint64_t *__restrict__ S3_1, + const uint64_t *__restrict__ S3_2, const uint64_t *__restrict__ S3_3, + // Batch-inverted L1 denominators + const uint64_t *__restrict__ dinv0, const uint64_t *__restrict__ dinv1, + const uint64_t *__restrict__ dinv2, const uint64_t *__restrict__ dinv3, + // Scalar parameters (passed by value as 4 uint64 each) + const uint64_t al0, const uint64_t al1, const uint64_t al2, const uint64_t al3, + const uint64_t be0, const uint64_t be1, const uint64_t be2, const uint64_t be3, + const uint64_t ga0, const uint64_t ga1, const uint64_t ga2, const uint64_t ga3, + const uint64_t ls0, const uint64_t ls1, const uint64_t ls2, const uint64_t ls3, + const uint64_t u0, const uint64_t u1, const uint64_t u2, const uint64_t u3, + const uint64_t usq0, const uint64_t usq1, const uint64_t usq2, const uint64_t usq3, + const uint64_t cg0, const uint64_t cg1, const uint64_t cg2, const uint64_t cg3, + // Twiddle factors: omega^0..omega^(n/2-1) in SoA (half-size from NTT domain) + // For i >= n/2: omega^i = -omega^(i-n/2) since omega^(n/2) = -1 + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + size_t n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + uint64_t alpha[4] = {al0, al1, al2, al3}; + uint64_t beta[4] = {be0, be1, be2, be3}; + uint64_t gamma[4] = {ga0, ga1, ga2, ga3}; + uint64_t l1_scalar[4] = {ls0, ls1, ls2, ls3}; + uint64_t cshift[4] = {u0, u1, u2, u3}; + uint64_t cshift_sq[4] = {usq0, usq1, usq2, usq3}; + uint64_t coset_gen[4] = {cg0, cg1, cg2, cg3}; + + // Load wires + uint64_t li[4] = {L0[i], L1_[i], L2[i], L3[i]}; + uint64_t ri[4] = {R0[i], R1[i], R2[i], R3[i]}; + uint64_t oi[4] = {O0[i], O1[i], O2[i], O3[i]}; + + // Load Z[i] and Z[(i+1)%n] + uint64_t zi[4] = {Z0[i], Z1_[i], Z2[i], Z3[i]}; + size_t i_next = (i + 1 < n) ? (i + 1) : 0; + uint64_t zs[4] = {Z0[i_next], Z1_[i_next], Z2[i_next], Z3[i_next]}; + + // Identity permutation: x_i = coset_gen * omega^i + // Twiddles are half-size (n/2). For i >= n/2: omega^i = -omega^(i-n/2) + size_t half_n = n >> 1; + uint64_t tw[4]; + if (i < half_n) { + tw[0] = tw0[i]; tw[1] = tw1[i]; tw[2] = tw2[i]; tw[3] = tw3[i]; + } else { + // omega^i = -omega^(i-n/2) + size_t j = i - half_n; + uint64_t pos[4] = {tw0[j], tw1[j], tw2[j], tw3[j]}; + uint64_t zero[4] = {0, 0, 0, 0}; + fr_sub(tw, zero, pos); // tw = -pos + } + uint64_t x_i[4]; + fr_mul(x_i, coset_gen, tw); + + // id1 = beta * x_i, id2 = id1 * u, id3 = id1 * u² + uint64_t id1[4], id2[4], id3[4]; + fr_mul(id1, beta, x_i); + fr_mul(id2, id1, cshift); + fr_mul(id3, id1, cshift_sq); + + // num = Z[i] * (L+id1+gamma) * (R+id2+gamma) * (O+id3+gamma) + uint64_t t1[4], t2[4], t3[4]; + fr_add(t1, li, id1); + fr_add(t1, t1, gamma); + fr_add(t2, ri, id2); + fr_add(t2, t2, gamma); + fr_add(t3, oi, id3); + fr_add(t3, t3, gamma); + + uint64_t num[4], tmp[4]; + fr_mul(num, zi, t1); + fr_mul(tmp, num, t2); + fr_mul(num, tmp, t3); + + // den = ZS * (L+beta*S1+gamma) * (R+beta*S2+gamma) * (O+beta*S3+gamma) + uint64_t s1[4] = {S1_0[i], S1_1[i], S1_2[i], S1_3[i]}; + uint64_t s2[4] = {S2_0[i], S2_1[i], S2_2[i], S2_3[i]}; + uint64_t s3[4] = {S3_0[i], S3_1[i], S3_2[i], S3_3[i]}; + + uint64_t bs1[4], bs2[4], bs3[4]; + fr_mul(bs1, beta, s1); + fr_mul(bs2, beta, s2); + fr_mul(bs3, beta, s3); + + fr_add(t1, li, bs1); + fr_add(t1, t1, gamma); + fr_add(t2, ri, bs2); + fr_add(t2, t2, gamma); + fr_add(t3, oi, bs3); + fr_add(t3, t3, gamma); + + uint64_t den[4]; + fr_mul(den, zs, t1); + fr_mul(tmp, den, t2); + fr_mul(den, tmp, t3); + + // ordering = den - num (gnark convention: ZS*prod_sigma - Z*prod_id) + uint64_t ordering[4]; + fr_sub(ordering, den, num); + + // L1_i = l1_scalar * L1_denInv[i] + uint64_t dinv[4] = {dinv0[i], dinv1[i], dinv2[i], dinv3[i]}; + uint64_t l1_val[4]; + fr_mul(l1_val, l1_scalar, dinv); + + // local = (Z[i] - 1) * L1_i + uint64_t one[4] = {Fr_params::ONE[0], Fr_params::ONE[1], + Fr_params::ONE[2], Fr_params::ONE[3]}; + uint64_t zm1[4]; + fr_sub(zm1, zi, one); + uint64_t local_val[4]; + fr_mul(local_val, zm1, l1_val); + + // result[i] = alpha * (ordering + alpha * local) + uint64_t al_local[4]; + fr_mul(al_local, alpha, local_val); + uint64_t sum[4]; + fr_add(sum, ordering, al_local); + uint64_t result[4]; + fr_mul(result, alpha, sum); + + res0[i] = result[0]; + res1[i] = result[1]; + res2[i] = result[2]; + res3[i] = result[3]; +} + +// Struct to pass all scalar parameters to the launch function +struct PlonkPermBoundaryParams { + uint64_t alpha[4]; + uint64_t beta[4]; + uint64_t gamma[4]; + uint64_t l1_scalar[4]; + uint64_t coset_shift[4]; + uint64_t coset_shift_sq[4]; + uint64_t coset_gen[4]; +}; + +void launch_plonk_perm_boundary( + uint64_t *res0, uint64_t *res1, uint64_t *res2, uint64_t *res3, + const uint64_t *L0, const uint64_t *L1, const uint64_t *L2, const uint64_t *L3, + const uint64_t *R0, const uint64_t *R1, const uint64_t *R2, const uint64_t *R3, + const uint64_t *O0, const uint64_t *O1, const uint64_t *O2, const uint64_t *O3, + const uint64_t *Z0, const uint64_t *Z1, const uint64_t *Z2, const uint64_t *Z3, + const uint64_t *S1_0, const uint64_t *S1_1, const uint64_t *S1_2, const uint64_t *S1_3, + const uint64_t *S2_0, const uint64_t *S2_1, const uint64_t *S2_2, const uint64_t *S2_3, + const uint64_t *S3_0, const uint64_t *S3_1, const uint64_t *S3_2, const uint64_t *S3_3, + const uint64_t *dinv0, const uint64_t *dinv1, const uint64_t *dinv2, const uint64_t *dinv3, + const PlonkPermBoundaryParams ¶ms, + const uint64_t *tw0, const uint64_t *tw1, const uint64_t *tw2, const uint64_t *tw3, + size_t n, cudaStream_t stream) +{ + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + plonk_perm_boundary_kernel<<>>( + res0, res1, res2, res3, + L0, L1, L2, L3, + R0, R1, R2, R3, + O0, O1, O2, O3, + Z0, Z1, Z2, Z3, + S1_0, S1_1, S1_2, S1_3, + S2_0, S2_1, S2_2, S2_3, + S3_0, S3_1, S3_2, S3_3, + dinv0, dinv1, dinv2, dinv3, + params.alpha[0], params.alpha[1], params.alpha[2], params.alpha[3], + params.beta[0], params.beta[1], params.beta[2], params.beta[3], + params.gamma[0], params.gamma[1], params.gamma[2], params.gamma[3], + params.l1_scalar[0], params.l1_scalar[1], params.l1_scalar[2], params.l1_scalar[3], + params.coset_shift[0], params.coset_shift[1], params.coset_shift[2], params.coset_shift[3], + params.coset_shift_sq[0], params.coset_shift_sq[1], params.coset_shift_sq[2], params.coset_shift_sq[3], + params.coset_gen[0], params.coset_gen[1], params.coset_gen[2], params.coset_gen[3], + tw0, tw1, tw2, tw3, + n); +} + +// ============================================================================= +// PlonK Z-polynomial per-element ratio computation. +// For each i in [0, n): +// num[i] = (L[i]+β*ω^i+γ) * (R[i]+β*g*ω^i+γ) * (O[i]+β*g²*ω^i+γ) +// den[i] = (L[i]+β*id[S[i]]+γ) * (R[i]+β*id[S[n+i]]+γ) * (O[i]+β*id[S[2n+i]]+γ) +// where id[j] = g^(j>>log2n) * ω^(j&(n-1)) +// +// In-place: L is overwritten with num, R with den. O is read-only. +// ============================================================================= + +// Helper: compute omega^pos from twiddle table. +// Twiddles store ω^0..ω^(n/2-1). For pos >= n/2: ω^pos = -ω^(pos-n/2). +static __device__ __forceinline__ void get_omega( + uint64_t result[4], size_t pos, size_t half_n, + const uint64_t *tw0, const uint64_t *tw1, + const uint64_t *tw2, const uint64_t *tw3) +{ + if (pos < half_n) { + result[0] = tw0[pos]; result[1] = tw1[pos]; + result[2] = tw2[pos]; result[3] = tw3[pos]; + } else { + size_t j = pos - half_n; + uint64_t p[4] = {tw0[j], tw1[j], tw2[j], tw3[j]}; + uint64_t z[4] = {0, 0, 0, 0}; + fr_sub(result, z, p); + } +} + +// Helper: compute identity permutation evaluation at index perm_idx. +// id[perm_idx] = g^coset * ω^pos where coset = perm_idx >> log2n, pos = perm_idx & (n-1). +static __device__ __forceinline__ void get_perm_id_eval( + uint64_t result[4], int64_t perm_idx, + size_t n, size_t half_n, unsigned log2n, + const uint64_t g_mul[4], const uint64_t g_sq[4], + const uint64_t *tw0, const uint64_t *tw1, + const uint64_t *tw2, const uint64_t *tw3) +{ + unsigned coset = (unsigned)((size_t)perm_idx >> log2n); + size_t pos = (size_t)perm_idx & (n - 1); + + uint64_t omega_pos[4]; + get_omega(omega_pos, pos, half_n, tw0, tw1, tw2, tw3); + + if (coset == 0) { + result[0] = omega_pos[0]; result[1] = omega_pos[1]; + result[2] = omega_pos[2]; result[3] = omega_pos[3]; + } else if (coset == 1) { + fr_mul(result, g_mul, omega_pos); + } else { + fr_mul(result, g_sq, omega_pos); + } +} + +__global__ void plonk_z_ratio_kernel( + // L/num (read L, write num in-place) + uint64_t *__restrict__ LN0, uint64_t *__restrict__ LN1, + uint64_t *__restrict__ LN2, uint64_t *__restrict__ LN3, + // R/den (read R, write den in-place) + uint64_t *__restrict__ RD0, uint64_t *__restrict__ RD1, + uint64_t *__restrict__ RD2, uint64_t *__restrict__ RD3, + // O (read-only) + const uint64_t *__restrict__ O0, const uint64_t *__restrict__ O1, + const uint64_t *__restrict__ O2, const uint64_t *__restrict__ O3, + // Permutation table (3n int64s, device memory) + const int64_t *__restrict__ perm, + // Scalar parameters (passed by value) + const uint64_t be0, const uint64_t be1, const uint64_t be2, const uint64_t be3, + const uint64_t ga0, const uint64_t ga1, const uint64_t ga2, const uint64_t ga3, + const uint64_t gm0, const uint64_t gm1, const uint64_t gm2, const uint64_t gm3, + const uint64_t gs0, const uint64_t gs1, const uint64_t gs2, const uint64_t gs3, + // Twiddle factors (omega^0..omega^(n/2-1), SoA) + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + size_t n, unsigned log2n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + uint64_t beta[4] = {be0, be1, be2, be3}; + uint64_t gamma[4] = {ga0, ga1, ga2, ga3}; + uint64_t g_mul[4] = {gm0, gm1, gm2, gm3}; + uint64_t g_sq[4] = {gs0, gs1, gs2, gs3}; + + size_t half_n = n >> 1; + + // Read L[i], R[i], O[i] into registers before overwriting + uint64_t li[4] = {LN0[i], LN1[i], LN2[i], LN3[i]}; + uint64_t ri[4] = {RD0[i], RD1[i], RD2[i], RD3[i]}; + uint64_t oi[4] = {O0[i], O1[i], O2[i], O3[i]}; + + // Compute omega^i from twiddle table + uint64_t omega_i[4]; + get_omega(omega_i, i, half_n, tw0, tw1, tw2, tw3); + + // β * identity evaluations: β*ω^i, β*g*ω^i, β*g²*ω^i + // Optimize: compute β*ω^i once, then multiply by g and g² + uint64_t bid0[4]; + fr_mul(bid0, beta, omega_i); // β * ω^i + uint64_t bid1[4]; + fr_mul(bid1, g_mul, bid0); // g * β * ω^i = β * g * ω^i + uint64_t bid2[4]; + fr_mul(bid2, g_sq, bid0); // g² * β * ω^i = β * g² * ω^i + + // Numerator: (L + β*id0 + γ) * (R + β*id1 + γ) * (O + β*id2 + γ) + uint64_t t1[4], t2[4], t3[4], tmp[4]; + fr_add(t1, li, bid0); + fr_add(t1, t1, gamma); + fr_add(t2, ri, bid1); + fr_add(t2, t2, gamma); + fr_add(t3, oi, bid2); + fr_add(t3, t3, gamma); + + uint64_t num_val[4]; + fr_mul(tmp, t1, t2); + fr_mul(num_val, tmp, t3); + + // Denominator: look up permutation for each wire and compute identity evaluation + uint64_t sid0[4], sid1[4], sid2[4]; + get_perm_id_eval(sid0, perm[i], n, half_n, log2n, g_mul, g_sq, tw0, tw1, tw2, tw3); + get_perm_id_eval(sid1, perm[n + i], n, half_n, log2n, g_mul, g_sq, tw0, tw1, tw2, tw3); + get_perm_id_eval(sid2, perm[2*n + i], n, half_n, log2n, g_mul, g_sq, tw0, tw1, tw2, tw3); + + uint64_t bs0[4], bs1[4], bs2[4]; + fr_mul(bs0, beta, sid0); + fr_mul(bs1, beta, sid1); + fr_mul(bs2, beta, sid2); + + fr_add(t1, li, bs0); + fr_add(t1, t1, gamma); + fr_add(t2, ri, bs1); + fr_add(t2, t2, gamma); + fr_add(t3, oi, bs2); + fr_add(t3, t3, gamma); + + uint64_t den_val[4]; + fr_mul(tmp, t1, t2); + fr_mul(den_val, tmp, t3); + + // Write num to L, den to R (in-place) + LN0[i] = num_val[0]; LN1[i] = num_val[1]; LN2[i] = num_val[2]; LN3[i] = num_val[3]; + RD0[i] = den_val[0]; RD1[i] = den_val[1]; RD2[i] = den_val[2]; RD3[i] = den_val[3]; +} + +struct PlonkZRatioParams { + uint64_t beta[4]; + uint64_t gamma[4]; + uint64_t g_mul[4]; + uint64_t g_sq[4]; +}; + +void launch_plonk_z_ratio( + uint64_t *LN0, uint64_t *LN1, uint64_t *LN2, uint64_t *LN3, + uint64_t *RD0, uint64_t *RD1, uint64_t *RD2, uint64_t *RD3, + const uint64_t *O0, const uint64_t *O1, const uint64_t *O2, const uint64_t *O3, + const int64_t *d_perm, + const PlonkZRatioParams ¶ms, + const uint64_t *tw0, const uint64_t *tw1, const uint64_t *tw2, const uint64_t *tw3, + size_t n, unsigned log2n, cudaStream_t stream) +{ + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + plonk_z_ratio_kernel<<>>( + LN0, LN1, LN2, LN3, + RD0, RD1, RD2, RD3, + O0, O1, O2, O3, + d_perm, + params.beta[0], params.beta[1], params.beta[2], params.beta[3], + params.gamma[0], params.gamma[1], params.gamma[2], params.gamma[3], + params.g_mul[0], params.g_mul[1], params.g_mul[2], params.g_mul[3], + params.g_sq[0], params.g_sq[1], params.g_sq[2], params.g_sq[3], + tw0, tw1, tw2, tw3, + n, log2n); +} + +// ============================================================================= +// ComputeL1Den: out[i] = cosetGen * omega^i - 1 +// Uses the same twiddle access pattern as plonk_perm_boundary_kernel. +// The caller should BatchInvert the result to get L1DenInv. +// ============================================================================= + +__global__ void compute_l1_den_kernel( + uint64_t *__restrict__ out0, uint64_t *__restrict__ out1, + uint64_t *__restrict__ out2, uint64_t *__restrict__ out3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint64_t cg0, uint64_t cg1, uint64_t cg2, uint64_t cg3, size_t n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + uint64_t coset_gen[4] = {cg0, cg1, cg2, cg3}; + + // Get omega^i from twiddle table (half-size, same pattern as perm kernel) + size_t half_n = n >> 1; + uint64_t tw[4]; + if (i < half_n) { + tw[0] = tw0[i]; tw[1] = tw1[i]; tw[2] = tw2[i]; tw[3] = tw3[i]; + } else { + size_t j = i - half_n; + uint64_t pos[4] = {tw0[j], tw1[j], tw2[j], tw3[j]}; + uint64_t zero[4] = {0, 0, 0, 0}; + fr_sub(tw, zero, pos); + } + + // result = cosetGen * omega^i - 1 + uint64_t prod[4]; + fr_mul(prod, coset_gen, tw); + + uint64_t one[4] = {Fr_params::ONE[0], Fr_params::ONE[1], + Fr_params::ONE[2], Fr_params::ONE[3]}; + uint64_t result[4]; + fr_sub(result, prod, one); + + out0[i] = result[0]; + out1[i] = result[1]; + out2[i] = result[2]; + out3[i] = result[3]; +} + +void launch_compute_l1_den( + uint64_t *out0, uint64_t *out1, uint64_t *out2, uint64_t *out3, + const uint64_t *tw0, const uint64_t *tw1, const uint64_t *tw2, const uint64_t *tw3, + const uint64_t cg[4], size_t n, cudaStream_t stream) +{ + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + compute_l1_den_kernel<<>>( + out0, out1, out2, out3, + tw0, tw1, tw2, tw3, + cg[0], cg[1], cg[2], cg[3], n); +} + +// ============================================================================= +// Reduce blinded polynomial for coset evaluation +// +// dst[i] = src[i] + tail[j] * cosetPowN for j in [0, tail_len), i = j +// dst[i] = src[i] for i in [tail_len, n) +// +// tail coefficients are loaded from shared memory (uploaded from host AoS). +// tail_len is tiny (2-3), so we broadcast via shared memory. +// ============================================================================= + +__global__ void reduce_blinded_coset_kernel( + uint64_t *__restrict__ dst0, uint64_t *__restrict__ dst1, + uint64_t *__restrict__ dst2, uint64_t *__restrict__ dst3, + const uint64_t *__restrict__ src0, const uint64_t *__restrict__ src1, + const uint64_t *__restrict__ src2, const uint64_t *__restrict__ src3, + const uint64_t cpn0, const uint64_t cpn1, + const uint64_t cpn2, const uint64_t cpn3, + const uint64_t *__restrict__ tail_aos, // AoS: [t0_l0..t0_l3, t1_l0..t1_l3, ...] + uint32_t tail_len, uint32_t n) +{ + uint32_t i = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + uint64_t v[4] = { src0[i], src1[i], src2[i], src3[i] }; + + if (i < tail_len) { + // v[i] += tail[i] * cosetPowN + uint64_t t[4] = { tail_aos[i*4], tail_aos[i*4+1], tail_aos[i*4+2], tail_aos[i*4+3] }; + uint64_t cpn[4] = { cpn0, cpn1, cpn2, cpn3 }; + uint64_t prod[4]; + fr_mul(prod, t, cpn); + fr_add(v, v, prod); + } + + dst0[i] = v[0]; dst1[i] = v[1]; dst2[i] = v[2]; dst3[i] = v[3]; +} + +void launch_reduce_blinded_coset( + uint64_t *dst0, uint64_t *dst1, uint64_t *dst2, uint64_t *dst3, + const uint64_t *src0, const uint64_t *src1, + const uint64_t *src2, const uint64_t *src3, + const uint64_t cpn[4], + const uint64_t *tail_device, + uint32_t tail_len, uint32_t n, cudaStream_t stream) +{ + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + reduce_blinded_coset_kernel<<>>( + dst0, dst1, dst2, dst3, + src0, src1, src2, src3, + cpn[0], cpn[1], cpn[2], cpn[3], + tail_device, tail_len, n); +} + +// ============================================================================= +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/kernels.cu b/prover/gpu/cuda/src/plonk/kernels.cu new file mode 100644 index 00000000000..8d09478a456 --- /dev/null +++ b/prover/gpu/cuda/src/plonk/kernels.cu @@ -0,0 +1,370 @@ +// ============================================================================= +// Core Fr element-wise kernels (SoA backend) +// +// This file provides simple, high-throughput primitives used across the stack: +// - Montgomery multiply: c[i] = a[i] * b[i] mod r +// - Add/sub: c[i] = a[i] ± b[i] mod r +// - Layout transpose: AoS <-> SoA +// +// Data layouts: +// +// AoS (host / gnark-crypto): +// [e0.l0 e0.l1 e0.l2 e0.l3 | e1.l0 e1.l1 ...] +// +// SoA (device): +// limb0: [e0.l0 e1.l0 e2.l0 ...] +// limb1: [e0.l1 e1.l1 e2.l1 ...] +// limb2: [e0.l2 e1.l2 e2.l2 ...] +// limb3: [e0.l3 e1.l3 e2.l3 ...] +// +// Why SoA: +// Warps operating on consecutive elements read contiguous limb arrays, giving +// coalesced global memory access. +// ============================================================================= + +#include "field.cuh" +#include + +namespace gnark_gpu { + +// ============================================================================= +// Helper: 64-bit multiply giving 128-bit result +// ============================================================================= + +__device__ __forceinline__ void mul_wide(uint64_t a, uint64_t b, uint64_t &lo, uint64_t &hi) { + lo = a * b; + hi = __umul64hi(a, b); +} + +// ============================================================================= +// Helper: Add with carry (a + b + carry_in) -> (result, carry_out) +// ============================================================================= + +__device__ __forceinline__ uint64_t add_with_carry(uint64_t a, uint64_t b, uint64_t carry_in, + uint64_t &carry_out) { + uint64_t sum = a + b; + carry_out = (sum < a) ? 1ULL : 0ULL; + uint64_t sum2 = sum + carry_in; + carry_out += (sum2 < sum) ? 1ULL : 0ULL; + return sum2; +} + +// ============================================================================= +// CIOS Montgomery Multiplication kernel for Fr (4 limbs) +// Reference: Algorithm 2 from "Montgomery Multiplication on Modern Processors" +// ============================================================================= + +__global__ void mul_mont_fr_kernel(const uint64_t *__restrict__ a0, + const uint64_t *__restrict__ a1, + const uint64_t *__restrict__ a2, + const uint64_t *__restrict__ a3, + const uint64_t *__restrict__ b0, + const uint64_t *__restrict__ b1, + const uint64_t *__restrict__ b2, + const uint64_t *__restrict__ b3, + uint64_t *__restrict__ c0, uint64_t *__restrict__ c1, + uint64_t *__restrict__ c2, uint64_t *__restrict__ c3, + size_t n) { + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) + return; + + // BLS12-377 Fr modulus + constexpr uint64_t q[4] = { + Fr_params::MODULUS[0], Fr_params::MODULUS[1], + Fr_params::MODULUS[2], Fr_params::MODULUS[3] + }; + // Montgomery constant: qInvNeg = -q^(-1) mod 2^64 + constexpr uint64_t qInvNeg = Fr_params::INV; + + // Load operands + uint64_t a[4] = {__ldg(&a0[idx]), __ldg(&a1[idx]), __ldg(&a2[idx]), __ldg(&a3[idx])}; + uint64_t b[4] = {__ldg(&b0[idx]), __ldg(&b1[idx]), __ldg(&b2[idx]), __ldg(&b3[idx])}; + + // Working registers T[0..4] - one extra for overflow + uint64_t t[5] = {0, 0, 0, 0, 0}; + + // CIOS: 4 iterations, one per limb of a + for (int i = 0; i < 4; i++) { + uint64_t carry = 0; + uint64_t lo, hi; + + // Step 1: t = t + a[i] * b + for (int j = 0; j < 4; j++) { + mul_wide(a[i], b[j], lo, hi); + // t[j] = t[j] + lo + carry + uint64_t tmp = t[j] + lo; + uint64_t c1 = (tmp < t[j]) ? 1ULL : 0ULL; + uint64_t tmp2 = tmp + carry; + uint64_t c2 = (tmp2 < tmp) ? 1ULL : 0ULL; + t[j] = tmp2; + carry = hi + c1 + c2; + } + t[4] += carry; + + // Step 2: m = t[0] * qInvNeg mod 2^64 + uint64_t m = t[0] * qInvNeg; + + // Step 3: t = (t + m * q) / 2^64 + carry = 0; + for (int j = 0; j < 4; j++) { + mul_wide(m, q[j], lo, hi); + uint64_t tmp = t[j] + lo; + uint64_t c1 = (tmp < t[j]) ? 1ULL : 0ULL; + uint64_t tmp2 = tmp + carry; + uint64_t c2 = (tmp2 < tmp) ? 1ULL : 0ULL; + if (j > 0) { + t[j - 1] = tmp2; + } + carry = hi + c1 + c2; + } + t[3] = t[4] + carry; + t[4] = 0; + } + + // Final reduction: if t >= q, then t = t - q + uint64_t borrow = 0; + uint64_t r[4]; + + // Subtract q from t + for (int j = 0; j < 4; j++) { + uint64_t diff = t[j] - q[j] - borrow; + borrow = (t[j] < q[j] + borrow) ? 1ULL : ((t[j] == q[j] && borrow) ? 1ULL : 0ULL); + // More accurate borrow calculation + if (t[j] < q[j]) { + borrow = 1; + } else if (t[j] == q[j]) { + // borrow stays the same + } else { + borrow = 0; + } + r[j] = diff; + } + + // If no borrow, t >= q, use reduced value + // If borrow, t < q, use original value + if (borrow) { + // t < q, use t + c0[idx] = t[0]; + c1[idx] = t[1]; + c2[idx] = t[2]; + c3[idx] = t[3]; + } else { + // t >= q, use r = t - q + c0[idx] = r[0]; + c1[idx] = r[1]; + c2[idx] = r[2]; + c3[idx] = r[3]; + } +} + +// ============================================================================= +// Addition kernel for Fr (4 limbs) with modular reduction +// result = (a + b) mod p +// ============================================================================= + +__global__ void add_fr_kernel(const uint64_t *__restrict__ a0, + const uint64_t *__restrict__ a1, + const uint64_t *__restrict__ a2, + const uint64_t *__restrict__ a3, + const uint64_t *__restrict__ b0, + const uint64_t *__restrict__ b1, + const uint64_t *__restrict__ b2, + const uint64_t *__restrict__ b3, uint64_t *__restrict__ c0, + uint64_t *__restrict__ c1, uint64_t *__restrict__ c2, + uint64_t *__restrict__ c3, size_t n) { + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) + return; + + constexpr uint64_t p0 = Fr_params::MODULUS[0], p1 = Fr_params::MODULUS[1]; + constexpr uint64_t p2 = Fr_params::MODULUS[2], p3 = Fr_params::MODULUS[3]; + + // Load operands + uint64_t A0 = __ldg(&a0[idx]), A1 = __ldg(&a1[idx]); + uint64_t A2 = __ldg(&a2[idx]), A3 = __ldg(&a3[idx]); + uint64_t B0 = __ldg(&b0[idx]), B1 = __ldg(&b1[idx]); + uint64_t B2 = __ldg(&b2[idx]), B3 = __ldg(&b3[idx]); + + // Add with carry chain using PTX + uint64_t r0, r1, r2, r3, carry; + + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(r0) : "l"(A0), "l"(B0)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r1) : "l"(A1), "l"(B1)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r2) : "l"(A2), "l"(B2)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r3) : "l"(A3), "l"(B3)); + asm volatile("addc.u64 %0, 0, 0;" : "=l"(carry)); + + // Subtract modulus to check if reduction needed + uint64_t t0, t1, t2, t3, borrow; + + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(t0) : "l"(r0), "l"(p0)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t1) : "l"(r1), "l"(p1)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t2) : "l"(r2), "l"(p2)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(t3) : "l"(r3), "l"(p3)); + asm volatile("subc.u64 %0, %1, 0;" : "=l"(borrow) : "l"(carry)); + + // If no borrow (borrow == 0), use reduced value; else use original + // borrow == 0 means r >= p, so we should use t (the reduced value) + bool use_reduced = (borrow == 0); + c0[idx] = use_reduced ? t0 : r0; + c1[idx] = use_reduced ? t1 : r1; + c2[idx] = use_reduced ? t2 : r2; + c3[idx] = use_reduced ? t3 : r3; +} + +// ============================================================================= +// Subtraction kernel for Fr (4 limbs) with modular reduction +// result = (a - b) mod p +// ============================================================================= + +__global__ void sub_fr_kernel(const uint64_t *__restrict__ a0, + const uint64_t *__restrict__ a1, + const uint64_t *__restrict__ a2, + const uint64_t *__restrict__ a3, + const uint64_t *__restrict__ b0, + const uint64_t *__restrict__ b1, + const uint64_t *__restrict__ b2, + const uint64_t *__restrict__ b3, uint64_t *__restrict__ c0, + uint64_t *__restrict__ c1, uint64_t *__restrict__ c2, + uint64_t *__restrict__ c3, size_t n) { + auto idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= n) + return; + + constexpr uint64_t p0 = Fr_params::MODULUS[0], p1 = Fr_params::MODULUS[1]; + constexpr uint64_t p2 = Fr_params::MODULUS[2], p3 = Fr_params::MODULUS[3]; + + // Load operands + uint64_t A0 = __ldg(&a0[idx]), A1 = __ldg(&a1[idx]); + uint64_t A2 = __ldg(&a2[idx]), A3 = __ldg(&a3[idx]); + uint64_t B0 = __ldg(&b0[idx]), B1 = __ldg(&b1[idx]); + uint64_t B2 = __ldg(&b2[idx]), B3 = __ldg(&b3[idx]); + + // Subtract with borrow chain using PTX + uint64_t r0, r1, r2, r3, borrow; + + asm volatile("sub.cc.u64 %0, %1, %2;" : "=l"(r0) : "l"(A0), "l"(B0)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(r1) : "l"(A1), "l"(B1)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(r2) : "l"(A2), "l"(B2)); + asm volatile("subc.cc.u64 %0, %1, %2;" : "=l"(r3) : "l"(A3), "l"(B3)); + asm volatile("subc.u64 %0, 0, 0;" : "=l"(borrow)); + + // If borrow occurred (a < b), add modulus back + // borrow will be 0xFFFFFFFFFFFFFFFF if underflow occurred + if (borrow != 0) { + asm volatile("add.cc.u64 %0, %1, %2;" : "=l"(r0) : "l"(r0), "l"(p0)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r1) : "l"(r1), "l"(p1)); + asm volatile("addc.cc.u64 %0, %1, %2;" : "=l"(r2) : "l"(r2), "l"(p2)); + asm volatile("addc.u64 %0, %1, %2;" : "=l"(r3) : "l"(r3), "l"(p3)); + } + + c0[idx] = r0; + c1[idx] = r1; + c2[idx] = r2; + c3[idx] = r3; +} + +// ============================================================================= +// AoS → SoA transpose kernel for Fr +// Input: AoS format [e0.l0, e0.l1, e0.l2, e0.l3, e1.l0, e1.l1, ...] +// Output: SoA format limb0[e0.l0, e1.l0, ...], limb1[e0.l1, e1.l1, ...], ... +// ============================================================================= + +__global__ void transpose_aos_to_soa_fr_kernel(uint64_t *__restrict__ limb0, + uint64_t *__restrict__ limb1, + uint64_t *__restrict__ limb2, + uint64_t *__restrict__ limb3, + const uint64_t *__restrict__ aos_data, + size_t count) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= count) + return; + + const uint64_t *elem = aos_data + idx * 4; + limb0[idx] = elem[0]; + limb1[idx] = elem[1]; + limb2[idx] = elem[2]; + limb3[idx] = elem[3]; +} + +// ============================================================================= +// SoA → AoS transpose kernel for Fr +// Input: SoA format limb0[e0.l0, e1.l0, ...], limb1[e0.l1, e1.l1, ...], ... +// Output: AoS format [e0.l0, e0.l1, e0.l2, e0.l3, e1.l0, e1.l1, ...] +// ============================================================================= + +__global__ void transpose_soa_to_aos_fr_kernel(uint64_t *__restrict__ aos_data, + const uint64_t *__restrict__ limb0, + const uint64_t *__restrict__ limb1, + const uint64_t *__restrict__ limb2, + const uint64_t *__restrict__ limb3, + size_t count) { + size_t idx = blockIdx.x * blockDim.x + threadIdx.x; + if (idx >= count) + return; + + uint64_t *elem = aos_data + idx * 4; + elem[0] = limb0[idx]; + elem[1] = limb1[idx]; + elem[2] = limb2[idx]; + elem[3] = limb3[idx]; +} + +// ============================================================================= +// Kernel launchers +// ============================================================================= + +void launch_mul_mont_fr(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t *c3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, + const uint64_t *a3, const uint64_t *b0, const uint64_t *b1, + const uint64_t *b2, const uint64_t *b3, size_t n, + cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + mul_mont_fr_kernel<<>>(a0, a1, a2, a3, b0, b1, b2, b3, c0, + c1, c2, c3, n); +} + +void launch_add_fr(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t *c3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, + const uint64_t *a3, const uint64_t *b0, const uint64_t *b1, + const uint64_t *b2, const uint64_t *b3, size_t n, + cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + add_fr_kernel<<>>(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, + c2, c3, n); +} + +void launch_sub_fr(uint64_t *c0, uint64_t *c1, uint64_t *c2, uint64_t *c3, + const uint64_t *a0, const uint64_t *a1, const uint64_t *a2, + const uint64_t *a3, const uint64_t *b0, const uint64_t *b1, + const uint64_t *b2, const uint64_t *b3, size_t n, + cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + sub_fr_kernel<<>>(a0, a1, a2, a3, b0, b1, b2, b3, c0, c1, + c2, c3, n); +} + +void launch_transpose_aos_to_soa_fr(uint64_t *limb0, uint64_t *limb1, uint64_t *limb2, + uint64_t *limb3, const uint64_t *aos_data, size_t count, + cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (count + threads - 1) / threads; + transpose_aos_to_soa_fr_kernel<<>>(limb0, limb1, limb2, + limb3, aos_data, count); +} + +void launch_transpose_soa_to_aos_fr(uint64_t *aos_data, const uint64_t *limb0, + const uint64_t *limb1, const uint64_t *limb2, + const uint64_t *limb3, size_t count, + cudaStream_t stream) { + constexpr unsigned threads = 256; + unsigned blocks = (count + threads - 1) / threads; + transpose_soa_to_aos_fr_kernel<<>>(aos_data, limb0, limb1, + limb2, limb3, count); +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/msm.cu b/prover/gpu/cuda/src/plonk/msm.cu new file mode 100644 index 00000000000..edf9535b04b --- /dev/null +++ b/prover/gpu/cuda/src/plonk/msm.cu @@ -0,0 +1,1394 @@ +// ═══════════════════════════════════════════════════════════════════════════════ +// MSM (Multi-Scalar Multiplication) for BLS12-377 G1 +// +// Computes: Q = Σᵢ sᵢ · Pᵢ for n scalar-point pairs (sᵢ, Pᵢ) +// +// Algorithm: Pippenger's bucket method with signed-digit decomposition +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ Pippenger's Method Overview │ +// │ │ +// │ Each 253-bit scalar sᵢ is decomposed into w windows of c bits: │ +// │ sᵢ = Σⱼ dᵢⱼ · 2^(j·c) where dᵢⱼ ∈ {-2^(c-1), ..., 2^(c-1)} │ +// │ │ +// │ Signed digits halve the bucket count: 2^(c-1) buckets per window. │ +// │ When dᵢⱼ < 0, we negate the point and use bucket |dᵢⱼ|. │ +// │ │ +// │ For each window j, the bucket sum is: │ +// │ Wⱼ = Σ_b b · (Σ {Pᵢ : |dᵢⱼ| = b}) │ +// │ │ +// │ Final result via Horner's rule: │ +// │ Q = (...((W[w-1])·2^c + W[w-2])·2^c + ...)·2^c + W[0] │ +// └─────────────────────────────────────────────────────────────────────────┘ +// +// GPU Pipeline: build_pairs → radix sort → boundaries → accumulate → reduce +// Host: Horner combination in TE coordinates, single TE→Jacobian at end. +// ═══════════════════════════════════════════════════════════════════════════════ + +#include "ec.cuh" +#include +#include +#include + +namespace gnark_gpu { + +// ============================================================================= +// MSM configuration +// ============================================================================= + +static constexpr int MSM_SCALAR_BITS = 253; +static constexpr int ACCUM_PARALLEL_THREADS = 128; +static constexpr int REDUCE_THREADS_PER_WINDOW = 128; +static constexpr int FINALIZE_THREADS = 32; + +// Two-phase bucket accumulation cap. +// Phase 1 (sequential): each thread processes at most CAP entries per bucket. +// Phase 2 (parallel): 128 threads/block handle any remaining entries. +// For uniform scalars (avg bucket ~70), phase 1 handles everything. +// For concentrated scalars (huge buckets), phase 2 distributes the tail work. +static constexpr int ACCUM_SEQ_CAP = 256; + +static int forced_window_bits() { + static int forced_c = -1; + if(forced_c != -1) return forced_c; + + forced_c = 0; + const char *env = std::getenv("GNARK_GPU_MSM_FORCE_C"); + if(!env || !*env) return forced_c; + + const int parsed = std::atoi(env); + // Keep c within a safe range for 32-bit bucket math and 253-bit scalars. + if(parsed >= 1 && parsed <= 23) forced_c = parsed; + return forced_c; +} + +static size_t host_register_threshold_points() { + static int threshold = -1; + if(threshold != -1) return (size_t)threshold; + + threshold = 1 << 20; + const char *env = std::getenv("GNARK_GPU_MSM_REGISTER_THRESHOLD"); + if(!env || !*env) return (size_t)threshold; + + const int parsed = std::atoi(env); + if(parsed >= 0) threshold = parsed; + return (size_t)threshold; +} + +static int overflow_compaction_mode() { + static int mode = -2; + if(mode != -2) return mode; + + mode = -1; + const char *env = std::getenv("GNARK_GPU_MSM_COMPACT_OVERFLOWS"); + if(env && *env) mode = std::atoi(env) != 0 ? 1 : 0; + return mode; +} + +static bool compact_overflow_buckets(size_t n) { + int mode = overflow_compaction_mode(); + if(mode != -1) return mode != 0; + return n <= (1u << 23); +} + +// Enable batched-affine bucket accumulation. Requires d_points_sw to be +// populated via msm_load_points_sw. Off by default. +// Read on each invocation so tests can toggle without process restart. +static bool batched_affine_enabled() { + const char *env = std::getenv("GNARK_GPU_MSM_BATCHED_AFFINE"); + return env && *env && std::atoi(env) != 0; +} + + +// Window-size schedule for BLS12-377 signed-digit MSM. +// Empirical outcome on real gnark scalar datasets: +// - c=13 is best for tiny sizes, +// - c=15 is best for small-mid, +// - c=17 is consistently best from ~2^19 upward. +// +// We intentionally avoid c=19/c=20 defaults: they can look good on synthetic +// random inputs but regress badly on large real runs due bucket skew and +// higher reduction overhead. +static int compute_optimal_c(size_t n) { + const int forced_c = forced_window_bits(); + if(forced_c != 0) return forced_c; + + if(n <= (1 << 14)) return 13; + if(n <= (1 << 18)) return 15; + return 17; +} + +// ============================================================================= +// Kernel 1: Build (bucket_id, point_idx) pairs — signed-digit decomposition +// +// For each window, extract c bits + carry, apply signed-digit reduction: +// digit > 2^(c-1) → negate: digit = 2^c - digit, sign = 1, carry = 1 +// digit ≤ 2^(c-1) → positive: sign = 0, carry = 0 +// digit == 0 → sentinel (skip bucket assignment) +// +// Scalars are decomposed in Montgomery form; host corrects by R^{-1}. +// ============================================================================= + +__global__ void __launch_bounds__(256) build_pairs_kernel( + const uint64_t *__restrict__ scalars, + uint32_t *__restrict__ keys, + uint32_t *__restrict__ vals, + size_t n, int c, int num_windows, int num_buckets, int total_buckets, + size_t point_offset) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t s[4]; + s[0] = scalars[idx * 4 + 0]; + s[1] = scalars[idx * 4 + 1]; + s[2] = scalars[idx * 4 + 2]; + s[3] = scalars[idx * 4 + 3]; + + uint32_t c_mask = (1u << c) - 1; + uint32_t carry = 0; + const uint32_t point_base = (uint32_t)(idx + point_offset) & 0x7FFFFFFFu; + + for(int w = 0; w < num_windows; w++) { + int bit_offset = w * c; + int limb_idx = bit_offset / 64; + int bit_shift = bit_offset % 64; + + uint32_t digit; + if(limb_idx >= 4) { + digit = 0; + } else { + digit = (uint32_t)(s[limb_idx] >> bit_shift); + if(bit_shift + c > 64 && limb_idx + 1 < 4) + digit |= (uint32_t)(s[limb_idx + 1] << (64 - bit_shift)); + } + size_t out_idx = (size_t)idx * num_windows + w; + digit = (digit & c_mask) + carry; + + carry = (digit > (uint32_t)num_buckets) ? 1u : 0u; + uint32_t neg_digit = (1u << c) - digit; + uint32_t use_neg = carry; + uint32_t bucket = use_neg ? neg_digit : digit; + uint32_t sign = use_neg; + + // Handle edge case: 2^c - digit == 0 when digit == 2^c (carry overflow) + uint32_t is_overflow = (bucket == 0 && use_neg) ? 1u : 0u; + carry |= is_overflow; + sign &= ~is_overflow; + + keys[out_idx] = (bucket == 0) ? (uint32_t)total_buckets + : (uint32_t)(w * num_buckets + (bucket - 1)); + // Store absolute point index (chunk-relative idx + point_offset) + vals[out_idx] = point_base | (sign << 31); + } +} + +// ============================================================================= +// Kernel 2: Detect bucket boundaries in sorted key array +// ============================================================================= + +__global__ void __launch_bounds__(256) detect_bucket_boundaries_kernel( + const uint32_t *__restrict__ sorted_keys, + uint32_t *__restrict__ bucket_offsets, + uint32_t *__restrict__ bucket_ends, + size_t assignments, int total_buckets) { + + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(i >= assignments) return; + + uint32_t key = sorted_keys[i]; + if(key >= (uint32_t)total_buckets) return; + + if(i == 0 || sorted_keys[i - 1] != key) bucket_offsets[key] = (uint32_t)i; + if(i == assignments - 1 || sorted_keys[i + 1] != key) bucket_ends[key] = (uint32_t)(i + 1); +} + +// ============================================================================= +// Kernel 3: Accumulate points per bucket (sequential, one thread per bucket) +// ============================================================================= + +__global__ void __launch_bounds__(256, 2) + accumulate_buckets_kernel( + const G1EdXY *__restrict__ points, + const uint32_t *__restrict__ point_indices, + const uint32_t *__restrict__ bucket_offsets, + const uint32_t *__restrict__ bucket_ends, + G1EdExtended *__restrict__ buckets, + int total_buckets, + bool add_to_existing, + int cap, + uint32_t *__restrict__ overflow_buckets, + uint32_t *__restrict__ overflow_count) { + + int bucket_flat = blockIdx.x * blockDim.x + threadIdx.x; + if(bucket_flat >= total_buckets) return; + + uint32_t start = bucket_offsets[bucket_flat]; + uint32_t full_end = bucket_ends[bucket_flat]; + uint32_t end = full_end; + + // Cap: process at most `cap` entries per bucket (0 = unlimited). + // Phase 2 (parallel kernel) handles any remainder. + if(cap > 0 && full_end > start + (uint32_t)cap) { + end = start + (uint32_t)cap; + if(overflow_buckets && overflow_count) { + uint32_t slot = atomicAdd(overflow_count, 1u); + overflow_buckets[slot] = (uint32_t)bucket_flat; + } + } + + G1EdExtended acc; + if(add_to_existing) + acc = buckets[bucket_flat]; + else + ec_te_set_identity(acc); + + for(uint32_t i = start; i < end; i++) { + uint32_t packed = point_indices[i]; + G1EdXY pt = points[packed & 0x7FFFFFFFu]; + ec_te_cnegate_xy(pt, (bool)(packed >> 31)); + ec_te_unified_mixed_add_xy(acc, pt); + } + + buckets[bucket_flat] = acc; +} + +// ============================================================================= +// Kernel 3b: Parallel bucket accumulation (one block per bucket, tree reduce) +// ============================================================================= + +__global__ void __launch_bounds__(128, 4) + accumulate_buckets_parallel_kernel( + const G1EdXY *__restrict__ points, + const uint32_t *__restrict__ point_indices, + const uint32_t *__restrict__ bucket_offsets, + const uint32_t *__restrict__ bucket_ends, + const uint32_t *__restrict__ overflow_buckets, + G1EdExtended *__restrict__ buckets, + bool add_to_existing, + uint32_t start_offset) { + + int bucket_flat = overflow_buckets ? overflow_buckets[blockIdx.x] : blockIdx.x; + int tid = threadIdx.x; + uint32_t start = bucket_offsets[bucket_flat] + start_offset; + uint32_t end = bucket_ends[bucket_flat]; + + if(start >= end) { + return; + } + + G1EdExtended acc; + ec_te_set_identity(acc); + for(uint32_t i = start + tid; i < end; i += ACCUM_PARALLEL_THREADS) { + uint32_t packed = point_indices[i]; + G1EdXY pt = points[packed & 0x7FFFFFFFu]; + ec_te_cnegate_xy(pt, (bool)(packed >> 31)); + ec_te_unified_mixed_add_xy(acc, pt); + } + + extern __shared__ G1EdExtended shared[]; + shared[tid] = acc; + __syncthreads(); + for(int stride = ACCUM_PARALLEL_THREADS / 2; stride > 0; stride >>= 1) { + if(tid < stride) ec_te_unified_add(shared[tid], shared[tid + stride]); + __syncthreads(); + } + if(tid == 0) { + if(add_to_existing) + ec_te_unified_add(buckets[bucket_flat], shared[0]); + else + buckets[bucket_flat] = shared[0]; + } +} + +// ============================================================================= +// Small TE scalar multiply (double-and-add, for reduce corrections) +// ============================================================================= + +__device__ __forceinline__ void ec_te_mul_small(G1EdExtended &out, const G1EdExtended &in, int k) { + ec_te_set_identity(out); + if(k <= 0) return; + G1EdExtended base = in; + while(k > 0) { + if(k & 1) ec_te_unified_add(out, base); + k >>= 1; + if(k > 0) ec_te_unified_add(base, base); + } +} + +// ============================================================================= +// Kernel 4a: Partial reduce — running sum trick per block-range +// +// For b = high down to low: S += B[b]; Total += S +// Result: Total = Σ (b+1)·B[b] (weights = bucket digit = b+1) +// ============================================================================= + +__global__ void __launch_bounds__(256, 2) reduce_buckets_partial_kernel( + const G1EdExtended *__restrict__ buckets, + G1EdExtended *__restrict__ partial_totals, + G1EdExtended *__restrict__ partial_sums, + int num_windows, int num_buckets, int blocks_per_window) { + + int block_flat = blockIdx.x; + int w = block_flat / blocks_per_window; + int part = block_flat % blocks_per_window; + if(w >= num_windows) return; + + int tid = threadIdx.x; + int P = blockDim.x; + + int range_size = (num_buckets + blocks_per_window - 1) / blocks_per_window; + int high = num_buckets - 1 - part * range_size; + if(high < 0) { + if(tid == 0) { + int out_idx = w * blocks_per_window + part; + ec_te_set_identity(partial_totals[out_idx]); + ec_te_set_identity(partial_sums[out_idx]); + } + return; + } + int low = high - range_size + 1; + if(low < 0) low = 0; + int range_len = high - low + 1; + + int chunk_size = (range_len + P - 1) / P; + int chunk_high = high - tid * chunk_size; + int chunk_low = chunk_high - chunk_size + 1; + if(chunk_low < low) chunk_low = low; + if(chunk_high > high) chunk_high = high; + bool has_work = (chunk_high >= low); + + G1EdExtended local_running, local_total; + ec_te_set_identity(local_running); + ec_te_set_identity(local_total); + int local_len = 0; + + if(has_work) { + for(int b = chunk_high; b >= chunk_low; b--) { + ec_te_unified_add(local_running, buckets[w * num_buckets + b]); + ec_te_unified_add(local_total, local_running); + local_len++; + } + } + + // Hillis-Steele inclusive prefix scan of running sums + __shared__ G1EdExtended shared_prefix[REDUCE_THREADS_PER_WINDOW]; + shared_prefix[tid] = local_running; + __syncthreads(); + for(int d = 1; d < P; d <<= 1) { + G1EdExtended tmp; + bool do_add = (tid >= d); + if(do_add) tmp = shared_prefix[tid - d]; + __syncthreads(); + if(do_add) ec_te_unified_add(shared_prefix[tid], tmp); + __syncthreads(); + } + + if(tid == 0) + partial_sums[w * blocks_per_window + part] = shared_prefix[P - 1]; + + // Convert to exclusive prefix + G1EdExtended my_exclusive; + if(tid == 0) ec_te_set_identity(my_exclusive); + else my_exclusive = shared_prefix[tid - 1]; + __syncthreads(); + shared_prefix[tid] = my_exclusive; + __syncthreads(); + + G1EdExtended correction; + ec_te_mul_small(correction, shared_prefix[tid], local_len); + ec_te_unified_add(local_total, correction); + + // Tree reduction of corrected totals + shared_prefix[tid] = local_total; + __syncthreads(); + for(int stride = P / 2; stride > 0; stride >>= 1) { + if(tid < stride) ec_te_unified_add(shared_prefix[tid], shared_prefix[tid + stride]); + __syncthreads(); + } + if(tid == 0) + partial_totals[w * blocks_per_window + part] = shared_prefix[0]; +} + +// ============================================================================= +// Kernel 4b: Finalize — combine partial ranges into one result per window +// ============================================================================= + +__global__ void reduce_buckets_finalize_kernel( + const G1EdExtended *__restrict__ partial_totals, + const G1EdExtended *__restrict__ partial_sums, + G1EdExtended *__restrict__ window_results, + int num_windows, int num_buckets, int blocks_per_window) { + + int w = blockIdx.x; + if(w >= num_windows) return; + int tid = threadIdx.x; + extern __shared__ G1EdExtended smem[]; + + int range_size = (num_buckets + blocks_per_window - 1) / blocks_per_window; + + G1EdExtended my_total, my_sum; + int my_len = 0; + if(tid < blocks_per_window) { + int high = num_buckets - 1 - tid * range_size; + if(high >= 0) { + int low = high - range_size + 1; + if(low < 0) low = 0; + my_len = high - low + 1; + my_total = partial_totals[w * blocks_per_window + tid]; + my_sum = partial_sums[w * blocks_per_window + tid]; + } else { ec_te_set_identity(my_total); ec_te_set_identity(my_sum); } + } else { ec_te_set_identity(my_total); ec_te_set_identity(my_sum); } + + // Exclusive prefix scan of partial_sums + smem[tid] = my_sum; + __syncthreads(); + for(int d = 1; d < FINALIZE_THREADS; d <<= 1) { + G1EdExtended tmp; + bool do_add = (tid >= d && tid < blocks_per_window); + if(do_add) tmp = smem[tid - d]; + __syncthreads(); + if(do_add) ec_te_unified_add(smem[tid], tmp); + __syncthreads(); + } + G1EdExtended my_exclusive; + if(tid == 0) ec_te_set_identity(my_exclusive); + else if(tid < blocks_per_window) my_exclusive = smem[tid - 1]; + else ec_te_set_identity(my_exclusive); + __syncthreads(); + smem[tid] = my_exclusive; + __syncthreads(); + + if(tid < blocks_per_window && my_len > 0) { + G1EdExtended correction; + ec_te_mul_small(correction, smem[tid], my_len); + ec_te_unified_add(my_total, correction); + } + + smem[tid] = my_total; + __syncthreads(); + for(int stride = FINALIZE_THREADS / 2; stride > 0; stride >>= 1) { + if(tid < stride) ec_te_unified_add(smem[tid], smem[tid + stride]); + __syncthreads(); + } + if(tid == 0) { + G1EdExtended result = smem[0]; + ec_te_reduce(result); + window_results[w] = result; + } +} + +// ============================================================================= +// MSM context +// ============================================================================= + +// Phase-timing event indices (cudaEvents bracket the listed phases of +// msm_run_full). Events recorded with cudaEventRecord on the compute stream; +// no per-phase synchronization is added (events are non-blocking on the host). +// After the final cudaStreamSynchronize, cudaEventElapsedTime fills timings. +enum MSMPhase { + PHASE_H2D = 0, // scalar upload + PHASE_BUILD_PAIRS = 1, // signed-digit decomposition + PHASE_SORT = 2, // CUB radix sort + PHASE_BOUNDARIES = 3, // memset + detect_bucket_boundaries + PHASE_ACCUM_SEQ = 4, // accumulate_buckets_kernel (sequential, with cap) + PHASE_ACCUM_PAR = 5, // accumulate_buckets_parallel_kernel (overflow tail) + PHASE_REDUCE_PARTIAL = 6, + PHASE_REDUCE_FINALIZE = 7, + PHASE_D2H = 8, // window results back to host + PHASE_COUNT = 9, +}; + +struct MSMContext { + size_t max_points; + int c, num_windows, num_buckets, sort_key_bits, reduce_blocks_per_window; + + G1EdXY *d_points; + // Optional Short-Weierstrass affine point buffer used by the + // batched-affine accumulate kernel (see GNARK_GPU_MSM_BATCHED_AFFINE). + // nullptr unless gnark_gpu_msm_load_points_sw was called. + G1AffineSW *d_points_sw; + uint64_t *d_scalars; + uint32_t *d_bucket_offsets, *d_bucket_ends, *d_point_indices; + G1EdExtended *d_buckets, *d_window_results, *d_window_accum; + G1EdExtended *d_window_partial_totals, *d_window_partial_sums; + uint32_t *d_keys_in, *d_keys_out, *d_vals_in; + uint32_t *d_overflow_buckets, *d_overflow_count; + void *d_sort_temp; + size_t sort_temp_bytes; + + // Double-buffered pinned staging for overlapped CPU memcpy + GPU DMA + void *h_scalar_staging; // pinned buffer A + void *h_scalar_staging_b; // pinned buffer B + size_t staging_buf_bytes; // per-buffer size in bytes (0 if alloc failed) + + // Optional persistent registration of caller scalar memory. + const void *registered_host_ptr; + size_t registered_host_bytes; + bool host_registered; + + // Per-phase timing events (boundary events: phase i runs between + // phase_event[i] and phase_event[i+1]). + cudaEvent_t phase_event[PHASE_COUNT + 1]; + float phase_timings_ms[PHASE_COUNT]; + bool phase_par_recorded; // accum_par may be skipped (no overflow) + bool phase_events_valid; // false if event creation failed + + // Persistent-buffer mode. When set, msm_run_full keeps work buffers and + // host registration alive across calls. Use for back-to-back MSMs. + // Toggle with msm_pin_buffers / msm_release_buffers. + bool buffers_pinned; +}; + +// ── Lazy work buffer management ── +// +// Sort buffers (d_keys_in/out, d_vals_in, d_point_indices, d_sort_temp, +// d_scalars) dominate MSM VRAM at large n (~49 GiB at n=2^27). They are +// only needed during msm_run_full, so we allocate them lazily before each +// run and free them immediately after. This allows the quotient phase to +// reclaim all that VRAM for working vectors + selector uploads. +// +// At n=2^27 with c=17, lazy alloc/free adds ~5-10ms per MSM call (negligible +// vs 200-1700ms compute). The permanent allocations (points, buckets, window +// results) stay resident. + +cudaError_t msm_alloc_work_buffers(MSMContext *ctx) { + if(ctx->d_keys_in) return cudaSuccess; // already allocated + size_t max_assignments = ctx->max_points * (size_t)ctx->num_windows; + cudaGetLastError(); // clear any prior error + cudaMalloc(&ctx->d_scalars, ctx->max_points * 4 * sizeof(uint64_t)); + cudaMalloc(&ctx->d_keys_in, max_assignments * sizeof(uint32_t)); + cudaMalloc(&ctx->d_keys_out, max_assignments * sizeof(uint32_t)); + cudaMalloc(&ctx->d_vals_in, max_assignments * sizeof(uint32_t)); + cudaMalloc(&ctx->d_point_indices, max_assignments * sizeof(uint32_t)); + cudaMalloc(&ctx->d_sort_temp, ctx->sort_temp_bytes); + cudaError_t err = cudaGetLastError(); + if(err != cudaSuccess) { + // Allocation failed — free whatever was partially allocated and clear error. + auto safe_free = [](auto &p) { if(p) { cudaFree(p); p = nullptr; } }; + safe_free(ctx->d_scalars); + safe_free(ctx->d_keys_in); safe_free(ctx->d_keys_out); + safe_free(ctx->d_vals_in); safe_free(ctx->d_point_indices); + safe_free(ctx->d_sort_temp); + cudaGetLastError(); + } + return err; +} + +void msm_free_work_buffers(MSMContext *ctx) { + auto free = [](auto &p) { if(p) { cudaFree(p); p = nullptr; } }; + free(ctx->d_scalars); + free(ctx->d_keys_in); free(ctx->d_keys_out); + free(ctx->d_vals_in); free(ctx->d_point_indices); + free(ctx->d_sort_temp); +} + +MSMContext *msm_create(size_t max_points) { + cudaGetLastError(); + + MSMContext *ctx = new MSMContext; + memset(ctx, 0, sizeof(MSMContext)); + ctx->max_points = max_points; + ctx->c = compute_optimal_c(max_points); + ctx->num_windows = (MSM_SCALAR_BITS + ctx->c - 1) / ctx->c; + ctx->num_buckets = 1 << (ctx->c - 1); // signed digits halve bucket count + + int total_buckets = ctx->num_windows * ctx->num_buckets; + size_t max_assignments = max_points * (size_t)ctx->num_windows; + + int key_val = total_buckets; + ctx->sort_key_bits = 1; + while((1 << ctx->sort_key_bits) <= key_val) ctx->sort_key_bits++; + + { + int max_bpw = ctx->num_buckets / REDUCE_THREADS_PER_WINDOW; + int target_bpw = 752 / ctx->num_windows; + ctx->reduce_blocks_per_window = max_bpw < target_bpw ? max_bpw : target_bpw; + if(ctx->reduce_blocks_per_window < 1) ctx->reduce_blocks_per_window = 1; + if(ctx->reduce_blocks_per_window > FINALIZE_THREADS) ctx->reduce_blocks_per_window = FINALIZE_THREADS; + } + + int total_partials = ctx->num_windows * ctx->reduce_blocks_per_window; + + // Permanent small allocations (points, buckets, window results). + // Sort buffers are allocated lazily in msm_run_full. + cudaMalloc(&ctx->d_points, max_points * sizeof(G1EdXY)); + cudaMalloc(&ctx->d_bucket_offsets, total_buckets * sizeof(uint32_t)); + cudaMalloc(&ctx->d_bucket_ends, total_buckets * sizeof(uint32_t)); + cudaMalloc(&ctx->d_buckets, total_buckets * sizeof(G1EdExtended)); + cudaMalloc(&ctx->d_window_results, ctx->num_windows * sizeof(G1EdExtended)); + cudaMalloc(&ctx->d_window_accum, ctx->num_windows * sizeof(G1EdExtended)); + cudaMalloc(&ctx->d_window_partial_totals, total_partials * sizeof(G1EdExtended)); + cudaMalloc(&ctx->d_window_partial_sums, total_partials * sizeof(G1EdExtended)); + cudaMalloc(&ctx->d_overflow_buckets, total_buckets * sizeof(uint32_t)); + cudaMalloc(&ctx->d_overflow_count, sizeof(uint32_t)); + + // Query CUB sort temp size (no allocation, just the size query). + ctx->sort_temp_bytes = 0; + cub::DeviceRadixSort::SortPairs(nullptr, ctx->sort_temp_bytes, + (uint32_t *)nullptr, (uint32_t *)nullptr, (uint32_t *)nullptr, + (uint32_t *)nullptr, max_assignments, 0, ctx->sort_key_bits); + + // Also check chunk-sized sort temp (CUB may need more for smaller inputs). + static constexpr size_t STAGING_CAP = 256ULL * 1024 * 1024; + size_t total_scalar_bytes = max_points * 4 * sizeof(uint64_t); + size_t per_buf = total_scalar_bytes / 2; + if(per_buf > STAGING_CAP) per_buf = STAGING_CAP; + if(per_buf < 32) per_buf = 32; + ctx->staging_buf_bytes = per_buf; + + size_t chunk_size = per_buf / (4 * sizeof(uint64_t)); + if(chunk_size > 0 && chunk_size < max_points) { + size_t chunk_assignments = chunk_size * (size_t)ctx->num_windows; + size_t chunk_sort_temp = 0; + cub::DeviceRadixSort::SortPairs(nullptr, chunk_sort_temp, + (uint32_t *)nullptr, (uint32_t *)nullptr, (uint32_t *)nullptr, + (uint32_t *)nullptr, chunk_assignments, 0, ctx->sort_key_bits); + if(chunk_sort_temp > ctx->sort_temp_bytes) + ctx->sort_temp_bytes = chunk_sort_temp; + } + + // Double-buffered pinned staging for scalar upload. + cudaError_t err_a = cudaHostAlloc(&ctx->h_scalar_staging, per_buf, cudaHostAllocDefault); + cudaError_t err_b = cudaHostAlloc(&ctx->h_scalar_staging_b, per_buf, cudaHostAllocDefault); + if(err_a != cudaSuccess || err_b != cudaSuccess) { + if(ctx->h_scalar_staging) { cudaFreeHost(ctx->h_scalar_staging); ctx->h_scalar_staging = nullptr; } + if(ctx->h_scalar_staging_b) { cudaFreeHost(ctx->h_scalar_staging_b); ctx->h_scalar_staging_b = nullptr; } + ctx->staging_buf_bytes = 0; + } + + // Per-phase timing events (cudaEventDefault — record host timestamps). + ctx->phase_events_valid = true; + for(int i = 0; i <= PHASE_COUNT; i++) { + if(cudaEventCreate(&ctx->phase_event[i]) != cudaSuccess) { + ctx->phase_events_valid = false; + break; + } + } + if(!ctx->phase_events_valid) { + for(int i = 0; i <= PHASE_COUNT; i++) { + if(ctx->phase_event[i]) { + cudaEventDestroy(ctx->phase_event[i]); + ctx->phase_event[i] = nullptr; + } + } + } + for(int i = 0; i < PHASE_COUNT; i++) ctx->phase_timings_ms[i] = 0.0f; + ctx->phase_par_recorded = false; + + return ctx; +} + +void msm_destroy(MSMContext *ctx) { + if(!ctx) return; + cudaFree(ctx->d_points); cudaFree(ctx->d_scalars); + if(ctx->d_points_sw) cudaFree(ctx->d_points_sw); + cudaFree(ctx->d_bucket_offsets); cudaFree(ctx->d_bucket_ends); + cudaFree(ctx->d_point_indices); cudaFree(ctx->d_buckets); + cudaFree(ctx->d_window_results); cudaFree(ctx->d_window_accum); + cudaFree(ctx->d_window_partial_totals); cudaFree(ctx->d_window_partial_sums); + cudaFree(ctx->d_keys_in); cudaFree(ctx->d_keys_out); cudaFree(ctx->d_vals_in); + cudaFree(ctx->d_overflow_buckets); cudaFree(ctx->d_overflow_count); + cudaFree(ctx->d_sort_temp); + if(ctx->h_scalar_staging) cudaFreeHost(ctx->h_scalar_staging); + if(ctx->h_scalar_staging_b) cudaFreeHost(ctx->h_scalar_staging_b); + if(ctx->host_registered && ctx->registered_host_ptr) cudaHostUnregister((void *)ctx->registered_host_ptr); + if(ctx->phase_events_valid) { + for(int i = 0; i <= PHASE_COUNT; i++) { + if(ctx->phase_event[i]) cudaEventDestroy(ctx->phase_event[i]); + } + } + delete ctx; +} + +void msm_load_points(MSMContext *ctx, const void *host_points, size_t count, cudaStream_t stream) { + cudaMemcpyAsync(ctx->d_points, host_points, count * sizeof(G1EdXY), cudaMemcpyHostToDevice, stream); +} +void msm_offload_points(MSMContext *ctx) { + if(ctx->d_points) { cudaFree(ctx->d_points); ctx->d_points = nullptr; } +} +void msm_unregister_host(MSMContext *ctx) { + if(ctx->host_registered && ctx->registered_host_ptr) { + cudaHostUnregister((void *)ctx->registered_host_ptr); + cudaGetLastError(); // clear non-sticky error from failed unregister + ctx->host_registered = false; + ctx->registered_host_ptr = nullptr; + ctx->registered_host_bytes = 0; + } +} +cudaError_t msm_reload_points(MSMContext *ctx, const void *host_points, size_t count, cudaStream_t stream) { + cudaError_t err = cudaMalloc(&ctx->d_points, count * sizeof(G1EdXY)); + if(err != cudaSuccess) return err; + cudaMemcpyAsync(ctx->d_points, host_points, count * sizeof(G1EdXY), cudaMemcpyHostToDevice, stream); + return cudaSuccess; +} + +// Upload SW affine points into the optional d_points_sw buffer used by the +// batched-affine accumulate kernel. May be called any time after msm_create. +cudaError_t msm_load_points_sw(MSMContext *ctx, const void *host_sw_points, size_t count, cudaStream_t stream) { + if(!ctx) return cudaErrorInvalidValue; + if(!ctx->d_points_sw) { + cudaError_t err = cudaMalloc(&ctx->d_points_sw, ctx->max_points * sizeof(G1AffineSW)); + if(err != cudaSuccess) return err; + } + if(count > ctx->max_points) count = ctx->max_points; + cudaMemcpyAsync(ctx->d_points_sw, host_sw_points, + count * sizeof(G1AffineSW), cudaMemcpyHostToDevice, stream); + return cudaSuccess; +} +void msm_upload_scalars(MSMContext *ctx, const uint64_t *host_scalars, size_t n, cudaStream_t stream) { + size_t bytes = n * 4 * sizeof(uint64_t); + if(ctx->h_scalar_staging && ctx->h_scalar_staging_b && bytes <= 2 * ctx->staging_buf_bytes) { + // Double-buffered: overlap CPU memcpy with GPU DMA. + size_t half = bytes / 2; + const char *src = (const char *)host_scalars; + + memcpy(ctx->h_scalar_staging, src, half); + cudaMemcpyAsync(ctx->d_scalars, ctx->h_scalar_staging, half, cudaMemcpyHostToDevice, stream); + memcpy(ctx->h_scalar_staging_b, src + half, bytes - half); + cudaMemcpyAsync((char *)ctx->d_scalars + half, ctx->h_scalar_staging_b, bytes - half, cudaMemcpyHostToDevice, stream); + } else { + cudaMemcpyAsync(ctx->d_scalars, host_scalars, bytes, cudaMemcpyHostToDevice, stream); + } +} + +// Helper: record phase event on stream if event tracking is enabled. +static inline void msm_record_event(MSMContext *ctx, int idx, cudaStream_t stream) { + if(ctx->phase_events_valid) cudaEventRecord(ctx->phase_event[idx], stream); +} + +// ============================================================================= +// Batched-affine bucket accumulation (one block per bucket). +// +// Algorithm: pairwise reduction in SW affine, with block-local Montgomery +// batched inversion. log₂(B) waves; each wave halves the active count. +// +// Phases per wave: +// 1. Each thread t ∈ [0, B/2) computes Δx[t] = x_{2t+1} − x_{2t}. +// 2. Lane 0 sequentially walks the Montgomery batched-invert protocol +// (forward prefix product, single fp_inv on the global product, backward +// scan). All other lanes are idle here — fine for an MVP since fp_inv +// and ~B muls are dwarfed by the per-thread serial chain savings of +// log₂(B) vs B in the full reduction. +// 3. Each thread t ∈ [0, B/2) applies pair add using inv_dx[t]. +// 4. If active is odd, the last point is passed through unchanged. +// +// Limitations of this MVP: +// - Fixed shared-mem layout: B ≤ MAX_BATCHED_AFFINE_B (256). Buckets larger +// than this are not handled; caller must dispatch to a fallback. +// - Single-lane invert is the obvious bottleneck. Phase 2 will become a +// parallel block-scan once the kernel proves the algorithmic win. +// - Special cases (Δx=0 → P+P or P+(−P)) are not handled. For random-scalar +// MSM with sorted-by-bucket pairs, two distinct point indices in the same +// bucket have ~zero probability of identical x-coordinate. +// +// Output is per-bucket sum in G1EdExtended for compatibility with the +// existing reduce phase (the SW→TE conversion is done once per bucket at the +// end of the kernel — amortized over all the adds that produced this point). +// ============================================================================= + +static constexpr int MAX_BATCHED_AFFINE_B = 256; +static constexpr int MAX_BATCHED_AFFINE_HALF = MAX_BATCHED_AFFINE_B / 2; + +__global__ void __launch_bounds__(MAX_BATCHED_AFFINE_B, 1) + accumulate_buckets_batched_affine_kernel( + const G1AffineSW *__restrict__ points, + const uint32_t *__restrict__ point_indices, + const uint32_t *__restrict__ bucket_offsets, + const uint32_t *__restrict__ bucket_ends, + G1EdExtended *__restrict__ buckets, + int total_buckets) { + + int bucket_flat = blockIdx.x; + if(bucket_flat >= total_buckets) return; + + uint32_t start = bucket_offsets[bucket_flat]; + uint32_t end = bucket_ends[bucket_flat]; + int B = (int)(end - start); + int tid = threadIdx.x; + + // Empty bucket: write identity and return. + if(B == 0) { + if(tid == 0) ec_te_set_identity(buckets[bucket_flat]); + return; + } + + // Truncate to MVP bound. (Caller is responsible for falling back on + // any bucket larger than MAX_BATCHED_AFFINE_B.) + if(B > MAX_BATCHED_AFFINE_B) B = MAX_BATCHED_AFFINE_B; + + __shared__ G1AffineSW pts[MAX_BATCHED_AFFINE_B]; + __shared__ uint64_t dx_orig[MAX_BATCHED_AFFINE_HALF * 6]; + __shared__ uint64_t prefix [MAX_BATCHED_AFFINE_HALF * 6]; + __shared__ uint64_t inv_dx [MAX_BATCHED_AFFINE_HALF * 6]; + + // Phase A: parallel point load with conditional negation. + if(tid < B) { + uint32_t packed = point_indices[start + tid]; + G1AffineSW p = points[packed & 0x7FFFFFFFu]; + g1sw_cnegate(p, (bool)(packed >> 31)); + pts[tid] = p; + } + __syncthreads(); + + // Phase B: pairwise reduction waves. + int active = B; + while(active > 1) { + int half = active >> 1; // #pairs + bool oddTail = (active & 1) != 0; // last element passes through + + // B.1: each thread t in [0, half) computes Δx[t]. + if(tid < half) { + uint64_t dx[6]; + fp_sub(dx, pts[2*tid + 1].x, pts[2*tid].x); + #pragma unroll + for(int k = 0; k < 6; k++) dx_orig[tid*6 + k] = dx[k]; + } + __syncthreads(); + + // B.2: lane 0 does Montgomery batched-invert. + if(tid == 0 && half > 0) { + // Forward scan: prefix[i] = Δx[0] · Δx[1] · … · Δx[i] + uint64_t running[6]; + #pragma unroll + for(int k = 0; k < 6; k++) running[k] = dx_orig[k]; + #pragma unroll + for(int k = 0; k < 6; k++) prefix[k] = running[k]; + + for(int i = 1; i < half; i++) { + uint64_t cur[6]; + #pragma unroll + for(int k = 0; k < 6; k++) cur[k] = dx_orig[i*6 + k]; + uint64_t next[6]; + fp_mul(next, running, cur); + #pragma unroll + for(int k = 0; k < 6; k++) running[k] = next[k]; + #pragma unroll + for(int k = 0; k < 6; k++) prefix[i*6 + k] = next[k]; + } + + // Single inversion of the global product. + uint64_t inv_total[6]; + fp_inv(inv_total, running); + + // Backward scan: extract individual inverses. + // inv[i] = inv_total · prefix[i-1] (for i > 0) + // inv_total ← inv_total · Δx[i] + // inv[0] = the final inv_total + uint64_t r[6]; + #pragma unroll + for(int k = 0; k < 6; k++) r[k] = inv_total[k]; + + for(int i = half - 1; i > 0; i--) { + uint64_t prev[6]; + #pragma unroll + for(int k = 0; k < 6; k++) prev[k] = prefix[(i-1)*6 + k]; + uint64_t inv_i[6]; + fp_mul(inv_i, r, prev); + #pragma unroll + for(int k = 0; k < 6; k++) inv_dx[i*6 + k] = inv_i[k]; + + uint64_t cur_dx[6]; + #pragma unroll + for(int k = 0; k < 6; k++) cur_dx[k] = dx_orig[i*6 + k]; + uint64_t new_r[6]; + fp_mul(new_r, r, cur_dx); + #pragma unroll + for(int k = 0; k < 6; k++) r[k] = new_r[k]; + } + #pragma unroll + for(int k = 0; k < 6; k++) inv_dx[k] = r[k]; + } + __syncthreads(); + + // B.3 + B.4: read inputs into registers FIRST, sync, then write. + // Race fix: passthrough writes pts[half], which pair-add at + // tid=half/2 reads as pts[2*(half/2)] = pts[half]. Without separating + // read from write with a sync, different warps could read/write the + // same slot concurrently. + G1AffineSW lhs, rhs, last; + bool doPair = (tid < half); + bool doPass = (oddTail && tid == half); + if(doPair) { + lhs = pts[2*tid]; + rhs = pts[2*tid + 1]; + } + if(doPass) { + last = pts[active - 1]; + } + __syncthreads(); + + if(doPair) { + uint64_t inv[6]; + #pragma unroll + for(int k = 0; k < 6; k++) inv[k] = inv_dx[tid*6 + k]; + G1AffineSW out; + g1sw_pair_add_with_inv_dx(out, lhs, rhs, inv); + pts[tid] = out; + } + if(doPass) { + pts[half] = last; + } + __syncthreads(); + + active = (active + 1) >> 1; + } + + // Phase C: convert single result to TE extended. + if(tid == 0) { + G1EdExtended out; + g1sw_to_te_extended(out, pts[0]); + buckets[bucket_flat] = out; + } +} + +void launch_msm(MSMContext *ctx, size_t n, cudaStream_t stream) { + constexpr unsigned threads = 256; + int total_buckets = ctx->num_windows * ctx->num_buckets; + size_t assignments = n * (size_t)ctx->num_windows; + + // Build pairs + { + unsigned blocks = ((unsigned)n + threads - 1) / threads; + build_pairs_kernel<<>>( + ctx->d_scalars, ctx->d_keys_in, ctx->d_vals_in, n, + ctx->c, ctx->num_windows, ctx->num_buckets, total_buckets, 0); + } + msm_record_event(ctx, 2, stream); + + // Radix sort + size_t sort_bytes = ctx->sort_temp_bytes; + cub::DeviceRadixSort::SortPairs(ctx->d_sort_temp, sort_bytes, + ctx->d_keys_in, ctx->d_keys_out, ctx->d_vals_in, ctx->d_point_indices, + assignments, 0, ctx->sort_key_bits, stream); + msm_record_event(ctx, 3, stream); + + cudaMemsetAsync(ctx->d_bucket_offsets, 0, total_buckets * sizeof(uint32_t), stream); + cudaMemsetAsync(ctx->d_bucket_ends, 0, total_buckets * sizeof(uint32_t), stream); + + // Detect boundaries + { + unsigned blocks = (unsigned)((assignments + threads - 1) / threads); + detect_bucket_boundaries_kernel<<>>( + ctx->d_keys_out, ctx->d_bucket_offsets, ctx->d_bucket_ends, + assignments, total_buckets); + } + msm_record_event(ctx, 4, stream); + + // Accumulate (two-phase: sequential with cap, then parallel for overflow) + // + // Dynamic cap: max(ACCUM_SEQ_CAP, 2·avg + 64). For uniform scalars, the + // cap exceeds the max bucket size (Poisson tail), so phase 1 handles + // everything. For concentrated scalars (bucket size >> cap), phase 2 + // distributes the tail across 128 threads. + ctx->phase_par_recorded = false; + { + size_t avg = assignments / (size_t)total_buckets; + int cap = (int)(2 * avg + 64); + if(cap < ACCUM_SEQ_CAP) cap = ACCUM_SEQ_CAP; + if(cap > 4096) cap = 4096; + + unsigned seq_blocks = ((unsigned)total_buckets + threads - 1) / threads; + + // Optional path: batched-affine accumulation (one block per bucket). + // Requires SW affine points to have been uploaded via msm_load_points_sw. + // Skips the seq + parallel-tail kernels entirely when active. + if(batched_affine_enabled() && ctx->d_points_sw != nullptr) { + constexpr int BAA_BLOCK = MAX_BATCHED_AFFINE_B; + accumulate_buckets_batched_affine_kernel<<>>( + ctx->d_points_sw, ctx->d_point_indices, + ctx->d_bucket_offsets, ctx->d_bucket_ends, + ctx->d_buckets, total_buckets); + msm_record_event(ctx, 5, stream); + // No accum_par phase recorded — leave phase_par_recorded = false. + } else if(compact_overflow_buckets(n)) { + // Phase 1: Sequential — each thread handles min(bucket_size, cap). + cudaMemsetAsync(ctx->d_overflow_count, 0, sizeof(uint32_t), stream); + accumulate_buckets_kernel<<>>( + ctx->d_points, ctx->d_point_indices, + ctx->d_bucket_offsets, ctx->d_bucket_ends, ctx->d_buckets, + total_buckets, false, cap, ctx->d_overflow_buckets, ctx->d_overflow_count); + msm_record_event(ctx, 5, stream); + + // Phase 2: Parallel tree-reduce only buckets that exceeded the cap. + // Random proving scalars normally produce no overflow buckets; compacting + // avoids launching one empty 128-thread block for every bucket. + uint32_t overflow_count = 0; + cudaMemcpyAsync(&overflow_count, ctx->d_overflow_count, sizeof(uint32_t), + cudaMemcpyDeviceToHost, stream); + cudaStreamSynchronize(stream); + if(overflow_count > 0) { + size_t smem = ACCUM_PARALLEL_THREADS * sizeof(G1EdExtended); + accumulate_buckets_parallel_kernel<<>>( + ctx->d_points, ctx->d_point_indices, + ctx->d_bucket_offsets, ctx->d_bucket_ends, ctx->d_overflow_buckets, + ctx->d_buckets, true, (uint32_t)cap); + msm_record_event(ctx, 6, stream); + ctx->phase_par_recorded = true; + } + } else { + accumulate_buckets_kernel<<>>( + ctx->d_points, ctx->d_point_indices, + ctx->d_bucket_offsets, ctx->d_bucket_ends, ctx->d_buckets, + total_buckets, false, cap, nullptr, nullptr); + msm_record_event(ctx, 5, stream); + + size_t smem = ACCUM_PARALLEL_THREADS * sizeof(G1EdExtended); + accumulate_buckets_parallel_kernel<<>>( + ctx->d_points, ctx->d_point_indices, + ctx->d_bucket_offsets, ctx->d_bucket_ends, nullptr, + ctx->d_buckets, true, (uint32_t)cap); + msm_record_event(ctx, 6, stream); + ctx->phase_par_recorded = true; + } + } + + // Reduce + { + int bpw = ctx->reduce_blocks_per_window; + reduce_buckets_partial_kernel<<num_windows * bpw, REDUCE_THREADS_PER_WINDOW, 0, stream>>>( + ctx->d_buckets, ctx->d_window_partial_totals, ctx->d_window_partial_sums, + ctx->num_windows, ctx->num_buckets, bpw); + msm_record_event(ctx, 7, stream); + + size_t smem = FINALIZE_THREADS * sizeof(G1EdExtended); + reduce_buckets_finalize_kernel<<num_windows, FINALIZE_THREADS, smem, stream>>>( + ctx->d_window_partial_totals, ctx->d_window_partial_sums, ctx->d_window_results, + ctx->num_windows, ctx->num_buckets, bpw); + msm_record_event(ctx, 8, stream); + } +} + +void msm_download_results(MSMContext *ctx, G1EdExtended *host_results, cudaStream_t stream) { + cudaMemcpyAsync(host_results, ctx->d_window_results, + ctx->num_windows * sizeof(G1EdExtended), cudaMemcpyDeviceToHost, stream); +} + +// ============================================================================= +// Full MSM pipeline: fast scalar upload + single-pass compute +// +// For large n, uses cudaHostRegister to pin the caller's (Go heap) memory +// in-place, enabling full-bandwidth DMA without CPU-side memcpy through +// staging buffers. This avoids CUDA's internal pageable→pinned staging +// which is the main transfer bottleneck. +// +// Fallback: staging buffers for small n, or if registration fails. +// ============================================================================= + +cudaError_t msm_run_full(MSMContext *ctx, const uint64_t *host_scalars, size_t n, + G1EdExtended *host_results, cudaStream_t compute_stream) { + + // Lazy alloc sort buffers (d_scalars, d_keys, d_sort_temp, etc.) + cudaError_t alloc_err = msm_alloc_work_buffers(ctx); + if(alloc_err != cudaSuccess) return alloc_err; + + size_t total_bytes = n * 4 * sizeof(uint64_t); + const size_t register_threshold = host_register_threshold_points(); + + // For large n, try to pin the caller's memory for fast DMA + bool registered = false; + if(register_threshold > 0 && n >= register_threshold) { + if(ctx->host_registered) { + const bool pointer_changed = (ctx->registered_host_ptr != host_scalars); + const bool need_larger_span = !pointer_changed && + (ctx->registered_host_bytes < total_bytes); + if(pointer_changed || need_larger_span) { + cudaHostUnregister((void *)ctx->registered_host_ptr); + ctx->host_registered = false; + ctx->registered_host_ptr = nullptr; + ctx->registered_host_bytes = 0; + } + } + if(!ctx->host_registered) { + cudaError_t reg_err = cudaHostRegister( + (void *)host_scalars, total_bytes, cudaHostRegisterDefault); + if(reg_err == cudaSuccess) { + ctx->host_registered = true; + ctx->registered_host_ptr = host_scalars; + ctx->registered_host_bytes = total_bytes; + } else { + cudaGetLastError(); + } + } + registered = ctx->host_registered; + } + + msm_record_event(ctx, 0, compute_stream); // start of H2D + if(registered) { + cudaMemcpyAsync(ctx->d_scalars, host_scalars, total_bytes, + cudaMemcpyHostToDevice, compute_stream); + } else { + msm_upload_scalars(ctx, host_scalars, n, compute_stream); + } + msm_record_event(ctx, 1, compute_stream); // end of H2D = start of build_pairs + + launch_msm(ctx, n, compute_stream); + + msm_download_results(ctx, host_results, compute_stream); + msm_record_event(ctx, 9, compute_stream); // end of D2H + + // Sync before freeing work buffers (kernels must finish using them). + cudaError_t sync_err = cudaStreamSynchronize(compute_stream); + + // Compute per-phase elapsed times. Skipped accum_par phase reads back as 0. + if(sync_err == cudaSuccess && ctx->phase_events_valid) { + auto elapsed = [&](int from, int to) -> float { + float ms = 0.0f; + if(cudaEventElapsedTime(&ms, ctx->phase_event[from], ctx->phase_event[to]) + != cudaSuccess) { + cudaGetLastError(); + ms = 0.0f; + } + return ms; + }; + ctx->phase_timings_ms[PHASE_H2D] = elapsed(0, 1); + ctx->phase_timings_ms[PHASE_BUILD_PAIRS] = elapsed(1, 2); + ctx->phase_timings_ms[PHASE_SORT] = elapsed(2, 3); + ctx->phase_timings_ms[PHASE_BOUNDARIES] = elapsed(3, 4); + ctx->phase_timings_ms[PHASE_ACCUM_SEQ] = elapsed(4, 5); + ctx->phase_timings_ms[PHASE_ACCUM_PAR] = + ctx->phase_par_recorded ? elapsed(5, 6) : 0.0f; + // reduce_partial spans event[6] (post-accum_par or post-accum_seq) to event[7] + int reduce_start = ctx->phase_par_recorded ? 6 : 5; + ctx->phase_timings_ms[PHASE_REDUCE_PARTIAL] = elapsed(reduce_start, 7); + ctx->phase_timings_ms[PHASE_REDUCE_FINALIZE] = elapsed(7, 8); + ctx->phase_timings_ms[PHASE_D2H] = elapsed(8, 9); + } + + // In pinned mode, keep work buffers and host registration alive so + // back-to-back MSMs amortize ~5–10 ms of cudaMalloc/Free + register/ + // unregister overhead. Caller releases via msm_release_buffers (typically + // before the quotient phase that needs the VRAM back). + if(!ctx->buffers_pinned) { + // Unregister host memory before freeing sort buffers. + msm_unregister_host(ctx); + + // Free sort buffers to reclaim VRAM for other phases. + msm_free_work_buffers(ctx); + } + + return sync_err; +} + +// Pin work buffers (keep allocations and host registration across calls). +void msm_pin_buffers(MSMContext *ctx) { + if(ctx) ctx->buffers_pinned = true; +} + +// Release pinned work buffers immediately (frees VRAM, drops host registration). +// Subsequent msm_run_full calls will re-allocate lazily. +void msm_release_buffers(MSMContext *ctx) { + if(!ctx) return; + ctx->buffers_pinned = false; + msm_unregister_host(ctx); + msm_free_work_buffers(ctx); +} + +int msm_get_c(MSMContext *ctx) { return ctx->c; } +int msm_get_num_windows(MSMContext *ctx) { return ctx->num_windows; } + +// ============================================================================= +// Test kernels for SW affine primitives (validation against host reference). +// ============================================================================= + +// Compute out = p0 + p1 in SW affine using fp_inv to recover 1/(x1-x0) and +// then the unified pair-add formula. Inputs and outputs use the gnark +// G1Affine memory layout (12 uint64 limbs in Montgomery form: x[0..6], y[0..6]). +__global__ void test_sw_pair_add_kernel( + const uint64_t *p0_xy, + const uint64_t *p1_xy, + uint64_t *out_xy) { + + if(threadIdx.x != 0) return; + + G1AffineSW p0, p1; + for(int i = 0; i < 6; i++) p0.x[i] = p0_xy[i]; + for(int i = 0; i < 6; i++) p0.y[i] = p0_xy[6 + i]; + for(int i = 0; i < 6; i++) p1.x[i] = p1_xy[i]; + for(int i = 0; i < 6; i++) p1.y[i] = p1_xy[6 + i]; + + uint64_t dx[6], inv_dx[6]; + fp_sub(dx, p1.x, p0.x); + fp_inv(inv_dx, dx); + + G1AffineSW out; + g1sw_pair_add_with_inv_dx(out, p0, p1, inv_dx); + + for(int i = 0; i < 6; i++) out_xy[i] = out.x[i]; + for(int i = 0; i < 6; i++) out_xy[6 + i] = out.y[i]; +} + +// Compute out_te = SW→TE forward mapping of p_sw. Output is G1EdExtended +// (X, Y, T, Z) — 24 uint64 limbs in Montgomery form. +__global__ void test_sw_to_te_kernel( + const uint64_t *p_sw_xy, + uint64_t *out_te_xytz) { + + if(threadIdx.x != 0) return; + + G1AffineSW p; + for(int i = 0; i < 6; i++) p.x[i] = p_sw_xy[i]; + for(int i = 0; i < 6; i++) p.y[i] = p_sw_xy[6 + i]; + + G1EdExtended out; + g1sw_to_te_extended(out, p); + + for(int i = 0; i < 6; i++) out_te_xytz[i] = out.x[i]; + for(int i = 0; i < 6; i++) out_te_xytz[6 + i] = out.y[i]; + for(int i = 0; i < 6; i++) out_te_xytz[12 + i] = out.t[i]; + for(int i = 0; i < 6; i++) out_te_xytz[18 + i] = out.z[i]; +} + +cudaError_t test_sw_pair_add_run(const uint64_t *p0, const uint64_t *p1, uint64_t *out) { + uint64_t *d_p0, *d_p1, *d_out; + cudaMalloc(&d_p0, 12 * sizeof(uint64_t)); + cudaMalloc(&d_p1, 12 * sizeof(uint64_t)); + cudaMalloc(&d_out, 12 * sizeof(uint64_t)); + cudaMemcpy(d_p0, p0, 12 * sizeof(uint64_t), cudaMemcpyHostToDevice); + cudaMemcpy(d_p1, p1, 12 * sizeof(uint64_t), cudaMemcpyHostToDevice); + test_sw_pair_add_kernel<<<1, 32>>>(d_p0, d_p1, d_out); + cudaError_t err = cudaDeviceSynchronize(); + cudaMemcpy(out, d_out, 12 * sizeof(uint64_t), cudaMemcpyDeviceToHost); + cudaFree(d_p0); cudaFree(d_p1); cudaFree(d_out); + return err; +} + +// Test kernel: reduce N affine SW points via the same batched-affine +// pairwise reduction used in the bucket accumulator. Returns the final +// SW affine sum (one point) — no TE conversion. Used to isolate bugs in +// the multi-wave reduction logic. +__global__ void test_batched_affine_reduce_kernel( + const uint64_t *points_aos, // N × 12 uint64s, AoS (x[0..6], y[0..6]) + uint64_t *out_aos, // 12 uint64s SW affine + int N) { + + if(blockIdx.x != 0) return; + int tid = threadIdx.x; + + __shared__ G1AffineSW pts[MAX_BATCHED_AFFINE_B]; + __shared__ uint64_t dx_orig[MAX_BATCHED_AFFINE_HALF * 6]; + __shared__ uint64_t prefix [MAX_BATCHED_AFFINE_HALF * 6]; + __shared__ uint64_t inv_dx [MAX_BATCHED_AFFINE_HALF * 6]; + + // Load N points (no negation). + if(tid < N) { + G1AffineSW p; + for(int k = 0; k < 6; k++) p.x[k] = points_aos[tid * 12 + k]; + for(int k = 0; k < 6; k++) p.y[k] = points_aos[tid * 12 + 6 + k]; + pts[tid] = p; + } + __syncthreads(); + + int active = N; + while(active > 1) { + int half = active >> 1; + bool oddTail = (active & 1) != 0; + + if(tid < half) { + uint64_t dx[6]; + fp_sub(dx, pts[2*tid + 1].x, pts[2*tid].x); + for(int k = 0; k < 6; k++) dx_orig[tid*6 + k] = dx[k]; + } + __syncthreads(); + + if(tid == 0 && half > 0) { + uint64_t running[6]; + for(int k = 0; k < 6; k++) running[k] = dx_orig[k]; + for(int k = 0; k < 6; k++) prefix[k] = running[k]; + for(int i = 1; i < half; i++) { + uint64_t cur[6]; + for(int k = 0; k < 6; k++) cur[k] = dx_orig[i*6 + k]; + uint64_t next[6]; + fp_mul(next, running, cur); + for(int k = 0; k < 6; k++) running[k] = next[k]; + for(int k = 0; k < 6; k++) prefix[i*6 + k] = next[k]; + } + uint64_t inv_total[6]; + fp_inv(inv_total, running); + uint64_t r[6]; + for(int k = 0; k < 6; k++) r[k] = inv_total[k]; + for(int i = half - 1; i > 0; i--) { + uint64_t prev[6]; + for(int k = 0; k < 6; k++) prev[k] = prefix[(i-1)*6 + k]; + uint64_t inv_i[6]; + fp_mul(inv_i, r, prev); + for(int k = 0; k < 6; k++) inv_dx[i*6 + k] = inv_i[k]; + uint64_t cur_dx[6]; + for(int k = 0; k < 6; k++) cur_dx[k] = dx_orig[i*6 + k]; + uint64_t new_r[6]; + fp_mul(new_r, r, cur_dx); + for(int k = 0; k < 6; k++) r[k] = new_r[k]; + } + for(int k = 0; k < 6; k++) inv_dx[k] = r[k]; + } + __syncthreads(); + + // Race-safe: read both pair-add inputs and passthrough source into + // registers BEFORE any thread writes. See accumulate_buckets_batched_ + // _affine_kernel for the rationale. + G1AffineSW lhs, rhs, last; + bool doPair = (tid < half); + bool doPass = (oddTail && tid == half); + if(doPair) { + lhs = pts[2*tid]; + rhs = pts[2*tid + 1]; + } + if(doPass) { + last = pts[active - 1]; + } + __syncthreads(); + + if(doPair) { + uint64_t inv[6]; + for(int k = 0; k < 6; k++) inv[k] = inv_dx[tid*6 + k]; + G1AffineSW out; + g1sw_pair_add_with_inv_dx(out, lhs, rhs, inv); + pts[tid] = out; + } + if(doPass) { + pts[half] = last; + } + __syncthreads(); + + active = (active + 1) >> 1; + } + + if(tid == 0) { + for(int k = 0; k < 6; k++) out_aos[k] = pts[0].x[k]; + for(int k = 0; k < 6; k++) out_aos[6 + k] = pts[0].y[k]; + } +} + +cudaError_t test_batched_affine_reduce_run(const uint64_t *points_aos, uint64_t *out_aos, int N) { + uint64_t *d_in, *d_out; + cudaMalloc(&d_in, (size_t)N * 12 * sizeof(uint64_t)); + cudaMalloc(&d_out, 12 * sizeof(uint64_t)); + cudaMemcpy(d_in, points_aos, (size_t)N * 12 * sizeof(uint64_t), cudaMemcpyHostToDevice); + test_batched_affine_reduce_kernel<<<1, MAX_BATCHED_AFFINE_B>>>(d_in, d_out, N); + cudaError_t err = cudaDeviceSynchronize(); + cudaMemcpy(out_aos, d_out, 12 * sizeof(uint64_t), cudaMemcpyDeviceToHost); + cudaFree(d_in); cudaFree(d_out); + return err; +} + +cudaError_t test_sw_to_te_run(const uint64_t *p_sw, uint64_t *out_te) { + uint64_t *d_in, *d_out; + cudaMalloc(&d_in, 12 * sizeof(uint64_t)); + cudaMalloc(&d_out, 24 * sizeof(uint64_t)); + cudaMemcpy(d_in, p_sw, 12 * sizeof(uint64_t), cudaMemcpyHostToDevice); + test_sw_to_te_kernel<<<1, 32>>>(d_in, d_out); + cudaError_t err = cudaDeviceSynchronize(); + cudaMemcpy(out_te, d_out, 24 * sizeof(uint64_t), cudaMemcpyDeviceToHost); + cudaFree(d_in); cudaFree(d_out); + return err; +} + +// Copy the last-run per-phase timings into out (length = PHASE_COUNT, 9 floats). +// Returns the number of phases written (always PHASE_COUNT when valid). +int msm_get_phase_timings(MSMContext *ctx, float *out) { + if(!ctx || !out) return 0; + for(int i = 0; i < PHASE_COUNT; i++) out[i] = ctx->phase_timings_ms[i]; + return PHASE_COUNT; +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/ntt.cu b/prover/gpu/cuda/src/plonk/ntt.cu new file mode 100644 index 00000000000..f382b39601c --- /dev/null +++ b/prover/gpu/cuda/src/plonk/ntt.cu @@ -0,0 +1,1212 @@ +// ═══════════════════════════════════════════════════════════════════════════════ +// NTT (Number Theoretic Transform) for BLS12-377 scalar field Fr +// +// The NTT is the finite-field analog of the FFT, computing: +// Forward: Ŷ[k] = Σᵢ Y[i] · ωⁱᵏ (evaluation at roots of unity) +// Inverse: Y[i] = (1/n) Σₖ Ŷ[k] · ω⁻ⁱᵏ (interpolation) +// +// where ω is a primitive n-th root of unity in Fr (BLS12-377 scalar field). +// +// ┌─────────────────────────────────────────────────────────────────────────┐ +// │ DIF (Decimation-In-Frequency) Butterfly s = stage (0 = first) │ +// │ │ +// │ a ──────┬────── a' = a + b │ +// │ │ │ +// │ × ω │ +// │ │ │ +// │ b ──────┴────── b' = (a - b) · ω │ +// │ │ +// │ Natural input → bit-reversed output │ +// │ Stages: s = 0 (largest groups) down to s = log₂(n)-1 (pairs) │ +// ├─────────────────────────────────────────────────────────────────────────┤ +// │ DIT (Decimation-In-Time) Butterfly │ +// │ │ +// │ a ──────┬────── a' = a + ω·b │ +// │ │ │ +// │ × ω │ +// │ │ │ +// │ b ──────┴────── b' = a - ω·b │ +// │ │ +// │ Bit-reversed input → natural output (with 1/n scale) │ +// │ Stages: s = log₂(n)-1 (pairs) down to s = 0 (largest groups) │ +// └─────────────────────────────────────────────────────────────────────────┘ +// +// Kernel dispatch strategy (minimizes kernel launches): +// +// Forward (DIF): radix-8 → radix-2 → fused tail +// Inverse (DIT): fused tail → radix-8 → radix-2 +// +// Example for n = 2²⁰ (20 stages): +// DIF: stages 0-2 (r8), 3-5 (r8), 6-8 (r8), 9-19 (fused tail, 11 stages) +// = 4 kernel launches instead of 20 +// +// Fused tail: last 11-12 stages run entirely in shared memory (one block per +// chunk of 2¹¹ or 2¹² elements), eliminating global memory round-trips. +// Adaptive: tail_log=12 on GPUs with ≥128KB shared memory, else tail_log=11. +// +// Data layout: SoA (4 limb arrays of n uint64s each), same as FrVector. +// Twiddle layout: SoA (4 limb arrays), n/2 elements in Montgomery form. +// Twiddle indexing: flat table ω⁰, ω¹, ..., ω^(n/2-1). +// Stage s butterfly at position j uses twiddle at index j · 2ˢ. +// ═══════════════════════════════════════════════════════════════════════════════ + +#include "fr_arith.cuh" +#include +#include + +namespace gnark_gpu { + +void launch_transpose_aos_to_soa_fr(uint64_t *limb0, uint64_t *limb1, uint64_t *limb2, + uint64_t *limb3, const uint64_t *aos_data, size_t count, + cudaStream_t stream); + +namespace { + +constexpr unsigned NTT_THREADS = 256; +constexpr uint32_t NTT_FUSED_TAIL_MIN_N = 1u << 22; + +} // namespace + +// ============================================================================= +// DIF butterfly kernel (one stage of forward NTT) +// +// For stage s (0-indexed from MSB): +// half = n >> (s+1) +// For each butterfly tid in [0, n/2): +// group = tid / half; j = tid % half +// idx_a = group * 2*half + j; idx_b = idx_a + half +// tw_idx = j * (1 << s) [into flat twiddle table of n/2 entries] +// a' = a + b; b' = (a - b) * w +// ============================================================================= + +__global__ void ntt_dif_butterfly_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint32_t num_butterflies, uint32_t half, uint32_t half_mask, uint32_t tw_stride) +{ + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_butterflies) return; + + // half is always a power-of-two: replace costly div/mod with bit ops. + uint32_t j = tid & half_mask; + uint32_t group_base = tid & ~half_mask; + uint32_t idx_a = (group_base << 1) | j; + uint32_t idx_b = idx_a + half; + uint32_t tw_idx = j * tw_stride; + + // Load a, b + uint64_t a[4] = { d0[idx_a], d1[idx_a], d2[idx_a], d3[idx_a] }; + uint64_t b[4] = { d0[idx_b], d1[idx_b], d2[idx_b], d3[idx_b] }; + + // Load twiddle through read-only cache path. + uint64_t w[4] = { + __ldg(tw0 + tw_idx), __ldg(tw1 + tw_idx), + __ldg(tw2 + tw_idx), __ldg(tw3 + tw_idx) + }; + + // DIF butterfly: a' = a + b; b' = (a - b) * w + uint64_t sum[4], diff[4], prod[4]; + fr_add(sum, a, b); + fr_sub(diff, a, b); + fr_mul(prod, diff, w); + + // Store + d0[idx_a] = sum[0]; d1[idx_a] = sum[1]; d2[idx_a] = sum[2]; d3[idx_a] = sum[3]; + d0[idx_b] = prod[0]; d1[idx_b] = prod[1]; d2[idx_b] = prod[2]; d3[idx_b] = prod[3]; +} + +// ============================================================================= +// Radix-8 DIF butterfly kernel: fuses three adjacent stages (s, s+1, s+2). +// +// Each thread processes 8 elements from one radix-8 group. +// Stage s: butterflies at distance half_s = n >> (s+1) +// Stage s+1: butterflies at distance half_s1 = half_s/2 +// Stage s+2: butterflies at distance half_s2 = half_s/4 +// +// 8 element positions within a group of 2*half_s: +// p0 = base+j, p1 = p0+half_s2, p2 = p0+half_s1, p3 = p2+half_s2 +// p4 = p0+half_s, p5 = p4+half_s2, p6 = p4+half_s1, p7 = p6+half_s2 +// +// Twiddle loads: 4 (stage s) + 2 (stage s+1) + 1 (stage s+2) = 7 total. +// Arithmetic: 12 fr_mul + 12 fr_add + 12 fr_sub per thread. +// ============================================================================= + +__global__ void ntt_dif_radix8_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint32_t n, int stage_s) +{ + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + uint32_t num_r8 = n >> 3; + if (tid >= num_r8) return; + + uint32_t half_s = n >> (stage_s + 1); + uint32_t half_s1 = half_s >> 1; + uint32_t half_s2 = half_s >> 2; + + uint32_t j = tid & (half_s2 - 1); + uint32_t group = tid >> (__ffs(half_s2) - 1); + + uint32_t base = group * (2 * half_s); + uint32_t p0 = base + j; + uint32_t p1 = p0 + half_s2; + uint32_t p2 = p0 + half_s1; + uint32_t p3 = p2 + half_s2; + uint32_t p4 = p0 + half_s; + uint32_t p5 = p4 + half_s2; + uint32_t p6 = p4 + half_s1; + uint32_t p7 = p6 + half_s2; + + // Load 8 elements + uint64_t a0[4] = { d0[p0], d1[p0], d2[p0], d3[p0] }; + uint64_t a1[4] = { d0[p1], d1[p1], d2[p1], d3[p1] }; + uint64_t a2[4] = { d0[p2], d1[p2], d2[p2], d3[p2] }; + uint64_t a3[4] = { d0[p3], d1[p3], d2[p3], d3[p3] }; + uint64_t a4[4] = { d0[p4], d1[p4], d2[p4], d3[p4] }; + uint64_t a5[4] = { d0[p5], d1[p5], d2[p5], d3[p5] }; + uint64_t a6[4] = { d0[p6], d1[p6], d2[p6], d3[p6] }; + uint64_t a7[4] = { d0[p7], d1[p7], d2[p7], d3[p7] }; + + uint32_t tw_stride_s = 1u << stage_s; + uint32_t tw_stride_s1 = tw_stride_s << 1; + uint32_t tw_stride_s2 = tw_stride_s << 2; + + uint64_t w[4], sum[4], diff[4]; + uint32_t twi; + + // --- Stage s: 4 DIF butterflies at distance half_s --- + // Pairs: (a0,a4), (a1,a5), (a2,a6), (a3,a7) + twi = j * tw_stride_s; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_add(sum, a0, a4); fr_sub(diff, a0, a4); fr_mul(a4, diff, w); + a0[0]=sum[0]; a0[1]=sum[1]; a0[2]=sum[2]; a0[3]=sum[3]; + + twi = (j + half_s2) * tw_stride_s; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_add(sum, a1, a5); fr_sub(diff, a1, a5); fr_mul(a5, diff, w); + a1[0]=sum[0]; a1[1]=sum[1]; a1[2]=sum[2]; a1[3]=sum[3]; + + twi = (j + half_s1) * tw_stride_s; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_add(sum, a2, a6); fr_sub(diff, a2, a6); fr_mul(a6, diff, w); + a2[0]=sum[0]; a2[1]=sum[1]; a2[2]=sum[2]; a2[3]=sum[3]; + + twi = (j + half_s1 + half_s2) * tw_stride_s; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_add(sum, a3, a7); fr_sub(diff, a3, a7); fr_mul(a7, diff, w); + a3[0]=sum[0]; a3[1]=sum[1]; a3[2]=sum[2]; a3[3]=sum[3]; + + // --- Stage s+1: 4 DIF butterflies at distance half_s1 --- + // Top: (a0,a2), (a1,a3) Bottom: (a4,a6), (a5,a7) + uint64_t ws1_0[4], ws1_1[4]; + twi = j * tw_stride_s1; + ws1_0[0] = __ldg(tw0+twi); ws1_0[1] = __ldg(tw1+twi); + ws1_0[2] = __ldg(tw2+twi); ws1_0[3] = __ldg(tw3+twi); + twi = (j + half_s2) * tw_stride_s1; + ws1_1[0] = __ldg(tw0+twi); ws1_1[1] = __ldg(tw1+twi); + ws1_1[2] = __ldg(tw2+twi); ws1_1[3] = __ldg(tw3+twi); + + fr_add(sum, a0, a2); fr_sub(diff, a0, a2); fr_mul(a2, diff, ws1_0); + a0[0]=sum[0]; a0[1]=sum[1]; a0[2]=sum[2]; a0[3]=sum[3]; + + fr_add(sum, a1, a3); fr_sub(diff, a1, a3); fr_mul(a3, diff, ws1_1); + a1[0]=sum[0]; a1[1]=sum[1]; a1[2]=sum[2]; a1[3]=sum[3]; + + fr_add(sum, a4, a6); fr_sub(diff, a4, a6); fr_mul(a6, diff, ws1_0); + a4[0]=sum[0]; a4[1]=sum[1]; a4[2]=sum[2]; a4[3]=sum[3]; + + fr_add(sum, a5, a7); fr_sub(diff, a5, a7); fr_mul(a7, diff, ws1_1); + a5[0]=sum[0]; a5[1]=sum[1]; a5[2]=sum[2]; a5[3]=sum[3]; + + // --- Stage s+2: 4 DIF butterflies at distance half_s2 --- + // Pairs: (a0,a1), (a2,a3), (a4,a5), (a6,a7) — all same twiddle + twi = j * tw_stride_s2; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + + fr_add(sum, a0, a1); fr_sub(diff, a0, a1); fr_mul(a1, diff, w); + a0[0]=sum[0]; a0[1]=sum[1]; a0[2]=sum[2]; a0[3]=sum[3]; + + fr_add(sum, a2, a3); fr_sub(diff, a2, a3); fr_mul(a3, diff, w); + a2[0]=sum[0]; a2[1]=sum[1]; a2[2]=sum[2]; a2[3]=sum[3]; + + fr_add(sum, a4, a5); fr_sub(diff, a4, a5); fr_mul(a5, diff, w); + a4[0]=sum[0]; a4[1]=sum[1]; a4[2]=sum[2]; a4[3]=sum[3]; + + fr_add(sum, a6, a7); fr_sub(diff, a6, a7); fr_mul(a7, diff, w); + a6[0]=sum[0]; a6[1]=sum[1]; a6[2]=sum[2]; a6[3]=sum[3]; + + // Store + d0[p0]=a0[0]; d1[p0]=a0[1]; d2[p0]=a0[2]; d3[p0]=a0[3]; + d0[p1]=a1[0]; d1[p1]=a1[1]; d2[p1]=a1[2]; d3[p1]=a1[3]; + d0[p2]=a2[0]; d1[p2]=a2[1]; d2[p2]=a2[2]; d3[p2]=a2[3]; + d0[p3]=a3[0]; d1[p3]=a3[1]; d2[p3]=a3[2]; d3[p3]=a3[3]; + d0[p4]=a4[0]; d1[p4]=a4[1]; d2[p4]=a4[2]; d3[p4]=a4[3]; + d0[p5]=a5[0]; d1[p5]=a5[1]; d2[p5]=a5[2]; d3[p5]=a5[3]; + d0[p6]=a6[0]; d1[p6]=a6[1]; d2[p6]=a6[2]; d3[p6]=a6[3]; + d0[p7]=a7[0]; d1[p7]=a7[1]; d2[p7]=a7[2]; d3[p7]=a7[3]; +} + +// ============================================================================= +// DIF fused tail kernel: processes the LAST TAIL_LOG stages in shared memory. +// +// For n = 2²⁰ and TAIL_LOG = 11: +// - Each block handles a chunk of 2¹¹ = 2048 contiguous elements +// - 512 blocks cover the full array (2²⁰ / 2¹¹) +// - Stages 9 through 19 (11 stages) execute entirely in shared memory +// - One global load + one global store replaces 11 × 2 global accesses +// +// Shared memory: 4 limbs × 2^TAIL_LOG × 8 bytes +// TAIL_LOG=11: 4 × 2048 × 8 = 64 KiB (fits 99 KB optin on Blackwell) +// TAIL_LOG=12: 4 × 4096 × 8 = 128 KiB (needs ≥128 KB; A100/H100) +// +// Each thread processes multiple butterflies per stage via strided access. +// __syncthreads() between stages ensures data consistency. +// ============================================================================= +template +__global__ void __launch_bounds__(1024, 1) ntt_dif_tail_fused_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint32_t n, int stage_start) +{ + constexpr uint32_t span = 1u << TAIL_LOG; + constexpr uint32_t butterflies_per_chunk = span >> 1; + + uint32_t chunk = (uint32_t)blockIdx.x; + uint32_t base = chunk * span; + uint32_t t = threadIdx.x; + uint32_t P = blockDim.x; + + extern __shared__ uint64_t shmem[]; + uint64_t *s0 = shmem; + uint64_t *s1 = s0 + span; + uint64_t *s2 = s1 + span; + uint64_t *s3 = s2 + span; + + // Load: each thread handles span/P elements + for (uint32_t i = t; i < span; i += P) { + uint32_t global_idx = base + i; + if (global_idx < n) { + s0[i] = d0[global_idx]; + s1[i] = d1[global_idx]; + s2[i] = d2[global_idx]; + s3[i] = d3[global_idx]; + } + } + __syncthreads(); + + #pragma unroll + for (int st = 0; st < TAIL_LOG; st++) { + int s = stage_start + st; + uint32_t half = n >> (s + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << s; + + for (uint32_t bt = t; bt < butterflies_per_chunk; bt += P) { + uint32_t j = bt & half_mask; + uint32_t group_base = bt & ~half_mask; + uint32_t idx_a = (group_base << 1) | j; + uint32_t idx_b = idx_a + half; + uint32_t tw_idx = j * tw_stride; + + uint64_t a[4] = { s0[idx_a], s1[idx_a], s2[idx_a], s3[idx_a] }; + uint64_t b[4] = { s0[idx_b], s1[idx_b], s2[idx_b], s3[idx_b] }; + uint64_t w[4] = { + __ldg(tw0 + tw_idx), __ldg(tw1 + tw_idx), + __ldg(tw2 + tw_idx), __ldg(tw3 + tw_idx) + }; + + uint64_t sum[4], diff[4], prod[4]; + fr_add(sum, a, b); + fr_sub(diff, a, b); + fr_mul(prod, diff, w); + + s0[idx_a] = sum[0]; s1[idx_a] = sum[1]; s2[idx_a] = sum[2]; s3[idx_a] = sum[3]; + s0[idx_b] = prod[0]; s1[idx_b] = prod[1]; s2[idx_b] = prod[2]; s3[idx_b] = prod[3]; + } + __syncthreads(); + } + + // Store: each thread handles span/P elements + for (uint32_t i = t; i < span; i += P) { + uint32_t global_idx = base + i; + if (global_idx < n) { + d0[global_idx] = s0[i]; + d1[global_idx] = s1[i]; + d2[global_idx] = s2[i]; + d3[global_idx] = s3[i]; + } + } +} + +// ============================================================================= +// Fused ScaleByPowers + first DIF butterfly stage for CosetFFT. +// Eliminates one full memory round-trip by computing v[i] *= g^i inline +// during the first butterfly stage (s=0) of the DIF NTT. +// +// For s=0: idx_a = tid, idx_b = tid + n/2, tw_idx = tid. +// g^(n/2) is passed as a parameter to avoid recomputation. +// ============================================================================= + +__global__ void ntt_dif_first_stage_fused_scale_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + const uint64_t g0, const uint64_t g1, + const uint64_t g2, const uint64_t g3, + const uint64_t gh0, const uint64_t gh1, // g^(n/2) + const uint64_t gh2, const uint64_t gh3, + uint32_t num_butterflies) +{ + constexpr unsigned ITEMS_PER_THREAD = 4; + uint32_t block_start = (uint32_t)blockIdx.x * blockDim.x * ITEMS_PER_THREAD; + uint32_t tid0 = block_start + threadIdx.x; + + // --- Compute g^idx_a using shared memory power table --- + __shared__ uint64_t sh_power[4]; // g^block_start + __shared__ uint64_t sh_pow2[9][4]; // g^(2^k), k=8 is g^256 + + if (threadIdx.x == 0) { + uint64_t pow2[4] = {g0, g1, g2, g3}; + #pragma unroll + for (int k = 0; k < 9; k++) { + sh_pow2[k][0] = pow2[0]; sh_pow2[k][1] = pow2[1]; + sh_pow2[k][2] = pow2[2]; sh_pow2[k][3] = pow2[3]; + uint64_t sq[4]; + fr_mul(sq, pow2, pow2); + pow2[0] = sq[0]; pow2[1] = sq[1]; + pow2[2] = sq[2]; pow2[3] = sq[3]; + } + + // g^block_start via repeated squaring + uint64_t base[4] = {g0, g1, g2, g3}; + uint64_t result[4] = { + Fr_params::ONE[0], Fr_params::ONE[1], + Fr_params::ONE[2], Fr_params::ONE[3] + }; + size_t exp = block_start; + while (exp > 0) { + if (exp & 1) { + uint64_t tmp[4]; + fr_mul(tmp, result, base); + result[0] = tmp[0]; result[1] = tmp[1]; + result[2] = tmp[2]; result[3] = tmp[3]; + } + uint64_t tmp[4]; + fr_mul(tmp, base, base); + base[0] = tmp[0]; base[1] = tmp[1]; + base[2] = tmp[2]; base[3] = tmp[3]; + exp >>= 1; + } + sh_power[0] = result[0]; sh_power[1] = result[1]; + sh_power[2] = result[2]; sh_power[3] = result[3]; + } + __syncthreads(); + + if (tid0 >= num_butterflies) return; + + // Reconstruct g^threadIdx from binary power table + uint64_t my_power[4] = { + Fr_params::ONE[0], Fr_params::ONE[1], + Fr_params::ONE[2], Fr_params::ONE[3] + }; + unsigned t = threadIdx.x; + #pragma unroll + for (int bit = 0; bit < 8; bit++) { + if ((t >> bit) & 1u) { + uint64_t p2[4] = {sh_pow2[bit][0], sh_pow2[bit][1], + sh_pow2[bit][2], sh_pow2[bit][3]}; + uint64_t tmp[4]; + fr_mul(tmp, my_power, p2); + my_power[0] = tmp[0]; my_power[1] = tmp[1]; + my_power[2] = tmp[2]; my_power[3] = tmp[3]; + } + } + + // g^idx_a = g^block_start * g^threadIdx + uint64_t g_a[4]; + { + uint64_t bp[4] = {sh_power[0], sh_power[1], sh_power[2], sh_power[3]}; + fr_mul(g_a, bp, my_power); + } + + uint64_t stride[4] = { + sh_pow2[8][0], sh_pow2[8][1], sh_pow2[8][2], sh_pow2[8][3], + }; + uint64_t gh[4] = {gh0, gh1, gh2, gh3}; + + #pragma unroll + for (unsigned item = 0; item < ITEMS_PER_THREAD; item++) { + uint32_t tid = tid0 + item * blockDim.x; + if (tid < num_butterflies) { + // Stage 0 indexing: idx_a = tid, idx_b = tid + n/2 + uint32_t idx_a = tid; + uint32_t idx_b = tid + num_butterflies; + + // g^idx_b = g^idx_a * g^(n/2) + uint64_t g_b[4]; + fr_mul(g_b, g_a, gh); + + // Load a, b and apply scale + uint64_t a_raw[4] = {d0[idx_a], d1[idx_a], d2[idx_a], d3[idx_a]}; + uint64_t b_raw[4] = {d0[idx_b], d1[idx_b], d2[idx_b], d3[idx_b]}; + uint64_t a[4], b[4]; + fr_mul(a, a_raw, g_a); + fr_mul(b, b_raw, g_b); + + // Load twiddle (stage 0: tw_idx = tid) + uint64_t w[4] = { + __ldg(tw0 + tid), __ldg(tw1 + tid), + __ldg(tw2 + tid), __ldg(tw3 + tid) + }; + + // DIF butterfly: a' = a + b; b' = (a - b) * w + uint64_t sum[4], diff[4], prod[4]; + fr_add(sum, a, b); + fr_sub(diff, a, b); + fr_mul(prod, diff, w); + + d0[idx_a] = sum[0]; d1[idx_a] = sum[1]; d2[idx_a] = sum[2]; d3[idx_a] = sum[3]; + d0[idx_b] = prod[0]; d1[idx_b] = prod[1]; d2[idx_b] = prod[2]; d3[idx_b] = prod[3]; + } + + if (item + 1 < ITEMS_PER_THREAD) { + uint64_t next[4]; + fr_mul(next, g_a, stride); + g_a[0] = next[0]; g_a[1] = next[1]; + g_a[2] = next[2]; g_a[3] = next[3]; + } + } +} + +// ============================================================================= +// DIT butterfly kernel (one stage of inverse NTT) +// +// For stage s (0-indexed from LSB): +// half = n >> (s+1) +// Same indexing as DIF. +// t = b * w; a' = a + t; b' = a - t +// +// Template: FUSE_SCALE multiplies outputs by scale before storing (for inv NTT). +// ============================================================================= + +template +__global__ void ntt_dit_butterfly_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint32_t num_butterflies, uint32_t half, uint32_t half_mask, uint32_t tw_stride, + uint64_t scale0, uint64_t scale1, uint64_t scale2, uint64_t scale3) +{ + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= num_butterflies) return; + + uint32_t j = tid & half_mask; + uint32_t group_base = tid & ~half_mask; + uint32_t idx_a = (group_base << 1) | j; + uint32_t idx_b = idx_a + half; + uint32_t tw_idx = j * tw_stride; + + // Load a, b + uint64_t a[4] = { d0[idx_a], d1[idx_a], d2[idx_a], d3[idx_a] }; + uint64_t b[4] = { d0[idx_b], d1[idx_b], d2[idx_b], d3[idx_b] }; + + // Load twiddle through read-only cache path. + uint64_t w[4] = { + __ldg(tw0 + tw_idx), __ldg(tw1 + tw_idx), + __ldg(tw2 + tw_idx), __ldg(tw3 + tw_idx) + }; + + // DIT butterfly: t = b * w; a' = a + t; b' = a - t + uint64_t t[4], sum[4], diff[4]; + fr_mul(t, b, w); + fr_add(sum, a, t); + fr_sub(diff, a, t); + + if constexpr (FUSE_SCALE) { + uint64_t sc[4] = {scale0, scale1, scale2, scale3}; + uint64_t r[4]; + fr_mul(r, sum, sc); sum[0]=r[0]; sum[1]=r[1]; sum[2]=r[2]; sum[3]=r[3]; + fr_mul(r, diff, sc); diff[0]=r[0]; diff[1]=r[1]; diff[2]=r[2]; diff[3]=r[3]; + } + + // Store + d0[idx_a] = sum[0]; d1[idx_a] = sum[1]; d2[idx_a] = sum[2]; d3[idx_a] = sum[3]; + d0[idx_b] = diff[0]; d1[idx_b] = diff[1]; d2[idx_b] = diff[2]; d3[idx_b] = diff[3]; +} + +// ============================================================================= +// Radix-8 DIT butterfly kernel: fuses three adjacent stages (s, s-1, s-2). +// +// Each thread processes 8 elements from one radix-8 group. +// Stage s (innermost, first): half_s = n >> (s+1) +// Stage s-1: half_s1 = 2*half_s +// Stage s-2 (outermost, last): half_s2 = 4*half_s +// +// 8 element positions in a group of 8*half_s: +// p0 = base+j, p1 = p0+half_s, p2 = p0+2*half_s, p3 = p1+2*half_s +// p4 = p0+4*half_s, p5 = p1+4*half_s, p6 = p2+4*half_s, p7 = p3+4*half_s +// +// Twiddle loads: 1 (stage s) + 2 (stage s-1) + 4 (stage s-2) = 7 total. +// Template: FUSE_SCALE multiplies outputs by scale before storing. +// ============================================================================= + +template +__global__ void ntt_dit_radix8_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint32_t n, int stage_s, + uint64_t scale0, uint64_t scale1, uint64_t scale2, uint64_t scale3) +{ + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + uint32_t num_r8 = n >> 3; + if (tid >= num_r8) return; + + uint32_t half_s = n >> (stage_s + 1); + uint32_t half_s1 = half_s << 1; // 2*half_s = n >> s + uint32_t half_s2 = half_s << 2; // 4*half_s = n >> (s-1) + + uint32_t j = tid & (half_s - 1); + uint32_t group = tid >> (__ffs(half_s) - 1); + + uint32_t base = group * (8 * half_s); + uint32_t p0 = base + j; + uint32_t p1 = p0 + half_s; + uint32_t p2 = p0 + half_s1; + uint32_t p3 = p1 + half_s1; + uint32_t p4 = p0 + half_s2; + uint32_t p5 = p1 + half_s2; + uint32_t p6 = p2 + half_s2; + uint32_t p7 = p3 + half_s2; + + // Load 8 elements + uint64_t a0[4] = { d0[p0], d1[p0], d2[p0], d3[p0] }; + uint64_t a1[4] = { d0[p1], d1[p1], d2[p1], d3[p1] }; + uint64_t a2[4] = { d0[p2], d1[p2], d2[p2], d3[p2] }; + uint64_t a3[4] = { d0[p3], d1[p3], d2[p3], d3[p3] }; + uint64_t a4[4] = { d0[p4], d1[p4], d2[p4], d3[p4] }; + uint64_t a5[4] = { d0[p5], d1[p5], d2[p5], d3[p5] }; + uint64_t a6[4] = { d0[p6], d1[p6], d2[p6], d3[p6] }; + uint64_t a7[4] = { d0[p7], d1[p7], d2[p7], d3[p7] }; + + uint32_t tw_stride_s = 1u << stage_s; + uint32_t tw_stride_s1 = tw_stride_s >> 1; // 1 << (s-1) + uint32_t tw_stride_s2 = tw_stride_s >> 2; // 1 << (s-2) + + uint64_t w[4], t[4], sum[4], diff[4]; + uint32_t twi; + + // --- Stage s: 4 DIT butterflies at distance half_s --- + // Pairs: (a0,a1), (a2,a3), (a4,a5), (a6,a7) — all same twiddle + twi = j * tw_stride_s; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + + fr_mul(t, a1, w); fr_add(sum, a0, t); fr_sub(diff, a0, t); + a0[0]=sum[0]; a0[1]=sum[1]; a0[2]=sum[2]; a0[3]=sum[3]; + a1[0]=diff[0]; a1[1]=diff[1]; a1[2]=diff[2]; a1[3]=diff[3]; + + fr_mul(t, a3, w); fr_add(sum, a2, t); fr_sub(diff, a2, t); + a2[0]=sum[0]; a2[1]=sum[1]; a2[2]=sum[2]; a2[3]=sum[3]; + a3[0]=diff[0]; a3[1]=diff[1]; a3[2]=diff[2]; a3[3]=diff[3]; + + fr_mul(t, a5, w); fr_add(sum, a4, t); fr_sub(diff, a4, t); + a4[0]=sum[0]; a4[1]=sum[1]; a4[2]=sum[2]; a4[3]=sum[3]; + a5[0]=diff[0]; a5[1]=diff[1]; a5[2]=diff[2]; a5[3]=diff[3]; + + fr_mul(t, a7, w); fr_add(sum, a6, t); fr_sub(diff, a6, t); + a6[0]=sum[0]; a6[1]=sum[1]; a6[2]=sum[2]; a6[3]=sum[3]; + a7[0]=diff[0]; a7[1]=diff[1]; a7[2]=diff[2]; a7[3]=diff[3]; + + // --- Stage s-1: 4 DIT butterflies at distance 2*half_s --- + // Pairs: (a0,a2), (a1,a3), (a4,a6), (a5,a7) + uint64_t ws1_a[4], ws1_b[4]; + twi = j * tw_stride_s1; + ws1_a[0] = __ldg(tw0+twi); ws1_a[1] = __ldg(tw1+twi); + ws1_a[2] = __ldg(tw2+twi); ws1_a[3] = __ldg(tw3+twi); + twi = (j + half_s) * tw_stride_s1; + ws1_b[0] = __ldg(tw0+twi); ws1_b[1] = __ldg(tw1+twi); + ws1_b[2] = __ldg(tw2+twi); ws1_b[3] = __ldg(tw3+twi); + + fr_mul(t, a2, ws1_a); fr_add(sum, a0, t); fr_sub(diff, a0, t); + a0[0]=sum[0]; a0[1]=sum[1]; a0[2]=sum[2]; a0[3]=sum[3]; + a2[0]=diff[0]; a2[1]=diff[1]; a2[2]=diff[2]; a2[3]=diff[3]; + + fr_mul(t, a3, ws1_b); fr_add(sum, a1, t); fr_sub(diff, a1, t); + a1[0]=sum[0]; a1[1]=sum[1]; a1[2]=sum[2]; a1[3]=sum[3]; + a3[0]=diff[0]; a3[1]=diff[1]; a3[2]=diff[2]; a3[3]=diff[3]; + + fr_mul(t, a6, ws1_a); fr_add(sum, a4, t); fr_sub(diff, a4, t); + a4[0]=sum[0]; a4[1]=sum[1]; a4[2]=sum[2]; a4[3]=sum[3]; + a6[0]=diff[0]; a6[1]=diff[1]; a6[2]=diff[2]; a6[3]=diff[3]; + + fr_mul(t, a7, ws1_b); fr_add(sum, a5, t); fr_sub(diff, a5, t); + a5[0]=sum[0]; a5[1]=sum[1]; a5[2]=sum[2]; a5[3]=sum[3]; + a7[0]=diff[0]; a7[1]=diff[1]; a7[2]=diff[2]; a7[3]=diff[3]; + + // --- Stage s-2: 4 DIT butterflies at distance 4*half_s --- + // Pairs: (a0,a4), (a1,a5), (a2,a6), (a3,a7) — 4 different twiddles + twi = j * tw_stride_s2; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_mul(t, a4, w); fr_add(sum, a0, t); fr_sub(diff, a0, t); + a0[0]=sum[0]; a0[1]=sum[1]; a0[2]=sum[2]; a0[3]=sum[3]; + a4[0]=diff[0]; a4[1]=diff[1]; a4[2]=diff[2]; a4[3]=diff[3]; + + twi = (j + half_s) * tw_stride_s2; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_mul(t, a5, w); fr_add(sum, a1, t); fr_sub(diff, a1, t); + a1[0]=sum[0]; a1[1]=sum[1]; a1[2]=sum[2]; a1[3]=sum[3]; + a5[0]=diff[0]; a5[1]=diff[1]; a5[2]=diff[2]; a5[3]=diff[3]; + + twi = (j + half_s1) * tw_stride_s2; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_mul(t, a6, w); fr_add(sum, a2, t); fr_sub(diff, a2, t); + a2[0]=sum[0]; a2[1]=sum[1]; a2[2]=sum[2]; a2[3]=sum[3]; + a6[0]=diff[0]; a6[1]=diff[1]; a6[2]=diff[2]; a6[3]=diff[3]; + + twi = (j + half_s1 + half_s) * tw_stride_s2; + w[0] = __ldg(tw0+twi); w[1] = __ldg(tw1+twi); + w[2] = __ldg(tw2+twi); w[3] = __ldg(tw3+twi); + fr_mul(t, a7, w); fr_add(sum, a3, t); fr_sub(diff, a3, t); + a3[0]=sum[0]; a3[1]=sum[1]; a3[2]=sum[2]; a3[3]=sum[3]; + a7[0]=diff[0]; a7[1]=diff[1]; a7[2]=diff[2]; a7[3]=diff[3]; + + // Optional fused scale (1/n for inverse NTT) + if constexpr (FUSE_SCALE) { + uint64_t sc[4] = {scale0, scale1, scale2, scale3}; + uint64_t r[4]; + fr_mul(r, a0, sc); a0[0]=r[0]; a0[1]=r[1]; a0[2]=r[2]; a0[3]=r[3]; + fr_mul(r, a1, sc); a1[0]=r[0]; a1[1]=r[1]; a1[2]=r[2]; a1[3]=r[3]; + fr_mul(r, a2, sc); a2[0]=r[0]; a2[1]=r[1]; a2[2]=r[2]; a2[3]=r[3]; + fr_mul(r, a3, sc); a3[0]=r[0]; a3[1]=r[1]; a3[2]=r[2]; a3[3]=r[3]; + fr_mul(r, a4, sc); a4[0]=r[0]; a4[1]=r[1]; a4[2]=r[2]; a4[3]=r[3]; + fr_mul(r, a5, sc); a5[0]=r[0]; a5[1]=r[1]; a5[2]=r[2]; a5[3]=r[3]; + fr_mul(r, a6, sc); a6[0]=r[0]; a6[1]=r[1]; a6[2]=r[2]; a6[3]=r[3]; + fr_mul(r, a7, sc); a7[0]=r[0]; a7[1]=r[1]; a7[2]=r[2]; a7[3]=r[3]; + } + + // Store + d0[p0]=a0[0]; d1[p0]=a0[1]; d2[p0]=a0[2]; d3[p0]=a0[3]; + d0[p1]=a1[0]; d1[p1]=a1[1]; d2[p1]=a1[2]; d3[p1]=a1[3]; + d0[p2]=a2[0]; d1[p2]=a2[1]; d2[p2]=a2[2]; d3[p2]=a2[3]; + d0[p3]=a3[0]; d1[p3]=a3[1]; d2[p3]=a3[2]; d3[p3]=a3[3]; + d0[p4]=a4[0]; d1[p4]=a4[1]; d2[p4]=a4[2]; d3[p4]=a4[3]; + d0[p5]=a5[0]; d1[p5]=a5[1]; d2[p5]=a5[2]; d3[p5]=a5[3]; + d0[p6]=a6[0]; d1[p6]=a6[1]; d2[p6]=a6[2]; d3[p6]=a6[3]; + d0[p7]=a7[0]; d1[p7]=a7[1]; d2[p7]=a7[2]; d3[p7]=a7[3]; +} + +// ============================================================================= +// DIT fused tail kernel: processes the FIRST TAIL_LOG stages in shared memory. +// +// DIT runs stages from highest (s = log_n-1) down to lowest (s = 0). +// The fused tail handles the highest stages (largest s, smallest butterfly span) +// which fit in shared memory. This is the OPPOSITE of DIF tail which handles +// the lowest stages. +// +// For n = 2²⁰ and TAIL_LOG = 11: +// - Stages 19 down to 9 (11 stages) run first in shared memory +// - Then radix-8/4/2 kernels handle stages 8 down to 0 +// +// The stage iteration goes: stage_start, stage_start-1, ..., stage_start-TAIL_LOG+1 +// (highest s first = smallest half = smallest butterfly span = fits in shared memory) +// ============================================================================= + +template +__global__ void __launch_bounds__(1024, 1) ntt_dit_tail_fused_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t *__restrict__ tw0, const uint64_t *__restrict__ tw1, + const uint64_t *__restrict__ tw2, const uint64_t *__restrict__ tw3, + uint32_t n, int stage_start) // stage_start = TAIL_LOG-1 (highest of the tail stages) +{ + constexpr uint32_t span = 1u << TAIL_LOG; + constexpr uint32_t butterflies_per_chunk = span >> 1; + + uint32_t chunk = (uint32_t)blockIdx.x; + uint32_t base = chunk * span; + uint32_t t = threadIdx.x; + uint32_t P = blockDim.x; + + extern __shared__ uint64_t shmem[]; + uint64_t *s0 = shmem; + uint64_t *s1 = s0 + span; + uint64_t *s2 = s1 + span; + uint64_t *s3 = s2 + span; + + // Load: each thread handles span/P elements + for (uint32_t i = t; i < span; i += P) { + uint32_t global_idx = base + i; + if (global_idx < n) { + s0[i] = d0[global_idx]; + s1[i] = d1[global_idx]; + s2[i] = d2[global_idx]; + s3[i] = d3[global_idx]; + } + } + __syncthreads(); + + // DIT: process stages from stage_start down to 0 + #pragma unroll + for (int st = 0; st < TAIL_LOG; st++) { + int s = stage_start - st; // stages: stage_start, stage_start-1, ..., 0 + uint32_t half = n >> (s + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << s; + + for (uint32_t bt = t; bt < butterflies_per_chunk; bt += P) { + uint32_t j = bt & half_mask; + uint32_t group_base = bt & ~half_mask; + uint32_t idx_a = (group_base << 1) | j; + uint32_t idx_b = idx_a + half; + uint32_t tw_idx = j * tw_stride; + + uint64_t a[4] = { s0[idx_a], s1[idx_a], s2[idx_a], s3[idx_a] }; + uint64_t b[4] = { s0[idx_b], s1[idx_b], s2[idx_b], s3[idx_b] }; + uint64_t w[4] = { + __ldg(tw0 + tw_idx), __ldg(tw1 + tw_idx), + __ldg(tw2 + tw_idx), __ldg(tw3 + tw_idx) + }; + + // DIT butterfly: t = b * w; a' = a + t; b' = a - t + uint64_t tw_b[4], sum[4], diff[4]; + fr_mul(tw_b, b, w); + fr_add(sum, a, tw_b); + fr_sub(diff, a, tw_b); + + s0[idx_a] = sum[0]; s1[idx_a] = sum[1]; s2[idx_a] = sum[2]; s3[idx_a] = sum[3]; + s0[idx_b] = diff[0]; s1[idx_b] = diff[1]; s2[idx_b] = diff[2]; s3[idx_b] = diff[3]; + } + __syncthreads(); + } + + // Store: each thread handles span/P elements + for (uint32_t i = t; i < span; i += P) { + uint32_t global_idx = base + i; + if (global_idx < n) { + d0[global_idx] = s0[i]; + d1[global_idx] = s1[i]; + d2[global_idx] = s2[i]; + d3[global_idx] = s3[i]; + } + } +} + +// ============================================================================= +// Bit-reversal permutation kernel (in-place) +// Each thread handles one swap where i < bit_reverse(i). +// ============================================================================= + +__device__ __forceinline__ uint32_t bit_reverse(uint32_t x, int log_n) { + x = __brev(x); + x >>= (32 - log_n); + return x; +} + +__global__ void ntt_bit_reverse_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + size_t n, int log_n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + size_t j = bit_reverse((uint32_t)i, log_n); + if (i >= j) return; // only swap once per pair + + // Swap elements i and j + uint64_t tmp; + + tmp = d0[i]; d0[i] = d0[j]; d0[j] = tmp; + tmp = d1[i]; d1[i] = d1[j]; d1[j] = tmp; + tmp = d2[i]; d2[i] = d2[j]; d2[j] = tmp; + tmp = d3[i]; d3[i] = d3[j]; d3[j] = tmp; +} + +// ============================================================================= +// Scale kernel: multiply all elements by a constant (1/n for inverse NTT) +// Kept as fallback for edge cases where fused scale cannot be applied. +// ============================================================================= + +__global__ void ntt_scale_kernel( + uint64_t *__restrict__ d0, uint64_t *__restrict__ d1, + uint64_t *__restrict__ d2, uint64_t *__restrict__ d3, + const uint64_t inv_n0, const uint64_t inv_n1, + const uint64_t inv_n2, const uint64_t inv_n3, + size_t n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + uint64_t a[4] = { d0[i], d1[i], d2[i], d3[i] }; + uint64_t c[4] = { inv_n0, inv_n1, inv_n2, inv_n3 }; + uint64_t r[4]; + fr_mul(r, a, c); + + d0[i] = r[0]; d1[i] = r[1]; d2[i] = r[2]; d3[i] = r[3]; +} + +// ============================================================================= +// NTT Domain: holds twiddle factors on GPU +// ============================================================================= + +struct NTTDomain { + size_t size; + int log_size; + int tail_log; // adaptive: 11 or 12 based on GPU shared memory capacity + // SoA twiddle arrays, each n/2 elements + uint64_t *d_twiddles_fwd[4]; + uint64_t *d_twiddles_inv[4]; + // 1/n in Montgomery form + uint64_t inv_n[4]; +}; + +// ============================================================================= +// Host-side functions +// ============================================================================= + +NTTDomain *ntt_domain_create(size_t size, const uint64_t *fwd_twiddles_aos, + const uint64_t *inv_twiddles_aos, const uint64_t inv_n[4], + cudaStream_t stream) { + NTTDomain *dom = new NTTDomain; + dom->size = size; + + // Compute log2(size) + int log_n = 0; + { size_t tmp = size; while (tmp > 1) { tmp >>= 1; log_n++; } } + dom->log_size = log_n; + + // Query max shared memory per block for adaptive tail sizing + int max_shmem = 0; + cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + // tail_log=12 needs 4 * 4096 * 8 = 131072 bytes of shared memory + // tail_log=11 needs 4 * 2048 * 8 = 65536 bytes (fits in 99KB optin on Blackwell) + dom->tail_log = (max_shmem >= 131072 && log_n >= 12) ? 12 : 11; + + // Copy inv_n + for (int i = 0; i < 4; i++) dom->inv_n[i] = inv_n[i]; + + size_t half_n = size / 2; + + // Allocate device twiddle arrays (SoA) + for (int i = 0; i < 4; i++) { + cudaMalloc(&dom->d_twiddles_fwd[i], half_n * sizeof(uint64_t)); + cudaMalloc(&dom->d_twiddles_inv[i], half_n * sizeof(uint64_t)); + } + + if (half_n > 0) { + uint64_t *tw_aos = nullptr; + cudaMalloc(&tw_aos, half_n * 4 * sizeof(uint64_t)); + + // Forward twiddles: copy AoS once, transpose on GPU. + cudaMemcpyAsync(tw_aos, fwd_twiddles_aos, half_n * 4 * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + launch_transpose_aos_to_soa_fr(dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + tw_aos, half_n, stream); + // Reuse tw_aos for inverse twiddles only after forward transpose is done. + cudaStreamSynchronize(stream); + + // Inverse twiddles: copy AoS once, transpose on GPU. + cudaMemcpyAsync(tw_aos, inv_twiddles_aos, half_n * 4 * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + launch_transpose_aos_to_soa_fr(dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + tw_aos, half_n, stream); + + cudaStreamSynchronize(stream); + cudaFree(tw_aos); + } + + return dom; +} + +void ntt_domain_destroy(NTTDomain *dom) { + if (!dom) return; + for (int i = 0; i < 4; i++) { + cudaFree(dom->d_twiddles_fwd[i]); + cudaFree(dom->d_twiddles_inv[i]); + } + delete dom; +} + +// ============================================================================= +// Helper: launch DIF fused tail (dispatches tail_log=11 or 12) +// ============================================================================= + +static void launch_dif_tail(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + uint32_t n, int stage_start, cudaStream_t stream) { + int tail_log = dom->tail_log; + uint32_t span = 1u << tail_log; + unsigned tail_threads = (span > 1024) ? 1024 : span; + unsigned tail_blocks = (n + span - 1) / span; + size_t shmem_bytes = 4ull * span * sizeof(uint64_t); + if (tail_log == 12) { + cudaFuncSetAttribute(ntt_dif_tail_fused_kernel<12>, + cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_bytes); + ntt_dif_tail_fused_kernel<12><<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + n, stage_start); + } else { + cudaFuncSetAttribute(ntt_dif_tail_fused_kernel<11>, + cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_bytes); + ntt_dif_tail_fused_kernel<11><<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + n, stage_start); + } +} + +static void launch_dit_tail(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + uint32_t n, int stage_start, cudaStream_t stream) { + int tail_log = dom->tail_log; + uint32_t span = 1u << tail_log; + unsigned tail_threads = (span > 1024) ? 1024 : span; + unsigned tail_blocks = (n + span - 1) / span; + size_t shmem_bytes = 4ull * span * sizeof(uint64_t); + + if (tail_log == 12) { + cudaFuncSetAttribute(ntt_dit_tail_fused_kernel<12>, + cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_bytes); + ntt_dit_tail_fused_kernel<12><<>>( + d0, d1, d2, d3, + dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + n, stage_start); + } else { + cudaFuncSetAttribute(ntt_dit_tail_fused_kernel<11>, + cudaFuncAttributeMaxDynamicSharedMemorySize, shmem_bytes); + ntt_dit_tail_fused_kernel<11><<>>( + d0, d1, d2, d3, + dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + n, stage_start); + } +} + +static bool should_use_fused_tail(const NTTDomain *dom, uint32_t n) { + return (dom->log_size > dom->tail_log) && (n >= NTT_FUSED_TAIL_MIN_N); +} + +// ============================================================================= +// Forward NTT (DIF): radix-8 → radix-4 → radix-2 → fused tail +// ============================================================================= + +void launch_ntt_forward(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + cudaStream_t stream) { + uint32_t n = (uint32_t)dom->size; + uint32_t num_butterflies = n >> 1; + unsigned blocks_r2 = (num_butterflies + NTT_THREADS - 1) / NTT_THREADS; + uint32_t num_r8 = n >> 3; + unsigned blocks_r8 = (num_r8 + NTT_THREADS - 1) / NTT_THREADS; + + bool use_fused_tail = should_use_fused_tail(dom, n); + int regular_stages = dom->log_size; + if (use_fused_tail) { + regular_stages = dom->log_size - dom->tail_log; + } + + // DIF: radix-8 for 3 stages at a time, radix-2 for remainder + int s = 0; + for (; s + 2 < regular_stages; s += 3) { + ntt_dif_radix8_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + n, s); + } + for (; s < regular_stages; s++) { + uint32_t half = n >> (s + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << s; + ntt_dif_butterfly_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + num_butterflies, half, half_mask, tw_stride); + } + + if (use_fused_tail) { + launch_dif_tail(dom, d0, d1, d2, d3, n, regular_stages, stream); + } +} + +// ============================================================================= +// Fused CosetFFT forward: ScaleByPowers + DIF NTT in one pass. +// Stage 0 uses fused scale kernel, then radix-8/4/2 for remaining stages. +// ============================================================================= + +void launch_ntt_forward_coset(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + const uint64_t g[4], const uint64_t g_half[4], + cudaStream_t stream) { + uint32_t n = (uint32_t)dom->size; + uint32_t num_butterflies = n >> 1; + unsigned blocks_r2 = (num_butterflies + NTT_THREADS - 1) / NTT_THREADS; + uint32_t num_r8 = n >> 3; + unsigned blocks_r8 = (num_r8 + NTT_THREADS - 1) / NTT_THREADS; + + bool use_fused_tail = should_use_fused_tail(dom, n); + int regular_stages = dom->log_size; + if (use_fused_tail) { + regular_stages = dom->log_size - dom->tail_log; + } + + // Stage 0: fused ScaleByPowers + DIF butterfly + constexpr unsigned first_stage_items_per_thread = 4; + unsigned blocks_first_stage = + (num_butterflies + NTT_THREADS * first_stage_items_per_thread - 1) / + (NTT_THREADS * first_stage_items_per_thread); + ntt_dif_first_stage_fused_scale_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + g[0], g[1], g[2], g[3], + g_half[0], g_half[1], g_half[2], g_half[3], + num_butterflies); + + // Stages 1+: radix-8 for 3 stages at a time, radix-2 for remainder + int s = 1; + for (; s + 2 < regular_stages; s += 3) { + ntt_dif_radix8_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + n, s); + } + for (; s < regular_stages; s++) { + uint32_t half = n >> (s + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << s; + ntt_dif_butterfly_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_fwd[0], dom->d_twiddles_fwd[1], + dom->d_twiddles_fwd[2], dom->d_twiddles_fwd[3], + num_butterflies, half, half_mask, tw_stride); + } + + if (use_fused_tail) { + launch_dif_tail(dom, d0, d1, d2, d3, n, regular_stages, stream); + } +} + +// ============================================================================= +// Inverse NTT (DIT): fused tail → radix-8 → radix-4 → radix-2 +// Scale by 1/n is fused into the last kernel launch. +// ============================================================================= + +void launch_ntt_inverse(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + cudaStream_t stream) { + uint32_t n = (uint32_t)dom->size; + uint32_t num_butterflies = n >> 1; + unsigned blocks_r2 = (num_butterflies + NTT_THREADS - 1) / NTT_THREADS; + uint32_t num_r8 = n >> 3; + unsigned blocks_r8 = (num_r8 + NTT_THREADS - 1) / NTT_THREADS; + + bool use_fused_tail = should_use_fused_tail(dom, n); + + // DIT fused tail FIRST: stages log_n-1 down to log_n-tail_log + int first_regular; + if (use_fused_tail) { + launch_dit_tail(dom, d0, d1, d2, d3, n, dom->log_size - 1, stream); + first_regular = dom->log_size - dom->tail_log - 1; + } else { + first_regular = dom->log_size - 1; + } + + // Regular stages: from first_regular down to 0. + // Radix-8 for 3 stages at a time, radix-2 for remainder. + // The last kernel fuses the 1/n scale. + bool scaled = false; + int s = first_regular; + + // DIT radix-8: fuses stages (s, s-1, s-2) + for (; s - 2 >= 0; s -= 3) { + if (s < 3) { + // This is the last kernel — fuse scale + ntt_dit_radix8_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + n, s, + dom->inv_n[0], dom->inv_n[1], dom->inv_n[2], dom->inv_n[3]); + scaled = true; + } else { + ntt_dit_radix8_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + n, s, + 0, 0, 0, 0); + } + } + + // DIT radix-2 for remaining stages (0, 1, or 2 stages) + for (; s >= 0; s--) { + uint32_t half = n >> (s + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << s; + if (s == 0) { + // Last stage — fuse 1/n scale + ntt_dit_butterfly_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + num_butterflies, half, half_mask, tw_stride, + dom->inv_n[0], dom->inv_n[1], dom->inv_n[2], dom->inv_n[3]); + scaled = true; + } else { + ntt_dit_butterfly_kernel<<>>( + d0, d1, d2, d3, + dom->d_twiddles_inv[0], dom->d_twiddles_inv[1], + dom->d_twiddles_inv[2], dom->d_twiddles_inv[3], + num_butterflies, half, half_mask, tw_stride, + 0, 0, 0, 0); + } + } + + // Fallback: separate scale kernel (only for edge cases like n=1) + if (!scaled) { + unsigned blocks_n = (n + NTT_THREADS - 1) / NTT_THREADS; + ntt_scale_kernel<<>>( + d0, d1, d2, d3, + dom->inv_n[0], dom->inv_n[1], dom->inv_n[2], dom->inv_n[3], + n); + } +} + +void launch_ntt_bit_reverse(NTTDomain *dom, uint64_t *d0, uint64_t *d1, uint64_t *d2, uint64_t *d3, + cudaStream_t stream) { + size_t n = dom->size; + constexpr unsigned threads = 256; + unsigned blocks = (n + threads - 1) / threads; + ntt_bit_reverse_kernel<<>>(d0, d1, d2, d3, n, dom->log_size); +} + +// Accessor: get forward twiddle pointers (used by PlonK constraint kernel) +void ntt_get_fwd_twiddles(const NTTDomain *dom, const uint64_t **out_ptrs) { + for (int i = 0; i < 4; i++) { + out_ptrs[i] = dom->d_twiddles_fwd[i]; + } +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/plonk_eval.cu b/prover/gpu/cuda/src/plonk/plonk_eval.cu new file mode 100644 index 00000000000..ac20ba76a4b --- /dev/null +++ b/prover/gpu/cuda/src/plonk/plonk_eval.cu @@ -0,0 +1,95 @@ +// ═══════════════════════════════════════════════════════════════════════════════ +// GPU chunked Horner polynomial evaluation at a single point z +// +// Evaluates p(z) = c₀ + c₁z + c₂z² + ... + cₙ₋₁zⁿ⁻¹ +// +// Strategy: Divide n coefficients into K chunks of 1024, each thread evaluates +// its chunk independently via Horner's method, then CPU combines. +// +// Chunk j evaluates: partial[j] = c[jK] + z·(c[jK+1] + z·(... + z·c[(j+1)K-1])) +// +// Full result: p(z) = partial[0] + z^K · partial[1] + z^(2K) · partial[2] + ... +// = Σⱼ partial[j] · z^(jK) +// +// The CPU computes z^K once, then combines via Horner on the partial results: +// p(z) = partial[0] + z^K · (partial[1] + z^K · (partial[2] + ...)) +// +// For n = 2²⁷: 131072 chunks × 1024 coeffs/chunk × 1023 fr_mul/chunk +// Each thread does 1023 sequential fr_muls — heavily compute-bound, ideal for GPU. +// ═══════════════════════════════════════════════════════════════════════════════ + +#include "fr_arith.cuh" +#include + +namespace gnark_gpu { + +constexpr int EVAL_CHUNK_SIZE = 1024; + +// Each thread evaluates one chunk of coefficients via Horner's method. +// chunk j computes: partial[j] = c[j*K] + z*(c[j*K+1] + z*(...+ z*c[(j+1)*K-1])) +// where K = EVAL_CHUNK_SIZE. +// The full polynomial is recovered as: p(z) = Σ_j partial[j] * z^(j*K). +__global__ void poly_eval_chunks_kernel( + const uint64_t *__restrict__ c0, + const uint64_t *__restrict__ c1, + const uint64_t *__restrict__ c2, + const uint64_t *__restrict__ c3, + const uint64_t z0, const uint64_t z1, + const uint64_t z2, const uint64_t z3, + uint64_t *__restrict__ out0, + uint64_t *__restrict__ out1, + uint64_t *__restrict__ out2, + uint64_t *__restrict__ out3, + size_t n) +{ + size_t chunk = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t start = chunk * EVAL_CHUNK_SIZE; + if (start >= n) return; + + size_t end = start + EVAL_CHUNK_SIZE; + if (end > n) end = n; + + uint64_t z[4] = {z0, z1, z2, z3}; + + // Horner: result = c[end-1]; for i = end-2 downto start: result = result*z + c[i] + size_t last = end - 1; + uint64_t r[4] = {c0[last], c1[last], c2[last], c3[last]}; + + for (size_t i = last; i > start; ) { + --i; + uint64_t t[4]; + fr_mul(t, r, z); + uint64_t ci[4] = {c0[i], c1[i], c2[i], c3[i]}; + fr_add(r, t, ci); + } + + out0[chunk] = r[0]; + out1[chunk] = r[1]; + out2[chunk] = r[2]; + out3[chunk] = r[3]; +} + +// Launch the chunked Horner evaluation kernel. +// out_partials must be pre-allocated device memory for (num_chunks) elements in SoA. +// Returns the number of chunks in *num_chunks_out. +void launch_poly_eval_chunks( + const uint64_t *c0, const uint64_t *c1, + const uint64_t *c2, const uint64_t *c3, + const uint64_t z[4], + uint64_t *out0, uint64_t *out1, + uint64_t *out2, uint64_t *out3, + size_t n, size_t *num_chunks_out, + cudaStream_t stream) +{ + size_t nc = (n + EVAL_CHUNK_SIZE - 1) / EVAL_CHUNK_SIZE; + *num_chunks_out = nc; + + constexpr unsigned threads = 256; + unsigned blocks = (nc + threads - 1) / threads; + poly_eval_chunks_kernel<<>>( + c0, c1, c2, c3, + z[0], z[1], z[2], z[3], + out0, out1, out2, out3, n); +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk/plonk_z.cu b/prover/gpu/cuda/src/plonk/plonk_z.cu new file mode 100644 index 00000000000..bc475f6df48 --- /dev/null +++ b/prover/gpu/cuda/src/plonk/plonk_z.cu @@ -0,0 +1,184 @@ +// ═══════════════════════════════════════════════════════════════════════════════ +// GPU Z-polynomial prefix product for PlonK permutation argument +// +// Computes Z[i] = Π_{k=0}^{i-1} ratio[k] (prefix product of ratio vector) +// +// The Z polynomial encodes the permutation argument in PlonK: +// ratio[i] = (L[i]+β·ω^i+γ)(R[i]+β·k₁·ω^i+γ)(O[i]+β·k₂·ω^i+γ) +// ───────────────────────────────────────────────────── +// (L[i]+β·S₁(ω^i)+γ)(R[i]+β·S₂(ω^i)+γ)(O[i]+β·S₃(ω^i)+γ) +// +// Three-phase parallel scan with GPU/CPU hybrid: +// +// ratio: [r₀ r₁ r₂ r₃ | r₄ r₅ r₆ r₇ | r₈ r₉ ...] +// ─── chunk 0 ── ─── chunk 1 ── ─ chunk 2 ─ +// +// Phase 1 (GPU): Local prefix product within each chunk of 1024 elements. +// chunk 0: [r₀, r₀r₁, r₀r₁r₂, r₀r₁r₂r₃] +// chunk 1: [r₄, r₄r₅, r₄r₅r₆, r₄r₅r₆r₇] +// → chunk_products: [r₀r₁r₂r₃, r₄r₅r₆r₇, ...] +// +// Phase 2 (CPU): Sequential scan of ~n/1024 chunk products. +// scanned_prefix[0] = cp[0] +// scanned_prefix[1] = cp[0] · cp[1] +// → At most ~131K elements for n=2²⁷, trivial on CPU. +// +// Phase 3 (GPU): Multiply each chunk's elements by its global prefix. +// chunk 1: each elem *= scanned_prefix[0] +// chunk 2: each elem *= scanned_prefix[1] +// Then shift right by 1: Z[0] = 1, Z[i] = prefix_product[i-1] +// ═══════════════════════════════════════════════════════════════════════════════ + +#include "fr_arith.cuh" +#include + +namespace gnark_gpu { + +constexpr size_t Z_CHUNK_SIZE = 1024; + +// Phase 1: Each thread processes one chunk of Z_CHUNK_SIZE ratios. +// Computes prefix products within the chunk in-place. +// z_out[chunk_start] = ratio[chunk_start] +// z_out[chunk_start+1] = ratio[chunk_start] * ratio[chunk_start+1] +// ... +// chunk_products[chunk_id] = product of all ratios in this chunk +__global__ void z_prefix_local_kernel( + uint64_t *__restrict__ z0, uint64_t *__restrict__ z1, + uint64_t *__restrict__ z2, uint64_t *__restrict__ z3, + const uint64_t *__restrict__ r0, const uint64_t *__restrict__ r1, + const uint64_t *__restrict__ r2, const uint64_t *__restrict__ r3, + uint64_t *__restrict__ cp0, uint64_t *__restrict__ cp1, + uint64_t *__restrict__ cp2, uint64_t *__restrict__ cp3, + size_t n) +{ + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + Z_CHUNK_SIZE - 1) / Z_CHUNK_SIZE; + if (chunk_id >= num_chunks) return; + + size_t start = chunk_id * Z_CHUNK_SIZE; + size_t end = start + Z_CHUNK_SIZE; + if (end > n) end = n; + + // Prefix product: z[start] = r[start], z[i] = z[i-1] * r[i] + uint64_t acc[4] = {r0[start], r1[start], r2[start], r3[start]}; + z0[start] = acc[0]; z1[start] = acc[1]; z2[start] = acc[2]; z3[start] = acc[3]; + + for (size_t i = start + 1; i < end; i++) { + uint64_t elem[4] = {r0[i], r1[i], r2[i], r3[i]}; + uint64_t prod[4]; + fr_mul(prod, acc, elem); + acc[0] = prod[0]; acc[1] = prod[1]; + acc[2] = prod[2]; acc[3] = prod[3]; + z0[i] = acc[0]; z1[i] = acc[1]; z2[i] = acc[2]; z3[i] = acc[3]; + } + + // Store chunk product + cp0[chunk_id] = acc[0]; cp1[chunk_id] = acc[1]; + cp2[chunk_id] = acc[2]; cp3[chunk_id] = acc[3]; +} + +// Phase 3: Apply global prefix fixup. +// For chunk k > 0: z[i] *= scanned_prefix[k-1] +__global__ void z_prefix_fixup_kernel( + uint64_t *__restrict__ z0, uint64_t *__restrict__ z1, + uint64_t *__restrict__ z2, uint64_t *__restrict__ z3, + const uint64_t *__restrict__ sp0, const uint64_t *__restrict__ sp1, + const uint64_t *__restrict__ sp2, const uint64_t *__restrict__ sp3, + size_t n) +{ + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + Z_CHUNK_SIZE - 1) / Z_CHUNK_SIZE; + if (chunk_id == 0 || chunk_id >= num_chunks) return; + + size_t start = chunk_id * Z_CHUNK_SIZE; + size_t end = start + Z_CHUNK_SIZE; + if (end > n) end = n; + + uint64_t prefix[4] = {sp0[chunk_id-1], sp1[chunk_id-1], + sp2[chunk_id-1], sp3[chunk_id-1]}; + + for (size_t i = start; i < end; i++) { + uint64_t elem[4] = {z0[i], z1[i], z2[i], z3[i]}; + uint64_t prod[4]; + fr_mul(prod, prefix, elem); + z0[i] = prod[0]; z1[i] = prod[1]; z2[i] = prod[2]; z3[i] = prod[3]; + } +} + +// Shift right by 1: z[i] = z[i-1] for i > 0, z[0] = Montgomery 1. +// After this, z[i] = product(ratio[0..i-1]) which is the Z polynomial. +__global__ void z_shift_right_kernel( + uint64_t *__restrict__ z0, uint64_t *__restrict__ z1, + uint64_t *__restrict__ z2, uint64_t *__restrict__ z3, + const uint64_t *__restrict__ src0, const uint64_t *__restrict__ src1, + const uint64_t *__restrict__ src2, const uint64_t *__restrict__ src3, + size_t n) +{ + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + + if (i == 0) { + z0[0] = Fr_params::ONE[0]; z1[0] = Fr_params::ONE[1]; + z2[0] = Fr_params::ONE[2]; z3[0] = Fr_params::ONE[3]; + } else { + z0[i] = src0[i-1]; z1[i] = src1[i-1]; + z2[i] = src2[i-1]; z3[i] = src3[i-1]; + } +} + +// Phase 1 launch: requires caller to provide cp[4] device arrays. +cudaError_t launch_z_prefix_phase1( + uint64_t *z0, uint64_t *z1, uint64_t *z2, uint64_t *z3, + const uint64_t *r0, const uint64_t *r1, const uint64_t *r2, const uint64_t *r3, + uint64_t *cp[4], + size_t n, cudaStream_t stream) +{ + if (n == 0) return cudaSuccess; + + size_t num_chunks = (n + Z_CHUNK_SIZE - 1) / Z_CHUNK_SIZE; + constexpr unsigned threads = 256; + unsigned blocks = (num_chunks + threads - 1) / threads; + + z_prefix_local_kernel<<>>( + z0, z1, z2, z3, r0, r1, r2, r3, + cp[0], cp[1], cp[2], cp[3], n); + + return cudaSuccess; +} + +// Phase 3 launch: requires caller to provide sp[4] device arrays (already uploaded). +cudaError_t launch_z_prefix_phase3( + uint64_t *z0, uint64_t *z1, uint64_t *z2, uint64_t *z3, + uint64_t *temp0, uint64_t *temp1, uint64_t *temp2, uint64_t *temp3, + uint64_t *sp[4], + size_t num_chunks, size_t n, cudaStream_t stream) +{ + if (n == 0) return cudaSuccess; + + constexpr unsigned threads = 256; + unsigned blocks_chunks = (num_chunks + threads - 1) / threads; + unsigned blocks_n = (n + threads - 1) / threads; + + // Apply global fixup + z_prefix_fixup_kernel<<>>( + z0, z1, z2, z3, sp[0], sp[1], sp[2], sp[3], n); + + // Shift right by 1: z[0]=1, z[i]=z_old[i-1] + // Copy z→temp, then shift temp→z + cudaError_t err; + err = cudaMemcpyAsync(temp0, z0, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(temp1, z1, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(temp2, z2, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + err = cudaMemcpyAsync(temp3, z3, n * sizeof(uint64_t), cudaMemcpyDeviceToDevice, stream); + if (err != cudaSuccess) return err; + + z_shift_right_kernel<<>>( + z0, z1, z2, z3, temp0, temp1, temp2, temp3, n); + + return cudaSuccess; +} + +} // namespace gnark_gpu diff --git a/prover/gpu/cuda/src/plonk2/ec.cuh b/prover/gpu/cuda/src/plonk2/ec.cuh new file mode 100644 index 00000000000..9b050680832 --- /dev/null +++ b/prover/gpu/cuda/src/plonk2/ec.cuh @@ -0,0 +1,338 @@ +#pragma once + +// Curve-generic short-Weierstrass elliptic-curve formulas for gpu/plonk2. +// +// All arithmetic is templated on a base-field Params struct (see field.cuh). +// We use the standard textbook formulas: +// - Affine point AoS: [X.l0..lN, Y.l0..lN] +// - Jacobian point: (X, Y, Z) representing (X/Z^2, Y/Z^3) +// - Infinity: Jacobian Z == 0; affine (0, 0) +// +// The MSM uses the mixed Jacobian + affine addition (Z2 = 1 implicit). +// For affine-affine addition (used only by single-point validation kernels) +// we promote first. + +#include "field.cuh" + +#include + +namespace gnark_gpu::plonk2 { + +template +struct AffinePoint { + uint64_t x[Fp::LIMBS]; + uint64_t y[Fp::LIMBS]; +}; + +template +struct JacobianPoint { + uint64_t x[Fp::LIMBS]; + uint64_t y[Fp::LIMBS]; + uint64_t z[Fp::LIMBS]; +}; + +template +__device__ __forceinline__ bool affine_is_infinity(const AffinePoint &p) { + return is_zero(p.x) && is_zero(p.y); +} + +template +__device__ __forceinline__ bool jacobian_is_infinity(const JacobianPoint &p) { + return is_zero(p.z); +} + +template +__device__ __forceinline__ void jacobian_set_infinity(JacobianPoint &p) { + one(p.x); + one(p.y); + zero(p.z); +} + +template +__device__ __forceinline__ void jacobian_from_affine(JacobianPoint &out, + const AffinePoint &p) { + if(affine_is_infinity(p)) { + jacobian_set_infinity(out); + return; + } + set(out.x, p.x); + set(out.y, p.y); + one(out.z); +} + +// out = 2 * a, a in affine. Result in Jacobian. +// Standard "mdbl-2007-bl" formulas. +template +__device__ __forceinline__ void jacobian_double_mixed(JacobianPoint &out, + const AffinePoint &a) { + if(affine_is_infinity(a) || is_zero(a.y)) { + jacobian_set_infinity(out); + return; + } + + uint64_t xx[Fp::LIMBS], yy[Fp::LIMBS], yyyy[Fp::LIMBS]; + uint64_t s[Fp::LIMBS], m[Fp::LIMBS], t[Fp::LIMBS]; + uint64_t tmp[Fp::LIMBS]; + + square(xx, a.x); + square(yy, a.y); + square(yyyy, yy); + + add(s, a.x, yy); + square(s, s); + sub(s, s, xx); + sub(s, s, yyyy); + double_element(s, s); + + double_element(m, xx); + add(m, m, xx); + + square(t, m); + sub(t, t, s); + sub(t, t, s); + + set(out.x, t); + + sub(tmp, s, t); + mul(out.y, tmp, m); + double_element(yyyy, yyyy); + double_element(yyyy, yyyy); + double_element(yyyy, yyyy); + sub(out.y, out.y, yyyy); + + double_element(out.z, a.y); +} + +// out = a + b, both affine, distinct, neither infinity. Result in Jacobian. +// Standard "mmadd-2007-bl" plus fallbacks for inf and a==b cases. +template +__device__ __forceinline__ void jacobian_add_affine_affine(JacobianPoint &out, + const AffinePoint &a, + const AffinePoint &b) { + if(affine_is_infinity(a)) { + jacobian_from_affine(out, b); + return; + } + if(affine_is_infinity(b)) { + jacobian_from_affine(out, a); + return; + } + if(equal(a.x, b.x)) { + if(equal(a.y, b.y)) { + jacobian_double_mixed(out, a); + } else { + jacobian_set_infinity(out); + } + return; + } + + uint64_t h[Fp::LIMBS], hh[Fp::LIMBS], i[Fp::LIMBS]; + uint64_t j[Fp::LIMBS], r[Fp::LIMBS], v[Fp::LIMBS]; + uint64_t tmp[Fp::LIMBS]; + + sub(h, b.x, a.x); + square(hh, h); + double_element(i, hh); + double_element(i, i); + mul(j, h, i); + sub(r, b.y, a.y); + double_element(r, r); + mul(v, a.x, i); + + square(out.x, r); + sub(out.x, out.x, j); + sub(out.x, out.x, v); + sub(out.x, out.x, v); + + sub(tmp, v, out.x); + mul(out.y, tmp, r); + mul(j, a.y, j); + double_element(j, j); + sub(out.y, out.y, j); + + double_element(out.z, h); +} + +// out = a + b, a Jacobian, b affine. Result Jacobian. +// Standard "madd-2007-bl" with fallbacks. +template +__device__ __forceinline__ void jacobian_add_jacobian_affine(JacobianPoint &out, + const JacobianPoint &a, + const AffinePoint &b) { + if(jacobian_is_infinity(a)) { + jacobian_from_affine(out, b); + return; + } + if(affine_is_infinity(b)) { + set(out.x, a.x); + set(out.y, a.y); + set(out.z, a.z); + return; + } + + uint64_t z1z1[Fp::LIMBS], u2[Fp::LIMBS], s2[Fp::LIMBS]; + uint64_t h[Fp::LIMBS], hh[Fp::LIMBS], i[Fp::LIMBS], j[Fp::LIMBS]; + uint64_t r[Fp::LIMBS], v[Fp::LIMBS], tmp[Fp::LIMBS]; + + square(z1z1, a.z); + mul(u2, b.x, z1z1); + mul(s2, b.y, a.z); + mul(s2, s2, z1z1); + + if(equal(a.x, u2)) { + if(equal(a.y, s2)) { + jacobian_double_mixed(out, b); + return; + } + jacobian_set_infinity(out); + return; + } + + sub(h, u2, a.x); + square(hh, h); + double_element(i, hh); + double_element(i, i); + mul(j, h, i); + + sub(r, s2, a.y); + double_element(r, r); + mul(v, a.x, i); + + square(out.x, r); + sub(out.x, out.x, j); + sub(out.x, out.x, v); + sub(out.x, out.x, v); + + sub(tmp, v, out.x); + mul(out.y, tmp, r); + mul(j, a.y, j); + double_element(j, j); + sub(out.y, out.y, j); + + add(out.z, a.z, h); + square(out.z, out.z); + sub(out.z, out.z, z1z1); + sub(out.z, out.z, hh); +} + +// out = 2a, a Jacobian. +// Standard "dbl-2009-l" (works for a != b == 0 curves, BN/BLS/BW6 are b != 0 +// but the formula does not depend on the curve coefficient since a' == 0 +// for these short-Weierstrass curves with Y^2 = X^3 + b). +template +__device__ __forceinline__ void jacobian_double(JacobianPoint &out, + const JacobianPoint &a) { + if(jacobian_is_infinity(a)) { + jacobian_set_infinity(out); + return; + } + + uint64_t a2[Fp::LIMBS], b2[Fp::LIMBS], c2[Fp::LIMBS]; + uint64_t d2[Fp::LIMBS], e2[Fp::LIMBS], f2[Fp::LIMBS], tmp[Fp::LIMBS]; + + square(a2, a.x); + square(b2, a.y); + square(c2, b2); + + add(d2, a.x, b2); + square(d2, d2); + sub(d2, d2, a2); + sub(d2, d2, c2); + double_element(d2, d2); + + double_element(e2, a2); + add(e2, e2, a2); + square(f2, e2); + + sub(out.x, f2, d2); + sub(out.x, out.x, d2); + + sub(tmp, d2, out.x); + mul(out.y, tmp, e2); + double_element(c2, c2); + double_element(c2, c2); + double_element(c2, c2); + sub(out.y, out.y, c2); + + mul(out.z, a.y, a.z); + double_element(out.z, out.z); +} + +// out = a + b, both Jacobian. +// Standard "add-2007-bl" with fallbacks. +template +__device__ __forceinline__ void jacobian_add(JacobianPoint &out, + const JacobianPoint &a, + const JacobianPoint &b) { + if(jacobian_is_infinity(a)) { + set(out.x, b.x); + set(out.y, b.y); + set(out.z, b.z); + return; + } + if(jacobian_is_infinity(b)) { + set(out.x, a.x); + set(out.y, a.y); + set(out.z, a.z); + return; + } + + uint64_t z1z1[Fp::LIMBS], z2z2[Fp::LIMBS]; + uint64_t u1[Fp::LIMBS], u2[Fp::LIMBS], s1[Fp::LIMBS], s2[Fp::LIMBS]; + uint64_t h[Fp::LIMBS], i[Fp::LIMBS], j[Fp::LIMBS], r[Fp::LIMBS], v[Fp::LIMBS]; + uint64_t tmp[Fp::LIMBS]; + + square(z1z1, a.z); + square(z2z2, b.z); + mul(u1, a.x, z2z2); + mul(u2, b.x, z1z1); + mul(s1, a.y, b.z); + mul(s1, s1, z2z2); + mul(s2, b.y, a.z); + mul(s2, s2, z1z1); + + if(equal(u1, u2)) { + if(equal(s1, s2)) { + jacobian_double(out, a); + return; + } + jacobian_set_infinity(out); + return; + } + + sub(h, u2, u1); + double_element(i, h); + square(i, i); + mul(j, h, i); + sub(r, s2, s1); + double_element(r, r); + mul(v, u1, i); + + square(out.x, r); + sub(out.x, out.x, j); + sub(out.x, out.x, v); + sub(out.x, out.x, v); + + sub(tmp, v, out.x); + mul(out.y, tmp, r); + mul(j, s1, j); + double_element(j, j); + sub(out.y, out.y, j); + + add(out.z, a.z, b.z); + square(out.z, out.z); + sub(out.z, out.z, z1z1); + sub(out.z, out.z, z2z2); + mul(out.z, out.z, h); +} + +// out = -a (negate Y). +template +__device__ __forceinline__ void jacobian_neg(JacobianPoint &out, + const JacobianPoint &a) { + set(out.x, a.x); + neg(out.y, a.y); + set(out.z, a.z); +} + +} // namespace gnark_gpu::plonk2 diff --git a/prover/gpu/cuda/src/plonk2/field.cuh b/prover/gpu/cuda/src/plonk2/field.cuh new file mode 100644 index 00000000000..6c4a9690c3f --- /dev/null +++ b/prover/gpu/cuda/src/plonk2/field.cuh @@ -0,0 +1,844 @@ +#pragma once + +// Curve-generic scalar-field primitives for gpu/plonk2. +// +// This layer favors one small, auditable implementation over per-curve copied +// arithmetic. It is intentionally separate from src/plonk/fr_arith.cuh, which +// remains the optimized BLS12-377 path used by the existing prover. + +#include "gnark_gpu.h" + +#include +#include + +#ifdef __CUDACC__ +#include +#else +#define __host__ +#define __device__ +#define __forceinline__ +#endif + +namespace gnark_gpu::plonk2 { + +static constexpr int MAX_FIELD_LIMBS = 12; +static constexpr int MAX_FR_LIMBS = MAX_FIELD_LIMBS; + +struct FrView { + uint64_t *limbs[MAX_FR_LIMBS]; +}; + +struct ConstFrView { + const uint64_t *limbs[MAX_FR_LIMBS]; +}; + +__host__ __device__ __forceinline__ ConstFrView make_const(FrView v) { + ConstFrView out{}; +#pragma unroll + for(int i = 0; i < MAX_FR_LIMBS; i++) out.limbs[i] = v.limbs[i]; + return out; +} + +struct BN254FrParams { + static constexpr int LIMBS = 4; + static constexpr int BITS = 254; + static constexpr gnark_gpu_plonk2_curve_id_t CURVE = GNARK_GPU_PLONK2_CURVE_BN254; + static constexpr uint64_t INV = 0xc2e1f593efffffffULL; + static constexpr uint64_t MODULUS[MAX_FR_LIMBS] = { + 0x43e1f593f0000001ULL, + 0x2833e84879b97091ULL, + 0xb85045b68181585dULL, + 0x30644e72e131a029ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, + }; +}; + +struct BLS12377FrParams { + static constexpr int LIMBS = 4; + static constexpr int BITS = 253; + static constexpr gnark_gpu_plonk2_curve_id_t CURVE = GNARK_GPU_PLONK2_CURVE_BLS12_377; + static constexpr uint64_t INV = 0x0a117fffffffffffULL; + static constexpr uint64_t MODULUS[MAX_FR_LIMBS] = { + 0x0a11800000000001ULL, + 0x59aa76fed0000001ULL, + 0x60b44d1e5c37b001ULL, + 0x12ab655e9a2ca556ULL, + 0x0000000000000000ULL, + 0x0000000000000000ULL, + }; +}; + +struct BW6761FrParams { + static constexpr int LIMBS = 6; + static constexpr int BITS = 377; + static constexpr gnark_gpu_plonk2_curve_id_t CURVE = GNARK_GPU_PLONK2_CURVE_BW6_761; + static constexpr uint64_t INV = 0x8508bfffffffffffULL; + static constexpr uint64_t MODULUS[MAX_FR_LIMBS] = { + 0x8508c00000000001ULL, + 0x170b5d4430000000ULL, + 0x1ef3622fba094800ULL, + 0x1a22d9f300f5138fULL, + 0xc63b05c06ca1493bULL, + 0x01ae3a4617c510eaULL, + }; +}; + +struct BN254FpParams { + static constexpr int LIMBS = 4; + static constexpr int BITS = 254; + static constexpr uint64_t INV = 0x87d20782e4866389ULL; +}; + +struct BLS12377FpParams { + static constexpr int LIMBS = 6; + static constexpr int BITS = 377; + static constexpr uint64_t INV = 0x8508bfffffffffffULL; +}; + +struct BW6761FpParams { + static constexpr int LIMBS = 12; + static constexpr int BITS = 761; + static constexpr uint64_t INV = 0x0a5593568fa798ddULL; +}; + +template +__device__ __forceinline__ uint64_t modulus_limb(int i); + +template <> +__device__ __forceinline__ uint64_t modulus_limb(int i) { + switch(i) { + case 0: + return 0x43e1f593f0000001ULL; + case 1: + return 0x2833e84879b97091ULL; + case 2: + return 0xb85045b68181585dULL; + case 3: + return 0x30644e72e131a029ULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t modulus_limb(int i) { + switch(i) { + case 0: + return 0x0a11800000000001ULL; + case 1: + return 0x59aa76fed0000001ULL; + case 2: + return 0x60b44d1e5c37b001ULL; + case 3: + return 0x12ab655e9a2ca556ULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t modulus_limb(int i) { + switch(i) { + case 0: + return 0x8508c00000000001ULL; + case 1: + return 0x170b5d4430000000ULL; + case 2: + return 0x1ef3622fba094800ULL; + case 3: + return 0x1a22d9f300f5138fULL; + case 4: + return 0xc63b05c06ca1493bULL; + case 5: + return 0x01ae3a4617c510eaULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t modulus_limb(int i) { + switch(i) { + case 0: + return 0x3c208c16d87cfd47ULL; + case 1: + return 0x97816a916871ca8dULL; + case 2: + return 0xb85045b68181585dULL; + case 3: + return 0x30644e72e131a029ULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t modulus_limb(int i) { + switch(i) { + case 0: + return 0x8508c00000000001ULL; + case 1: + return 0x170b5d4430000000ULL; + case 2: + return 0x1ef3622fba094800ULL; + case 3: + return 0x1a22d9f300f5138fULL; + case 4: + return 0xc63b05c06ca1493bULL; + case 5: + return 0x01ae3a4617c510eaULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t modulus_limb(int i) { + switch(i) { + case 0: + return 0xf49d00000000008bULL; + case 1: + return 0xe6913e6870000082ULL; + case 2: + return 0x160cf8aeeaf0a437ULL; + case 3: + return 0x98a116c25667a8f8ULL; + case 4: + return 0x71dcd3dc73ebff2eULL; + case 5: + return 0x8689c8ed12f9fd90ULL; + case 6: + return 0x03cebaff25b42304ULL; + case 7: + return 0x707ba638e584e919ULL; + case 8: + return 0x528275ef8087be41ULL; + case 9: + return 0xb926186a81d14688ULL; + case 10: + return 0xd187c94004faff3eULL; + case 11: + return 0x0122e824fb83ce0aULL; + default: + return 0; + } +} + +template +__device__ __forceinline__ uint64_t one_limb(int i); + +template <> +__device__ __forceinline__ uint64_t one_limb(int i) { + switch(i) { + case 0: + return 0xac96341c4ffffffbULL; + case 1: + return 0x36fc76959f60cd29ULL; + case 2: + return 0x666ea36f7879462eULL; + case 3: + return 0x0e0a77c19a07df2fULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t one_limb(int i) { + switch(i) { + case 0: + return 0x7d1c7ffffffffff3ULL; + case 1: + return 0x7257f50f6ffffff2ULL; + case 2: + return 0x16d81575512c0feeULL; + case 3: + return 0x0d4bda322bbb9a9dULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t one_limb(int i) { + switch(i) { + case 0: + return 0x02cdffffffffff68ULL; + case 1: + return 0x51409f837fffffb1ULL; + case 2: + return 0x9f7db3a98a7d3ff2ULL; + case 3: + return 0x7b4e97b76e7c6305ULL; + case 4: + return 0x4cf495bf803c84e8ULL; + case 5: + return 0x008d6661e2fdf49aULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t one_limb(int i) { + switch(i) { + case 0: + return 0xd35d438dc58f0d9dULL; + case 1: + return 0x0a78eb28f5c70b3dULL; + case 2: + return 0x666ea36f7879462cULL; + case 3: + return 0x0e0a77c19a07df2fULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t one_limb(int i) { + switch(i) { + case 0: + return 0x02cdffffffffff68ULL; + case 1: + return 0x51409f837fffffb1ULL; + case 2: + return 0x9f7db3a98a7d3ff2ULL; + case 3: + return 0x7b4e97b76e7c6305ULL; + case 4: + return 0x4cf495bf803c84e8ULL; + case 5: + return 0x008d6661e2fdf49aULL; + default: + return 0; + } +} + +template <> +__device__ __forceinline__ uint64_t one_limb(int i) { + switch(i) { + case 0: + return 0x0202ffffffff85d5ULL; + case 1: + return 0x5a5826358fff8ce7ULL; + case 2: + return 0x9e996e43827faadeULL; + case 3: + return 0xda6aff320ee47df4ULL; + case 4: + return 0xece9cb3e1d94b80bULL; + case 5: + return 0xc0e667a25248240bULL; + case 6: + return 0xa74da5bfdcad3905ULL; + case 7: + return 0x2352e7fe462f2103ULL; + case 8: + return 0x7b56588008b1c87cULL; + case 9: + return 0x45848a63e711022fULL; + case 10: + return 0xd7a81ebb9f65a9dfULL; + case 11: + return 0x0051f77ef127e87dULL; + default: + return 0; + } +} + +__host__ __device__ __forceinline__ int curve_base_limbs(gnark_gpu_plonk2_curve_id_t curve) { + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return BN254FpParams::LIMBS; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return BLS12377FpParams::LIMBS; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return BW6761FpParams::LIMBS; + default: + return 0; + } +} + +__host__ __device__ __forceinline__ int curve_limbs(gnark_gpu_plonk2_curve_id_t curve) { + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return 4; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return 6; + default: + return 0; + } +} + +__host__ __device__ __forceinline__ int curve_bits(gnark_gpu_plonk2_curve_id_t curve) { + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return BN254FrParams::BITS; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return BLS12377FrParams::BITS; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return BW6761FrParams::BITS; + default: + return 0; + } +} + +__device__ __forceinline__ uint64_t add_carry(uint64_t a, uint64_t b, uint64_t &carry) { + uint64_t s = a + b; + uint64_t c = s < a; + uint64_t r = s + carry; + c += r < s; + carry = c; + return r; +} + +__device__ __forceinline__ uint64_t sub_borrow(uint64_t a, uint64_t b, uint64_t &borrow) { + uint64_t bb = b + borrow; + uint64_t bcarry = bb < b; + uint64_t r = a - bb; + borrow = (a < bb) || bcarry; + return r; +} + +template +__device__ __forceinline__ void load(uint64_t out[Params::LIMBS], ConstFrView v, size_t idx) { +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + out[i] = __ldg(v.limbs[i] + idx); + } +} + +template +__device__ __forceinline__ void store(FrView v, size_t idx, const uint64_t in[Params::LIMBS]) { +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + v.limbs[i][idx] = in[i]; + } +} + +template +__device__ __forceinline__ uint64_t subtract_modulus( + uint64_t out[Params::LIMBS], const uint64_t in[Params::LIMBS]) { + uint64_t borrow = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + out[i] = sub_borrow(in[i], modulus_limb(i), borrow); + } + return borrow; +} + +template +__device__ __forceinline__ void add(uint64_t r[Params::LIMBS], + const uint64_t a[Params::LIMBS], + const uint64_t b[Params::LIMBS]) { + uint64_t sum[Params::LIMBS]; + uint64_t carry = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + sum[i] = add_carry(a[i], b[i], carry); + } + + uint64_t reduced[Params::LIMBS]; + uint64_t borrow = subtract_modulus(reduced, sum); + bool use_reduced = carry != 0 || borrow == 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + r[i] = use_reduced ? reduced[i] : sum[i]; + } +} + +template +__device__ __forceinline__ void sub(uint64_t r[Params::LIMBS], + const uint64_t a[Params::LIMBS], + const uint64_t b[Params::LIMBS]) { + uint64_t diff[Params::LIMBS]; + uint64_t borrow = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + diff[i] = sub_borrow(a[i], b[i], borrow); + } + + if(borrow == 0) { +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) r[i] = diff[i]; + return; + } + + uint64_t carry = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + r[i] = add_carry(diff[i], modulus_limb(i), carry); + } +} + +template +__device__ __forceinline__ void mul(uint64_t r[Params::LIMBS], + const uint64_t a[Params::LIMBS], + const uint64_t b[Params::LIMBS]) { + uint64_t t[Params::LIMBS + 1]; +#pragma unroll + for(int i = 0; i <= Params::LIMBS; i++) t[i] = 0; + +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + unsigned __int128 carry = 0; +#pragma unroll + for(int j = 0; j < Params::LIMBS; j++) { + unsigned __int128 uv = + (unsigned __int128)t[j] + + (unsigned __int128)a[j] * (unsigned __int128)b[i] + + carry; + t[j] = (uint64_t)uv; + carry = uv >> 64; + } + unsigned __int128 top = (unsigned __int128)t[Params::LIMBS] + carry; + t[Params::LIMBS] = (uint64_t)top; + + uint64_t m = t[0] * Params::INV; + carry = 0; +#pragma unroll + for(int j = 0; j < Params::LIMBS; j++) { + unsigned __int128 uv = + (unsigned __int128)t[j] + + (unsigned __int128)m * (unsigned __int128)modulus_limb(j) + + carry; + uint64_t word = (uint64_t)uv; + carry = uv >> 64; + if(j > 0) t[j - 1] = word; + } + top = (unsigned __int128)t[Params::LIMBS] + carry; + t[Params::LIMBS - 1] = (uint64_t)top; + t[Params::LIMBS] = (uint64_t)(top >> 64); + } + + uint64_t candidate[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) candidate[i] = t[i]; + + uint64_t reduced[Params::LIMBS]; + uint64_t borrow = subtract_modulus(reduced, candidate); + bool use_reduced = t[Params::LIMBS] != 0 || borrow == 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + r[i] = use_reduced ? reduced[i] : candidate[i]; + } +} + +__device__ __forceinline__ uint32_t limb32_from64(const uint64_t *a, int i) { + const uint64_t word = a[i >> 1]; + return static_cast((word >> ((i & 1) * 32)) & 0xffffffffULL); +} + +__device__ __forceinline__ uint32_t bw6761_fp_modulus32_limb(int i) { + switch(i) { + case 0: + return 0x0000008bU; + case 1: + return 0xf49d0000U; + case 2: + return 0x70000082U; + case 3: + return 0xe6913e68U; + case 4: + return 0xeaf0a437U; + case 5: + return 0x160cf8aeU; + case 6: + return 0x5667a8f8U; + case 7: + return 0x98a116c2U; + case 8: + return 0x73ebff2eU; + case 9: + return 0x71dcd3dcU; + case 10: + return 0x12f9fd90U; + case 11: + return 0x8689c8edU; + case 12: + return 0x25b42304U; + case 13: + return 0x03cebaffU; + case 14: + return 0xe584e919U; + case 15: + return 0x707ba638U; + case 16: + return 0x8087be41U; + case 17: + return 0x528275efU; + case 18: + return 0x81d14688U; + case 19: + return 0xb926186aU; + case 20: + return 0x04faff3eU; + case 21: + return 0xd187c940U; + case 22: + return 0xfb83ce0aU; + case 23: + return 0x0122e824U; + default: + return 0; + } +} + +__device__ __forceinline__ uint32_t bw6761_fp_sub_modulus32( + uint32_t reduced[24], const uint32_t in[24]) { + uint32_t borrow = 0; +#pragma unroll + for(int i = 0; i < 24; i++) { + const uint32_t mod = bw6761_fp_modulus32_limb(i); + const uint32_t bb = mod + borrow; + const uint32_t bcarry = bb < mod; + reduced[i] = in[i] - bb; + borrow = (in[i] < bb) || bcarry; + } + return borrow; +} + +__device__ __forceinline__ uint32_t bw6761_fp_mad_wide32( + uint32_t &lo, + uint32_t a, + uint32_t b, + uint32_t addend, + uint32_t carry_in) { + uint32_t hi; + asm volatile( + "{\n\t" + "mad.lo.cc.u32 %0, %2, %3, %4;\n\t" + "madc.hi.u32 %1, %2, %3, 0;\n\t" + "add.cc.u32 %0, %0, %5;\n\t" + "addc.u32 %1, %1, 0;\n\t" + "}" + : "=&r"(lo), "=&r"(hi) + : "r"(a), "r"(b), "r"(addend), "r"(carry_in)); + return hi; +} + +// Pack LIMBS64 pairs of adjacent 32-bit words into LIMBS64 64-bit words. +template +__device__ __forceinline__ void bw6761_pack32( + uint64_t r[LIMBS64], const uint32_t in[LIMBS64 * 2]) { +#pragma unroll + for(int i = 0; i < LIMBS64; i++) { + r[i] = static_cast(in[2 * i]) | + (static_cast(in[2 * i + 1]) << 32); + } +} + +template <> +__device__ __forceinline__ void mul( + uint64_t r[BW6761FpParams::LIMBS], + const uint64_t a[BW6761FpParams::LIMBS], + const uint64_t b[BW6761FpParams::LIMBS]) { + static constexpr int LIMBS32 = 24; + static constexpr uint32_t INV32 = 0x8fa798ddU; + + uint32_t t[LIMBS32 + 1]; + for(int i = 0; i <= LIMBS32; i++) t[i] = 0; + + for(int i = 0; i < LIMBS32; i++) { + const uint32_t bi = limb32_from64(b, i); + uint32_t carry = 0; + for(int j = 0; j < LIMBS32; j++) { + uint32_t lo; + carry = bw6761_fp_mad_wide32( + lo, limb32_from64(a, j), bi, t[j], carry); + t[j] = lo; + } + uint64_t top = static_cast(t[LIMBS32]) + carry; + t[LIMBS32] = static_cast(top); + + const uint32_t m = t[0] * INV32; + carry = 0; + for(int j = 0; j < LIMBS32; j++) { + uint32_t word; + carry = bw6761_fp_mad_wide32( + word, m, bw6761_fp_modulus32_limb(j), t[j], carry); + if(j > 0) t[j - 1] = word; + } + top = static_cast(t[LIMBS32]) + carry; + t[LIMBS32 - 1] = static_cast(top); + t[LIMBS32] = static_cast(top >> 32); + } + + uint32_t candidate[LIMBS32]; + for(int i = 0; i < LIMBS32; i++) candidate[i] = t[i]; + + uint32_t reduced[LIMBS32]; + const uint32_t borrow = bw6761_fp_sub_modulus32(reduced, candidate); + const bool use_reduced = t[LIMBS32] != 0 || borrow == 0; + bw6761_pack32<12>(r, use_reduced ? reduced : candidate); +} + +// ─── BW6-761 Fr PTX CIOS (6 × 64-bit = 12 × 32-bit limbs) ────────────────── +// +// Fr modulus (377 bits): +// 0x01ae3a4617c510ea_c63b05c06ca1493b_1a22d9f300f5138f +// _1ef3622fba094800_170b5d4430000000_8508c00000000001 +// +// INV32 = -Fr[0]^{-1} mod 2^32 = 0xffffffff +// (Fr modulus low 32-bit limb = 0x00000001, so -1^{-1} mod 2^32 = 0xffffffff) +// +// Reuses bw6761_fp_mad_wide32 and packs result to 6 × 64-bit via bw6761_pack32<6>. +// ───────────────────────────────────────────────────────────────────────────── + +__device__ __forceinline__ uint32_t bw6761_fr_modulus32_limb(int i) { + // Fr = 0x8508c00000000001 0x170b5d4430000000 + // 0x1ef3622fba094800 0x1a22d9f300f5138f + // 0xc63b05c06ca1493b 0x01ae3a4617c510ea + // In 32-bit little-endian (12 words): + switch(i) { + case 0: return 0x00000001U; // lo32(limb[0]) + case 1: return 0x8508c000U; // hi32(limb[0]) + case 2: return 0x30000000U; // lo32(limb[1]) + case 3: return 0x170b5d44U; // hi32(limb[1]) + case 4: return 0xba094800U; // lo32(limb[2]) + case 5: return 0x1ef3622fU; // hi32(limb[2]) + case 6: return 0x00f5138fU; // lo32(limb[3]) + case 7: return 0x1a22d9f3U; // hi32(limb[3]) + case 8: return 0x6ca1493bU; // lo32(limb[4]) + case 9: return 0xc63b05c0U; // hi32(limb[4]) + case 10: return 0x17c510eaU; // lo32(limb[5]) + case 11: return 0x01ae3a46U; // hi32(limb[5]) + default: return 0; + } +} + +__device__ __forceinline__ uint32_t bw6761_fr_sub_modulus32( + uint32_t reduced[12], const uint32_t in[12]) { + uint32_t borrow = 0; +#pragma unroll + for(int i = 0; i < 12; i++) { + const uint32_t mod = bw6761_fr_modulus32_limb(i); + const uint32_t bb = mod + borrow; + const uint32_t bcarry = bb < mod; + reduced[i] = in[i] - bb; + borrow = (in[i] < bb) || bcarry; + } + return borrow; +} + +// PTX-optimized CIOS Montgomery multiply for BW6-761 Fr (12 × 32-bit limbs). +// Uses the same bw6761_fp_mad_wide32 helper with INV32 = 0xffffffff. +// +// Outer loop uses #pragma unroll 1 (no unroll) to bound register pressure: +// full 12-iter unroll of the outer loop with 12-iter inner loops would +// spill t[0..12] to local memory on GPUs with <64 registers/thread. +template <> +__device__ __forceinline__ void mul( + uint64_t r[BW6761FrParams::LIMBS], + const uint64_t a[BW6761FrParams::LIMBS], + const uint64_t b[BW6761FrParams::LIMBS]) { + static constexpr int LIMBS32 = 12; + static constexpr uint32_t INV32 = 0xffffffffU; // -Fr[0]^{-1} mod 2^32 + + uint32_t t[LIMBS32 + 1]; +#pragma unroll + for(int i = 0; i <= LIMBS32; i++) t[i] = 0; + +#pragma unroll 1 + for(int i = 0; i < LIMBS32; i++) { + const uint32_t bi = limb32_from64(b, i); + uint32_t carry = 0; +#pragma unroll + for(int j = 0; j < LIMBS32; j++) { + uint32_t lo; + carry = bw6761_fp_mad_wide32( + lo, limb32_from64(a, j), bi, t[j], carry); + t[j] = lo; + } + uint64_t top = static_cast(t[LIMBS32]) + carry; + t[LIMBS32] = static_cast(top); + + const uint32_t m = t[0] * INV32; + carry = 0; +#pragma unroll + for(int j = 0; j < LIMBS32; j++) { + uint32_t word; + carry = bw6761_fp_mad_wide32( + word, m, bw6761_fr_modulus32_limb(j), t[j], carry); + if(j > 0) t[j - 1] = word; + } + top = static_cast(t[LIMBS32]) + carry; + t[LIMBS32 - 1] = static_cast(top); + t[LIMBS32] = static_cast(top >> 32); + } + + uint32_t candidate[LIMBS32]; +#pragma unroll + for(int i = 0; i < LIMBS32; i++) candidate[i] = t[i]; + + uint32_t reduced[LIMBS32]; + const uint32_t borrow = bw6761_fr_sub_modulus32(reduced, candidate); + const bool use_reduced = t[LIMBS32] != 0 || borrow == 0; + bw6761_pack32<6>(r, use_reduced ? reduced : candidate); +} + +template +__device__ __forceinline__ void zero(uint64_t r[Params::LIMBS]) { +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) r[i] = 0; +} + +template +__device__ __forceinline__ void one(uint64_t r[Params::LIMBS]) { +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) r[i] = one_limb(i); +} + +template +__device__ __forceinline__ void set(uint64_t r[Params::LIMBS], const uint64_t a[Params::LIMBS]) { +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) r[i] = a[i]; +} + +template +__device__ __forceinline__ bool is_zero(const uint64_t a[Params::LIMBS]) { + uint64_t acc = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) acc |= a[i]; + return acc == 0; +} + +template +__device__ __forceinline__ bool equal(const uint64_t a[Params::LIMBS], const uint64_t b[Params::LIMBS]) { + uint64_t acc = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) acc |= a[i] ^ b[i]; + return acc == 0; +} + +template +__device__ __forceinline__ void double_element(uint64_t r[Params::LIMBS], + const uint64_t a[Params::LIMBS]) { + add(r, a, a); +} + +template +__device__ __forceinline__ void square(uint64_t r[Params::LIMBS], + const uint64_t a[Params::LIMBS]) { + mul(r, a, a); +} + +template +__device__ __forceinline__ void neg(uint64_t r[Params::LIMBS], + const uint64_t a[Params::LIMBS]) { + if(is_zero(a)) { + zero(r); + return; + } + + uint64_t borrow = 0; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + r[i] = sub_borrow(modulus_limb(i), a[i], borrow); + } +} + +} // namespace gnark_gpu::plonk2 diff --git a/prover/gpu/cuda/src/plonk2/g1.cu b/prover/gpu/cuda/src/plonk2/g1.cu new file mode 100644 index 00000000000..d22bd0dde94 --- /dev/null +++ b/prover/gpu/cuda/src/plonk2/g1.cu @@ -0,0 +1,268 @@ +// Single-point validation kernels for gpu/plonk2. +// +// These are not used by the MSM. They exist only so the curve-generic G1 +// formulas in ec.cuh can be tested against gnark-crypto from Go before the +// MSM batches them up. + +#include "ec.cuh" + +#include + +namespace gnark_gpu::plonk2 { + +namespace { + +template +__device__ __forceinline__ void load_affine(AffinePoint &p, const uint64_t *raw) { +#pragma unroll + for(int i = 0; i < Fp::LIMBS; i++) { + p.x[i] = raw[i]; + p.y[i] = raw[Fp::LIMBS + i]; + } +} + +template +__device__ __forceinline__ void store_jacobian(const JacobianPoint &p, uint64_t *raw) { +#pragma unroll + for(int i = 0; i < Fp::LIMBS; i++) { + raw[i] = p.x[i]; + raw[Fp::LIMBS + i] = p.y[i]; + raw[2 * Fp::LIMBS + i] = p.z[i]; + } +} + +template +__device__ __forceinline__ void load_affine_at(AffinePoint &p, const uint64_t *raw, size_t idx) { + const uint64_t *point = raw + idx * (2 * Fp::LIMBS); + load_affine(p, point); +} + +template +__device__ __forceinline__ bool scalar_bit(const uint64_t *scalars, size_t idx, int bit) { + const uint64_t *scalar = scalars + idx * Fr::LIMBS; + int limb = bit / 64; + if(limb >= Fr::LIMBS) return false; + return ((scalar[limb] >> (bit & 63)) & 1ULL) != 0; +} + +template +__device__ __forceinline__ void scalar_mul_affine( + JacobianPoint &out, const AffinePoint &point, const uint64_t *scalars, size_t idx) { + + JacobianPoint acc, base, tmp; + jacobian_set_infinity(acc); + jacobian_from_affine(base, point); + + for(int bit = 0; bit < Fr::BITS; bit++) { + if(scalar_bit(scalars, idx, bit)) { + jacobian_add(tmp, acc, base); + set(acc.x, tmp.x); + set(acc.y, tmp.y); + set(acc.z, tmp.z); + } + jacobian_double(tmp, base); + set(base.x, tmp.x); + set(base.y, tmp.y); + set(base.z, tmp.z); + } + + set(out.x, acc.x); + set(out.y, acc.y); + set(out.z, acc.z); +} + +template +__global__ void g1_affine_add_kernel(const uint64_t *p_raw, const uint64_t *q_raw, uint64_t *out_raw) { + AffinePoint p, q; + JacobianPoint out; + load_affine(p, p_raw); + load_affine(q, q_raw); + jacobian_add_affine_affine(out, p, q); + store_jacobian(out, out_raw); +} + +template +__global__ void g1_affine_double_kernel(const uint64_t *p_raw, uint64_t *out_raw) { + AffinePoint p; + JacobianPoint out; + load_affine(p, p_raw); + jacobian_double_mixed(out, p); + store_jacobian(out, out_raw); +} + +template +__global__ void msm_naive_kernel( + const uint64_t *points, const uint64_t *scalars, size_t count, uint64_t *out_raw) { + + JacobianPoint acc, term, tmp; + jacobian_set_infinity(acc); + + for(size_t i = 0; i < count; i++) { + AffinePoint p; + load_affine_at(p, points, i); + scalar_mul_affine(term, p, scalars, i); + jacobian_add(tmp, acc, term); + set(acc.x, tmp.x); + set(acc.y, tmp.y); + set(acc.z, tmp.z); + } + + store_jacobian(acc, out_raw); +} + +template +cudaError_t run_g1_add(const uint64_t *p, const uint64_t *q, uint64_t *out, cudaStream_t stream) { + uint64_t *d_p = nullptr, *d_q = nullptr, *d_out = nullptr; + constexpr size_t input_words = 2 * Fp::LIMBS; + constexpr size_t output_words = 3 * Fp::LIMBS; + + cudaError_t err = cudaMalloc(&d_p, input_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_q, input_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_out, output_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(d_p, p, input_words * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + err = cudaMemcpyAsync(d_q, q, input_words * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + + g1_affine_add_kernel<<<1, 1, 0, stream>>>(d_p, d_q, d_out); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(out, d_out, output_words * sizeof(uint64_t), cudaMemcpyDeviceToHost, stream); + if(err != cudaSuccess) goto done; + err = cudaStreamSynchronize(stream); + +done: + if(d_p) cudaFree(d_p); + if(d_q) cudaFree(d_q); + if(d_out) cudaFree(d_out); + return err; +} + +template +cudaError_t run_g1_double(const uint64_t *p, uint64_t *out, cudaStream_t stream) { + uint64_t *d_p = nullptr, *d_out = nullptr; + constexpr size_t input_words = 2 * Fp::LIMBS; + constexpr size_t output_words = 3 * Fp::LIMBS; + + cudaError_t err = cudaMalloc(&d_p, input_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_out, output_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(d_p, p, input_words * sizeof(uint64_t), cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + + g1_affine_double_kernel<<<1, 1, 0, stream>>>(d_p, d_out); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(out, d_out, output_words * sizeof(uint64_t), cudaMemcpyDeviceToHost, stream); + if(err != cudaSuccess) goto done; + err = cudaStreamSynchronize(stream); + +done: + if(d_p) cudaFree(d_p); + if(d_out) cudaFree(d_out); + return err; +} + +} // namespace + +cudaError_t g1_affine_add_run( + gnark_gpu_plonk2_curve_id_t curve, const uint64_t *p, + const uint64_t *q, uint64_t *out, cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return run_g1_add(p, q, out, stream); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return run_g1_add(p, q, out, stream); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return run_g1_add(p, q, out, stream); + default: + return cudaErrorInvalidValue; + } +} + +cudaError_t g1_affine_double_run( + gnark_gpu_plonk2_curve_id_t curve, const uint64_t *p, + uint64_t *out, cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return run_g1_double(p, out, stream); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return run_g1_double(p, out, stream); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return run_g1_double(p, out, stream); + default: + return cudaErrorInvalidValue; + } +} + +template +cudaError_t run_msm_naive( + const uint64_t *points, const uint64_t *scalars, size_t count, + uint64_t *out, cudaStream_t stream) { + + uint64_t *d_points = nullptr, *d_scalars = nullptr, *d_out = nullptr; + const size_t point_words = count * 2 * Fp::LIMBS; + const size_t scalar_words = count * Fr::LIMBS; + constexpr size_t output_words = 3 * Fp::LIMBS; + + cudaError_t err = cudaMalloc(&d_points, point_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_scalars, scalar_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_out, output_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(d_points, points, point_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + err = cudaMemcpyAsync(d_scalars, scalars, scalar_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + + msm_naive_kernel<<<1, 1, 0, stream>>>(d_points, d_scalars, count, d_out); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(out, d_out, output_words * sizeof(uint64_t), + cudaMemcpyDeviceToHost, stream); + if(err != cudaSuccess) goto done; + err = cudaStreamSynchronize(stream); + +done: + if(d_points) cudaFree(d_points); + if(d_scalars) cudaFree(d_scalars); + if(d_out) cudaFree(d_out); + return err; +} + +cudaError_t msm_naive_run( + gnark_gpu_plonk2_curve_id_t curve, const uint64_t *points, + const uint64_t *scalars, size_t count, uint64_t *out, + cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return run_msm_naive( + points, scalars, count, out, stream); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return run_msm_naive( + points, scalars, count, out, stream); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return run_msm_naive( + points, scalars, count, out, stream); + default: + return cudaErrorInvalidValue; + } +} + +} // namespace gnark_gpu::plonk2 diff --git a/prover/gpu/cuda/src/plonk2/kernels.cu b/prover/gpu/cuda/src/plonk2/kernels.cu new file mode 100644 index 00000000000..7bb9c458514 --- /dev/null +++ b/prover/gpu/cuda/src/plonk2/kernels.cu @@ -0,0 +1,2250 @@ +#include "field.cuh" + +#include + +namespace gnark_gpu::plonk2 { + +namespace { + +constexpr unsigned THREADS = 256; +constexpr unsigned NTT_THREADS = 256; +constexpr size_t Z_PREFIX_CHUNK_SIZE = 1024; +constexpr size_t POLY_EVAL_CHUNK_SIZE = 1024; +constexpr uint32_t NTT_FUSED_TAIL_MIN_N = 1u << 22; + +struct ScalarArg { + uint64_t limbs[MAX_FR_LIMBS]; +}; + +ScalarArg make_scalar_arg(gnark_gpu_plonk2_curve_id_t curve, const uint64_t *limbs) { + ScalarArg out{}; + int n = curve_limbs(curve); + for(int i = 0; i < n; i++) out.limbs[i] = limbs[i]; + return out; +} + +template +// copy_aos_to_soa_kernel: transpose field elements from AoS to SoA layout. +// Input src: n×LIMBS uint64s in Array-of-Structs (gnark-crypto) order. +// Output dst: LIMBS×n SoA layout for coalesced GPU access. +// Thread layout: 1D, one thread per element. +__global__ void copy_aos_to_soa_kernel(FrView dst, const uint64_t *__restrict__ src, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + const uint64_t *in = src + idx * Params::LIMBS; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + dst.limbs[i][idx] = in[i]; + } +} + +template +// copy_soa_to_aos_kernel: transpose field elements from SoA to AoS layout. +// Input src: LIMBS×n SoA (GPU-resident). Output dst: n×LIMBS AoS (host format). +// Thread layout: 1D, one thread per element. +__global__ void copy_soa_to_aos_kernel(uint64_t *__restrict__ dst, ConstFrView src, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t *out = dst + idx * Params::LIMBS; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + out[i] = src.limbs[i][idx]; + } +} + +template +// set_zero_kernel: set all n field elements to zero (Montgomery form of 0). +// Thread layout: 1D, one thread per element. +// Precondition: v must have count >= n. +__global__ void set_zero_kernel(FrView v, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + v.limbs[i][idx] = 0; + } +} + +template +// add_kernel: element-wise modular addition out[i] = a[i] + b[i] mod p. +// All vectors must have the same length n and be in Montgomery form. +// Thread layout: 1D, one thread per element. +__global__ void add_kernel(FrView out, ConstFrView a, ConstFrView b, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t av[Params::LIMBS], bv[Params::LIMBS], rv[Params::LIMBS]; + load(av, a, idx); + load(bv, b, idx); + add(rv, av, bv); + store(out, idx, rv); +} + +template +// sub_kernel: element-wise modular subtraction out[i] = a[i] - b[i] mod p. +// All vectors must have the same length n and be in Montgomery form. +// Thread layout: 1D, one thread per element. +__global__ void sub_kernel(FrView out, ConstFrView a, ConstFrView b, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t av[Params::LIMBS], bv[Params::LIMBS], rv[Params::LIMBS]; + load(av, a, idx); + load(bv, b, idx); + sub(rv, av, bv); + store(out, idx, rv); +} + +template +// mul_kernel: element-wise Montgomery multiplication out[i] = a[i] * b[i] mod p. +// All vectors must have the same length n and be in Montgomery form. +// Thread layout: 1D, one thread per element. +__global__ void mul_kernel(FrView out, ConstFrView a, ConstFrView b, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t av[Params::LIMBS], bv[Params::LIMBS], rv[Params::LIMBS]; + load(av, a, idx); + load(bv, b, idx); + mul(rv, av, bv); + store(out, idx, rv); +} + +template +// addmul_kernel: fused multiply-add out[i] = out[i] + a[i] * b[i] mod p. +// All vectors must have the same length n and be in Montgomery form. +// Thread layout: 1D, one thread per element. +__global__ void addmul_kernel(FrView out, ConstFrView a, ConstFrView b, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t ov[Params::LIMBS], av[Params::LIMBS], bv[Params::LIMBS]; + uint64_t prod[Params::LIMBS], rv[Params::LIMBS]; + load(ov, make_const(out), idx); + load(av, a, idx); + load(bv, b, idx); + mul(prod, av, bv); + add(rv, ov, prod); + store(out, idx, rv); +} + +template +// scalar_mul_kernel: broadcast scalar multiplication out[i] *= scalar mod p. +// scalar is a single field element broadcast to all n elements. +// Thread layout: 1D, one thread per element. +__global__ void scalar_mul_kernel(FrView out, ScalarArg scalar_arg, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t scalar[Params::LIMBS], ov[Params::LIMBS], rv[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) scalar[i] = scalar_arg.limbs[i]; + load(ov, make_const(out), idx); + mul(rv, ov, scalar); + store(out, idx, rv); +} + +template +// add_scalar_mul_kernel: fused scalar multiply-add out[i] += a[i] * scalar mod p. +// Broadcasts scalar across all n elements of a. +// Thread layout: 1D, one thread per element. +__global__ void add_scalar_mul_kernel( + FrView out, ConstFrView a, ScalarArg scalar_arg, size_t n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t scalar[Params::LIMBS], ov[Params::LIMBS], av[Params::LIMBS]; + uint64_t prod[Params::LIMBS], rv[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) scalar[i] = scalar_arg.limbs[i]; + load(ov, make_const(out), idx); + load(av, a, idx); + mul(prod, av, scalar); + add(rv, ov, prod); + store(out, idx, rv); +} + +template +__device__ __forceinline__ bool modulus_minus_two_bit(int bit) { + uint64_t limb = modulus_limb(bit / 64); + if(bit < 64) limb -= 2; + return ((limb >> (bit & 63)) & 1ULL) != 0; +} + +template +__device__ __forceinline__ void inverse_pow(uint64_t out[Params::LIMBS], + const uint64_t in[Params::LIMBS]) { + uint64_t acc[Params::LIMBS], factor[Params::LIMBS]; + one(acc); + set(factor, in); + + for(int bit = 0; bit < Params::BITS; bit++) { + if(modulus_minus_two_bit(bit)) { + mul(acc, acc, factor); + } + if(bit + 1 < Params::BITS) { + square(factor, factor); + } + } + set(out, acc); +} + +template +// invert_kernel: finalize batch inversion (phase 2 of Montgomery batch invert). +// Applies pre-computed inverse products to recover individual inverses. +// Called after the forward scan; thread layout: 1D, one thread per element. +__global__ void invert_kernel(FrView data, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t value[Params::LIMBS], inv[Params::LIMBS]; + load(value, make_const(data), idx); + inverse_pow(inv, value); + store(data, idx, inv); +} + +template +// butterfly4_inverse_kernel: size-4 inverse DFT butterfly across 4 FrVectors. +// Implements the combined iDFT of 4 length-n blocks to recover h[0..n-1] from +// 4 coset evaluations. Used in decomposed iFFT(4n) for PlonK quotient recovery. +// omega4Inv: inverse of primitive 4th root of unity. quarter: 1/4 mod p. +__global__ void butterfly4_inverse_kernel( + FrView b0, FrView b1, FrView b2, FrView b3, + ScalarArg omega4_inv_arg, ScalarArg quarter_arg, size_t n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t omega4_inv[Params::LIMBS], quarter[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + omega4_inv[i] = omega4_inv_arg.limbs[i]; + quarter[i] = quarter_arg.limbs[i]; + } + + uint64_t v0[Params::LIMBS], v1[Params::LIMBS], v2[Params::LIMBS], v3[Params::LIMBS]; + load(v0, make_const(b0), idx); + load(v1, make_const(b1), idx); + load(v2, make_const(b2), idx); + load(v3, make_const(b3), idx); + + uint64_t t0[Params::LIMBS], t1[Params::LIMBS], t2[Params::LIMBS], t3[Params::LIMBS]; + uint64_t u0[Params::LIMBS], u1[Params::LIMBS], u2[Params::LIMBS], u3[Params::LIMBS]; + add(t0, v0, v2); + sub(t1, v0, v2); + add(t2, v1, v3); + sub(t3, v1, v3); + mul(t3, t3, omega4_inv); + + add(u0, t0, t2); + add(u1, t1, t3); + sub(u2, t0, t2); + sub(u3, t1, t3); + + mul(u0, u0, quarter); + mul(u1, u1, quarter); + mul(u2, u2, quarter); + mul(u3, u3, quarter); + + store(b0, idx, u0); + store(b1, idx, u1); + store(b2, idx, u2); + store(b3, idx, u3); +} + +template +// reduce_blinded_coset_kernel: reduce a blinded polynomial for coset evaluation. +// dst[i] = src[i] + tail[i] * cosetPowN for i < tail_len +// dst[i] = src[i] for i >= tail_len +// Enables CosetFFT on n-element dst instead of the full (n+blinding) polynomial. +__global__ void reduce_blinded_coset_kernel( + FrView dst, ConstFrView src, const uint64_t *tail, + ScalarArg coset_pow_n_arg, size_t n, size_t tail_len) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t value[Params::LIMBS]; + load(value, src, idx); + if(idx < tail_len) { + uint64_t tail_value[Params::LIMBS], coset_pow_n[Params::LIMBS], scaled[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + tail_value[i] = __ldg(tail + idx * Params::LIMBS + i); + coset_pow_n[i] = coset_pow_n_arg.limbs[i]; + } + mul(scaled, tail_value, coset_pow_n); + add(value, value, scaled); + } + store(dst, idx, value); +} + +template +// compute_l1_den_kernel: compute the first Lagrange polynomial denominator. +// out[i] = coset_gen * omega^i - 1 for each domain point. +// Used as denominator in L1(X) = (X^n-1) / (n*(X-1)) at coset points. +// Thread layout: 1D, one thread per element. +__global__ void compute_l1_den_kernel( + FrView out, ConstFrView twiddles, ScalarArg coset_gen_arg, size_t n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t coset_gen[Params::LIMBS], omega_i[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) coset_gen[i] = coset_gen_arg.limbs[i]; + + size_t half_n = n >> 1; + if(idx < half_n) { + load(omega_i, twiddles, idx); + } else { + uint64_t positive[Params::LIMBS], zero_value[Params::LIMBS]; + load(positive, twiddles, idx - half_n); + zero(zero_value); + sub(omega_i, zero_value, positive); + } + + uint64_t product[Params::LIMBS], one_value[Params::LIMBS], value[Params::LIMBS]; + mul(product, coset_gen, omega_i); + one(one_value); + sub(value, product, one_value); + store(out, idx, value); +} + +template +// gate_accum_kernel: fused PlonK gate constraint accumulation. +// Computes result[i] = (result[i] + Ql[i]*L + Qr[i]*R + Qm[i]*L*R +// + Qo[i]*O + Qk[i]) * zhKInv in a single pass. +// result must already hold the permutation+boundary contribution from perm_boundary_kernel. +// Thread layout: 1D, one thread per element. zhKInv = 1/(coset^n-1). +__global__ void gate_accum_kernel( + FrView result, + ConstFrView ql, ConstFrView qr, ConstFrView qm, ConstFrView qo, ConstFrView qk, + ConstFrView l, ConstFrView r, ConstFrView o, + ScalarArg zh_k_inv_arg, size_t n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t acc[Params::LIMBS], zh_k_inv[Params::LIMBS]; + uint64_t l_value[Params::LIMBS], r_value[Params::LIMBS], o_value[Params::LIMBS]; + uint64_t q_value[Params::LIMBS], tmp[Params::LIMBS], lr[Params::LIMBS]; + +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + zh_k_inv[i] = zh_k_inv_arg.limbs[i]; + } + + load(acc, make_const(result), idx); + load(l_value, l, idx); + load(r_value, r, idx); + load(o_value, o, idx); + + load(q_value, ql, idx); + mul(tmp, q_value, l_value); + add(acc, acc, tmp); + + load(q_value, qr, idx); + mul(tmp, q_value, r_value); + add(acc, acc, tmp); + + load(q_value, qm, idx); + mul(lr, l_value, r_value); + mul(tmp, q_value, lr); + add(acc, acc, tmp); + + load(q_value, qo, idx); + mul(tmp, q_value, o_value); + add(acc, acc, tmp); + + load(q_value, qk, idx); + add(acc, acc, q_value); + + mul(acc, acc, zh_k_inv); + store(result, idx, acc); +} + +template +// linearize_static_kernel computes the fixed-selector part of the PlonK +// linearized polynomial in one pass. +// +// result[i] = z[i]*combinedZ + S3[i]*s1 + Ql[i]*l + Qr[i]*r +// + Qm[i]*rl + Qo[i]*o + Qk[i] +__global__ void linearize_static_kernel( + FrView result, + ConstFrView z, ConstFrView s3, + ConstFrView ql, ConstFrView qr, ConstFrView qm, ConstFrView qo, ConstFrView qk, + ScalarArg combined_z_arg, ScalarArg s1_arg, + ScalarArg l_arg, ScalarArg r_arg, ScalarArg rl_arg, ScalarArg o_arg, + size_t n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t combined_z[Params::LIMBS], s1[Params::LIMBS]; + uint64_t l[Params::LIMBS], r[Params::LIMBS], rl[Params::LIMBS], o[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + combined_z[i] = combined_z_arg.limbs[i]; + s1[i] = s1_arg.limbs[i]; + l[i] = l_arg.limbs[i]; + r[i] = r_arg.limbs[i]; + rl[i] = rl_arg.limbs[i]; + o[i] = o_arg.limbs[i]; + } + + uint64_t acc[Params::LIMBS], value[Params::LIMBS], tmp[Params::LIMBS]; + + load(value, z, idx); + mul(acc, value, combined_z); + + load(value, s3, idx); + mul(tmp, value, s1); + add(acc, acc, tmp); + + load(value, ql, idx); + mul(tmp, value, l); + add(acc, acc, tmp); + + load(value, qr, idx); + mul(tmp, value, r); + add(acc, acc, tmp); + + load(value, qm, idx); + mul(tmp, value, rl); + add(acc, acc, tmp); + + load(value, qo, idx); + mul(tmp, value, o); + add(acc, acc, tmp); + + load(value, qk, idx); + add(acc, acc, value); + + store(result, idx, acc); +} + +template +// subtract_head_kernel applies PlonK blinding to the first coefficients of a +// GPU-resident canonical polynomial. tail is tiny (2-3 elements) in AoS form. +__global__ void subtract_head_kernel(FrView data, const uint64_t *tail, size_t tail_len) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= tail_len) return; + + uint64_t value[Params::LIMBS], tail_value[Params::LIMBS], out[Params::LIMBS]; + load(value, make_const(data), idx); +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + tail_value[i] = __ldg(tail + idx * Params::LIMBS + i); + } + sub(out, value, tail_value); + store(data, idx, out); +} + +template +__device__ __forceinline__ void omega_from_twiddles( + uint64_t out[Params::LIMBS], ConstFrView twiddles, size_t idx, size_t half_n) { + + if(idx < half_n) { + load(out, twiddles, idx); + return; + } + + uint64_t positive[Params::LIMBS], zero_value[Params::LIMBS]; + load(positive, twiddles, idx - half_n); + zero(zero_value); + sub(out, zero_value, positive); +} + +template +// perm_boundary_kernel: fused PlonK permutation + boundary constraint. +// Computes result[i] = alpha*(den-num) + alpha^2*(Z[i]-1)*L1[i] +// where num = Z[i]*ID_product(x_i), den = Z_next*Sigma_product(x_i). +// l1_den_inv[i] = 1/(coset_gen*omega^i-1); l1_scalar = (coset^n-1)/n. +// Thread layout: 1D, one thread per element. Reads twiddles for omega^i. +__global__ void perm_boundary_kernel( + FrView result, + ConstFrView l, ConstFrView r, ConstFrView o, ConstFrView z, + ConstFrView s1, ConstFrView s2, ConstFrView s3, ConstFrView l1_den_inv, + ConstFrView twiddles, + ScalarArg alpha_arg, ScalarArg beta_arg, ScalarArg gamma_arg, + ScalarArg l1_scalar_arg, ScalarArg coset_shift_arg, + ScalarArg coset_shift_sq_arg, ScalarArg coset_gen_arg, + size_t n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t alpha[Params::LIMBS], beta[Params::LIMBS], gamma[Params::LIMBS]; + uint64_t l1_scalar[Params::LIMBS], coset_shift[Params::LIMBS]; + uint64_t coset_shift_sq[Params::LIMBS], coset_gen[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + alpha[i] = alpha_arg.limbs[i]; + beta[i] = beta_arg.limbs[i]; + gamma[i] = gamma_arg.limbs[i]; + l1_scalar[i] = l1_scalar_arg.limbs[i]; + coset_shift[i] = coset_shift_arg.limbs[i]; + coset_shift_sq[i] = coset_shift_sq_arg.limbs[i]; + coset_gen[i] = coset_gen_arg.limbs[i]; + } + + uint64_t l_value[Params::LIMBS], r_value[Params::LIMBS], o_value[Params::LIMBS]; + uint64_t z_value[Params::LIMBS], z_next[Params::LIMBS]; + load(l_value, l, idx); + load(r_value, r, idx); + load(o_value, o, idx); + load(z_value, z, idx); + load(z_next, z, idx + 1 < n ? idx + 1 : 0); + + uint64_t omega_i[Params::LIMBS], x_i[Params::LIMBS]; + omega_from_twiddles(omega_i, twiddles, idx, n >> 1); + mul(x_i, coset_gen, omega_i); + + uint64_t id1[Params::LIMBS], id2[Params::LIMBS], id3[Params::LIMBS]; + mul(id1, beta, x_i); + mul(id2, id1, coset_shift); + mul(id3, id1, coset_shift_sq); + + uint64_t t1[Params::LIMBS], t2[Params::LIMBS], t3[Params::LIMBS]; + add(t1, l_value, id1); + add(t1, t1, gamma); + add(t2, r_value, id2); + add(t2, t2, gamma); + add(t3, o_value, id3); + add(t3, t3, gamma); + + uint64_t num[Params::LIMBS], tmp[Params::LIMBS]; + mul(num, z_value, t1); + mul(tmp, num, t2); + mul(num, tmp, t3); + + uint64_t s_value[Params::LIMBS], beta_s[Params::LIMBS]; + load(s_value, s1, idx); + mul(beta_s, beta, s_value); + add(t1, l_value, beta_s); + add(t1, t1, gamma); + + load(s_value, s2, idx); + mul(beta_s, beta, s_value); + add(t2, r_value, beta_s); + add(t2, t2, gamma); + + load(s_value, s3, idx); + mul(beta_s, beta, s_value); + add(t3, o_value, beta_s); + add(t3, t3, gamma); + + uint64_t den[Params::LIMBS]; + mul(den, z_next, t1); + mul(tmp, den, t2); + mul(den, tmp, t3); + + uint64_t ordering[Params::LIMBS]; + sub(ordering, den, num); + + uint64_t l1_den_inv_value[Params::LIMBS], l1_value[Params::LIMBS]; + load(l1_den_inv_value, l1_den_inv, idx); + mul(l1_value, l1_scalar, l1_den_inv_value); + + uint64_t one_value[Params::LIMBS], z_minus_one[Params::LIMBS], local[Params::LIMBS]; + one(one_value); + sub(z_minus_one, z_value, one_value); + mul(local, z_minus_one, l1_value); + + uint64_t alpha_local[Params::LIMBS], sum[Params::LIMBS], out[Params::LIMBS]; + mul(alpha_local, alpha, local); + add(sum, ordering, alpha_local); + mul(out, alpha, sum); + store(result, idx, out); +} + +template +__device__ __forceinline__ void perm_identity_eval( + uint64_t out[Params::LIMBS], int64_t perm_idx, size_t n, unsigned log2n, + const uint64_t coset_shift[Params::LIMBS], + const uint64_t coset_shift_sq[Params::LIMBS], + ConstFrView twiddles) { + + size_t idx = (size_t)perm_idx; + size_t pos = idx & (n - 1); + size_t coset = idx >> log2n; + + uint64_t omega_pos[Params::LIMBS]; + omega_from_twiddles(omega_pos, twiddles, pos, n >> 1); + if(coset == 0) { + set(out, omega_pos); + } else if(coset == 1) { + mul(out, coset_shift, omega_pos); + } else { + mul(out, coset_shift_sq, omega_pos); + } +} + +template +// z_compute_factors_kernel: compute per-element Z polynomial ratio factors. +// On return l_inout[i] = numerator = Z[i]*(L+beta*ID+gamma)*(R+beta*k1*ID+gamma)*(O+beta*k2*ID+gamma) +// r_inout[i] = denominator = (L+beta*S1+gamma)*(R+beta*S2+gamma)*(O+beta*S3+gamma) +// ID = beta*omega^i is the domain identity. perm encodes the copy constraint mapping. +// Thread layout: 1D, one thread per element. +__global__ void z_compute_factors_kernel( + FrView l_inout, FrView r_inout, ConstFrView o, + const int64_t *perm, ConstFrView twiddles, + ScalarArg beta_arg, ScalarArg gamma_arg, + ScalarArg coset_shift_arg, ScalarArg coset_shift_sq_arg, + size_t n, unsigned log2n) { + + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + + uint64_t beta[Params::LIMBS], gamma[Params::LIMBS]; + uint64_t coset_shift[Params::LIMBS], coset_shift_sq[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + beta[i] = beta_arg.limbs[i]; + gamma[i] = gamma_arg.limbs[i]; + coset_shift[i] = coset_shift_arg.limbs[i]; + coset_shift_sq[i] = coset_shift_sq_arg.limbs[i]; + } + + uint64_t l_value[Params::LIMBS], r_value[Params::LIMBS], o_value[Params::LIMBS]; + load(l_value, make_const(l_inout), idx); + load(r_value, make_const(r_inout), idx); + load(o_value, o, idx); + + uint64_t omega_i[Params::LIMBS], beta_id0[Params::LIMBS]; + omega_from_twiddles(omega_i, twiddles, idx, n >> 1); + mul(beta_id0, beta, omega_i); + + uint64_t beta_id1[Params::LIMBS], beta_id2[Params::LIMBS]; + mul(beta_id1, coset_shift, beta_id0); + mul(beta_id2, coset_shift_sq, beta_id0); + + uint64_t t1[Params::LIMBS], t2[Params::LIMBS], t3[Params::LIMBS]; + add(t1, l_value, beta_id0); + add(t1, t1, gamma); + add(t2, r_value, beta_id1); + add(t2, t2, gamma); + add(t3, o_value, beta_id2); + add(t3, t3, gamma); + + uint64_t tmp[Params::LIMBS], num[Params::LIMBS]; + mul(tmp, t1, t2); + mul(num, tmp, t3); + + uint64_t sid0[Params::LIMBS], sid1[Params::LIMBS], sid2[Params::LIMBS]; + perm_identity_eval(sid0, perm[idx], n, log2n, coset_shift, coset_shift_sq, twiddles); + perm_identity_eval(sid1, perm[n + idx], n, log2n, coset_shift, coset_shift_sq, twiddles); + perm_identity_eval(sid2, perm[2 * n + idx], n, log2n, coset_shift, coset_shift_sq, twiddles); + + uint64_t beta_sid[Params::LIMBS]; + mul(beta_sid, beta, sid0); + add(t1, l_value, beta_sid); + add(t1, t1, gamma); + mul(beta_sid, beta, sid1); + add(t2, r_value, beta_sid); + add(t2, t2, gamma); + mul(beta_sid, beta, sid2); + add(t3, o_value, beta_sid); + add(t3, t3, gamma); + + uint64_t den[Params::LIMBS]; + mul(tmp, t1, t2); + mul(den, tmp, t3); + + store(l_inout, idx, num); + store(r_inout, idx, den); +} + +template +// z_prefix_local_kernel: phase 1 of parallel Z prefix product scan. +// Computes local prefix products within each chunk of Z_PREFIX_CHUNK_SIZE=1024. +// Outputs per-chunk products to chunk_products for the CPU sequential scan. +// Thread layout: 1D, one thread per chunk. +__global__ void z_prefix_local_kernel( + FrView z, ConstFrView ratio, uint64_t *chunk_products, size_t n) { + + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + Z_PREFIX_CHUNK_SIZE - 1) / Z_PREFIX_CHUNK_SIZE; + if(chunk_id >= num_chunks) return; + + size_t start = chunk_id * Z_PREFIX_CHUNK_SIZE; + size_t end = start + Z_PREFIX_CHUNK_SIZE; + if(end > n) end = n; + + uint64_t acc[Params::LIMBS], elem[Params::LIMBS]; + load(acc, ratio, start); + store(z, start, acc); + for(size_t i = start + 1; i < end; i++) { + load(elem, ratio, i); + mul(acc, acc, elem); + store(z, i, acc); + } + +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + chunk_products[chunk_id * Params::LIMBS + limb] = acc[limb]; + } +} + +template +// z_prefix_fixup_kernel: phase 3a of Z prefix scan — apply chunk corrections. +// Multiplies each element by its corresponding scanned chunk prefix. +// Called after CPU sequential scan of chunk products from phase 1. +// Thread layout: 1D, one thread per element. +__global__ void z_prefix_fixup_kernel(FrView z, const uint64_t *scanned_prefixes, size_t n) { + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + Z_PREFIX_CHUNK_SIZE - 1) / Z_PREFIX_CHUNK_SIZE; + if(chunk_id == 0 || chunk_id >= num_chunks) return; + + size_t start = chunk_id * Z_PREFIX_CHUNK_SIZE; + size_t end = start + Z_PREFIX_CHUNK_SIZE; + if(end > n) end = n; + + uint64_t prefix[Params::LIMBS], elem[Params::LIMBS], product[Params::LIMBS]; +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + prefix[limb] = scanned_prefixes[(chunk_id - 1) * Params::LIMBS + limb]; + } + for(size_t i = start; i < end; i++) { + load(elem, make_const(z), i); + mul(product, prefix, elem); + store(z, i, product); + } +} + +template +// z_prefix_shift_right_kernel: phase 3b of Z prefix scan — shift to get Z[0]=1. +// Moves the exclusive prefix product result right by one position. +// After this, z[0]=1 and z[i]=product(ratio[0..i-1]) for all i. +// Thread layout: 1D, one thread per element. +__global__ void z_prefix_shift_right_kernel(FrView z, ConstFrView src, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + if(idx == 0) { + uint64_t one_value[Params::LIMBS]; + one(one_value); + store(z, 0, one_value); + return; + } + uint64_t prev[Params::LIMBS]; + load(prev, src, idx - 1); + store(z, idx, prev); +} + +template +// poly_eval_chunks_kernel evaluates one coefficient chunk per thread using +// Horner. The CPU combines returned chunk partials with z^1024. +__global__ void poly_eval_chunks_kernel( + ConstFrView coeffs, ScalarArg z_arg, uint64_t *partials, size_t n) { + + size_t chunk_id = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t num_chunks = (n + POLY_EVAL_CHUNK_SIZE - 1) / POLY_EVAL_CHUNK_SIZE; + if(chunk_id >= num_chunks) return; + + size_t start = chunk_id * POLY_EVAL_CHUNK_SIZE; + size_t end = start + POLY_EVAL_CHUNK_SIZE; + if(end > n) end = n; + + uint64_t z[Params::LIMBS], acc[Params::LIMBS], coeff[Params::LIMBS]; +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) z[limb] = z_arg.limbs[limb]; + + load(acc, coeffs, end - 1); + for(size_t i = end - 1; i > start;) { + --i; + load(coeff, coeffs, i); + mul(acc, acc, z); + add(acc, acc, coeff); + } + +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + partials[chunk_id * Params::LIMBS + limb] = acc[limb]; + } +} + +template +// ntt_dif_stage_kernel: one stage of a Decimation-In-Frequency (DIF) NTT. +// Implements one radix-2 butterfly pass: a'=a+b, b'=(a-b)*w_k. +// Natural-order input; after all log2(n) stages output is bit-reversed. +// Thread layout: 1D, ceil(n/2) threads total. +__global__ void ntt_dif_stage_kernel( + FrView data, ConstFrView twiddles, size_t num_butterflies, + size_t half, size_t half_mask, size_t tw_stride) { + + size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(tid >= num_butterflies) return; + + size_t j = tid & half_mask; + size_t group_base = tid & ~half_mask; + size_t idx_a = (group_base << 1) | j; + size_t idx_b = idx_a + half; + size_t tw_idx = j * tw_stride; + + uint64_t a[Params::LIMBS], b[Params::LIMBS], w[Params::LIMBS]; + uint64_t sum[Params::LIMBS], diff[Params::LIMBS], prod[Params::LIMBS]; + ConstFrView data_const = make_const(data); + load(a, data_const, idx_a); + load(b, data_const, idx_b); + load(w, twiddles, tw_idx); + + add(sum, a, b); + sub(diff, a, b); + mul(prod, diff, w); + + store(data, idx_a, sum); + store(data, idx_b, prod); +} + +template +// ntt_dit_stage_kernel: one stage of a Decimation-In-Time (DIT) inverse NTT. +// Implements radix-2 butterfly: a'=a+w*b, b'=a-w*b. Bit-reversed input → natural output. +// Includes optional 1/n scaling factor for the inverse NTT normalization. +// Thread layout: 1D, ceil(n/2) threads. +__global__ void ntt_dit_stage_kernel( + FrView data, ConstFrView twiddles, size_t num_butterflies, + size_t half, size_t half_mask, size_t tw_stride) { + + size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(tid >= num_butterflies) return; + + size_t j = tid & half_mask; + size_t group_base = tid & ~half_mask; + size_t idx_a = (group_base << 1) | j; + size_t idx_b = idx_a + half; + size_t tw_idx = j * tw_stride; + + uint64_t a[Params::LIMBS], b[Params::LIMBS], w[Params::LIMBS]; + uint64_t wb[Params::LIMBS], sum[Params::LIMBS], diff[Params::LIMBS]; + ConstFrView data_const = make_const(data); + load(a, data_const, idx_a); + load(b, data_const, idx_b); + load(w, twiddles, tw_idx); + + mul(wb, b, w); + add(sum, a, wb); + sub(diff, a, wb); + + store(data, idx_a, sum); + store(data, idx_b, diff); +} + +template +// scale_kernel: element-wise scalar multiplication v[i] *= scalar mod p. +// Used after inverse NTT to apply the 1/n normalization factor. +// Thread layout: 1D, one thread per element. +// scale_kernel: element-wise scalar multiplication v[i] *= scalar mod p. +// Used after inverse NTT to apply the 1/n normalization factor. +// Thread layout: 1D, one thread per element. +__global__ void scale_kernel(FrView data, const uint64_t *scalar, size_t n); + +template +// ntt_dit_stage_scale_kernel: final DIT NTT stage combined with 1/n scaling. +// Fuses the last butterfly pass with the normalization to save a kernel launch. +// Output is the normalized canonical polynomial in natural order. +// Thread layout: 1D, ceil(n/2) threads. +__global__ void ntt_dit_stage_scale_kernel( + FrView data, ConstFrView twiddles, const uint64_t *scale, + size_t num_butterflies, size_t half, size_t half_mask, size_t tw_stride) { + + size_t tid = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(tid >= num_butterflies) return; + + size_t j = tid & half_mask; + size_t group_base = tid & ~half_mask; + size_t idx_a = (group_base << 1) | j; + size_t idx_b = idx_a + half; + size_t tw_idx = j * tw_stride; + + uint64_t a[Params::LIMBS], b[Params::LIMBS], w[Params::LIMBS]; + uint64_t wb[Params::LIMBS], sum[Params::LIMBS], diff[Params::LIMBS]; + uint64_t scaled[Params::LIMBS], scale_value[Params::LIMBS]; + ConstFrView data_const = make_const(data); + load(a, data_const, idx_a); + load(b, data_const, idx_b); + load(w, twiddles, tw_idx); +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + scale_value[i] = __ldg(scale + i); + } + + mul(wb, b, w); + add(sum, a, wb); + sub(diff, a, wb); + mul(scaled, sum, scale_value); + store(data, idx_a, scaled); + mul(scaled, diff, scale_value); + store(data, idx_b, scaled); +} + +template +// ntt_dif_radix8_kernel: radix-8 NTT butterfly for the Decimation-In-Frequency pass. +// Processes 8 elements per thread group for better GPU occupancy vs radix-2. +// Natural input → bit-reversed output after log8(n) passes. +// Thread layout: 1D shared-memory tiled; 8×BLOCK_SIZE threads process 8 elements each. +__global__ void ntt_dif_radix8_kernel( + FrView data, ConstFrView twiddles, uint32_t n, int stage_s) { + + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + uint32_t num_r8 = n >> 3; + if(tid >= num_r8) return; + + uint32_t half_s = n >> (stage_s + 1); + uint32_t half_s1 = half_s >> 1; + uint32_t half_s2 = half_s >> 2; + + uint32_t j = tid & (half_s2 - 1); + uint32_t group = tid >> (__ffs(half_s2) - 1); + + uint32_t base = group * (2 * half_s); + uint32_t p0 = base + j; + uint32_t p1 = p0 + half_s2; + uint32_t p2 = p0 + half_s1; + uint32_t p3 = p2 + half_s2; + uint32_t p4 = p0 + half_s; + uint32_t p5 = p4 + half_s2; + uint32_t p6 = p4 + half_s1; + uint32_t p7 = p6 + half_s2; + + uint64_t a0[Params::LIMBS], a1[Params::LIMBS], a2[Params::LIMBS], a3[Params::LIMBS]; + uint64_t a4[Params::LIMBS], a5[Params::LIMBS], a6[Params::LIMBS], a7[Params::LIMBS]; + ConstFrView data_const = make_const(data); + load(a0, data_const, p0); + load(a1, data_const, p1); + load(a2, data_const, p2); + load(a3, data_const, p3); + load(a4, data_const, p4); + load(a5, data_const, p5); + load(a6, data_const, p6); + load(a7, data_const, p7); + + uint32_t tw_stride_s = 1u << stage_s; + uint32_t tw_stride_s1 = tw_stride_s << 1; + uint32_t tw_stride_s2 = tw_stride_s << 2; + + uint64_t w[Params::LIMBS], sum[Params::LIMBS], diff[Params::LIMBS]; + uint32_t twi; + + twi = j * tw_stride_s; + load(w, twiddles, twi); + add(sum, a0, a4); + sub(diff, a0, a4); + mul(a4, diff, w); + set(a0, sum); + + twi = (j + half_s2) * tw_stride_s; + load(w, twiddles, twi); + add(sum, a1, a5); + sub(diff, a1, a5); + mul(a5, diff, w); + set(a1, sum); + + twi = (j + half_s1) * tw_stride_s; + load(w, twiddles, twi); + add(sum, a2, a6); + sub(diff, a2, a6); + mul(a6, diff, w); + set(a2, sum); + + twi = (j + half_s1 + half_s2) * tw_stride_s; + load(w, twiddles, twi); + add(sum, a3, a7); + sub(diff, a3, a7); + mul(a7, diff, w); + set(a3, sum); + + uint64_t ws1_0[Params::LIMBS], ws1_1[Params::LIMBS]; + twi = j * tw_stride_s1; + load(ws1_0, twiddles, twi); + twi = (j + half_s2) * tw_stride_s1; + load(ws1_1, twiddles, twi); + + add(sum, a0, a2); + sub(diff, a0, a2); + mul(a2, diff, ws1_0); + set(a0, sum); + + add(sum, a1, a3); + sub(diff, a1, a3); + mul(a3, diff, ws1_1); + set(a1, sum); + + add(sum, a4, a6); + sub(diff, a4, a6); + mul(a6, diff, ws1_0); + set(a4, sum); + + add(sum, a5, a7); + sub(diff, a5, a7); + mul(a7, diff, ws1_1); + set(a5, sum); + + twi = j * tw_stride_s2; + load(w, twiddles, twi); + + add(sum, a0, a1); + sub(diff, a0, a1); + mul(a1, diff, w); + set(a0, sum); + + add(sum, a2, a3); + sub(diff, a2, a3); + mul(a3, diff, w); + set(a2, sum); + + add(sum, a4, a5); + sub(diff, a4, a5); + mul(a5, diff, w); + set(a4, sum); + + add(sum, a6, a7); + sub(diff, a6, a7); + mul(a7, diff, w); + set(a6, sum); + + store(data, p0, a0); + store(data, p1, a1); + store(data, p2, a2); + store(data, p3, a3); + store(data, p4, a4); + store(data, p5, a5); + store(data, p6, a6); + store(data, p7, a7); +} + +template +// ntt_dit_radix8_kernel: radix-8 inverse NTT butterfly for the Decimation-In-Time pass. +// Processes 8 elements per thread group for better throughput. +// Bit-reversed input → natural-order output after log8(n) passes. +// Thread layout: 1D shared-memory tiled; 8×BLOCK_SIZE threads process 8 elements each. +__global__ void ntt_dit_radix8_kernel( + FrView data, ConstFrView twiddles, const uint64_t *scale, uint32_t n, int stage_s) { + + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + uint32_t num_r8 = n >> 3; + if(tid >= num_r8) return; + + uint32_t half_s = n >> (stage_s + 1); + uint32_t half_s1 = half_s << 1; + uint32_t half_s2 = half_s << 2; + + uint32_t j = tid & (half_s - 1); + uint32_t group = tid >> (__ffs(half_s) - 1); + + uint32_t base = group * (8 * half_s); + uint32_t p0 = base + j; + uint32_t p1 = p0 + half_s; + uint32_t p2 = p0 + half_s1; + uint32_t p3 = p1 + half_s1; + uint32_t p4 = p0 + half_s2; + uint32_t p5 = p1 + half_s2; + uint32_t p6 = p2 + half_s2; + uint32_t p7 = p3 + half_s2; + + uint64_t a0[Params::LIMBS], a1[Params::LIMBS], a2[Params::LIMBS], a3[Params::LIMBS]; + uint64_t a4[Params::LIMBS], a5[Params::LIMBS], a6[Params::LIMBS], a7[Params::LIMBS]; + ConstFrView data_const = make_const(data); + load(a0, data_const, p0); + load(a1, data_const, p1); + load(a2, data_const, p2); + load(a3, data_const, p3); + load(a4, data_const, p4); + load(a5, data_const, p5); + load(a6, data_const, p6); + load(a7, data_const, p7); + + uint32_t tw_stride_s = 1u << stage_s; + uint32_t tw_stride_s1 = tw_stride_s >> 1; + uint32_t tw_stride_s2 = tw_stride_s >> 2; + + uint64_t w[Params::LIMBS], t[Params::LIMBS], sum[Params::LIMBS], diff[Params::LIMBS]; + uint32_t twi; + + twi = j * tw_stride_s; + load(w, twiddles, twi); + mul(t, a1, w); + add(sum, a0, t); + sub(diff, a0, t); + set(a0, sum); + set(a1, diff); + + mul(t, a3, w); + add(sum, a2, t); + sub(diff, a2, t); + set(a2, sum); + set(a3, diff); + + mul(t, a5, w); + add(sum, a4, t); + sub(diff, a4, t); + set(a4, sum); + set(a5, diff); + + mul(t, a7, w); + add(sum, a6, t); + sub(diff, a6, t); + set(a6, sum); + set(a7, diff); + + uint64_t ws1_a[Params::LIMBS], ws1_b[Params::LIMBS]; + twi = j * tw_stride_s1; + load(ws1_a, twiddles, twi); + twi = (j + half_s) * tw_stride_s1; + load(ws1_b, twiddles, twi); + + mul(t, a2, ws1_a); + add(sum, a0, t); + sub(diff, a0, t); + set(a0, sum); + set(a2, diff); + + mul(t, a3, ws1_b); + add(sum, a1, t); + sub(diff, a1, t); + set(a1, sum); + set(a3, diff); + + mul(t, a6, ws1_a); + add(sum, a4, t); + sub(diff, a4, t); + set(a4, sum); + set(a6, diff); + + mul(t, a7, ws1_b); + add(sum, a5, t); + sub(diff, a5, t); + set(a5, sum); + set(a7, diff); + + twi = j * tw_stride_s2; + load(w, twiddles, twi); + mul(t, a4, w); + add(sum, a0, t); + sub(diff, a0, t); + set(a0, sum); + set(a4, diff); + + twi = (j + half_s) * tw_stride_s2; + load(w, twiddles, twi); + mul(t, a5, w); + add(sum, a1, t); + sub(diff, a1, t); + set(a1, sum); + set(a5, diff); + + twi = (j + half_s1) * tw_stride_s2; + load(w, twiddles, twi); + mul(t, a6, w); + add(sum, a2, t); + sub(diff, a2, t); + set(a2, sum); + set(a6, diff); + + twi = (j + half_s1 + half_s) * tw_stride_s2; + load(w, twiddles, twi); + mul(t, a7, w); + add(sum, a3, t); + sub(diff, a3, t); + set(a3, sum); + set(a7, diff); + + if constexpr(FUSE_SCALE) { + uint64_t scale_value[Params::LIMBS], scaled[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + scale_value[i] = __ldg(scale + i); + } + mul(scaled, a0, scale_value); set(a0, scaled); + mul(scaled, a1, scale_value); set(a1, scaled); + mul(scaled, a2, scale_value); set(a2, scaled); + mul(scaled, a3, scale_value); set(a3, scaled); + mul(scaled, a4, scale_value); set(a4, scaled); + mul(scaled, a5, scale_value); set(a5, scaled); + mul(scaled, a6, scale_value); set(a6, scaled); + mul(scaled, a7, scale_value); set(a7, scaled); + } + + store(data, p0, a0); + store(data, p1, a1); + store(data, p2, a2); + store(data, p3, a3); + store(data, p4, a4); + store(data, p5, a5); + store(data, p6, a6); + store(data, p7, a7); +} + +template +// ntt_dif_tail_fused_kernel: fused DIF NTT tail stages in shared memory. +// Combines the last TAIL_LOG radix-2 DIF stages into shared memory for +// reduced global memory traffic. Processes a span of 2^TAIL_LOG elements. +// Thread layout: 1024 threads per block, 1 block per span. +__global__ void __launch_bounds__(1024, 1) ntt_dif_tail_fused_kernel( + FrView data, ConstFrView twiddles, uint32_t n, int stage_start) { + + constexpr uint32_t span = 1u << TAIL_LOG; + constexpr uint32_t butterflies_per_chunk = span >> 1; + + uint32_t chunk = (uint32_t)blockIdx.x; + uint32_t base = chunk * span; + uint32_t t = threadIdx.x; + uint32_t p = blockDim.x; + + extern __shared__ uint64_t shmem[]; + uint64_t *s[MAX_FR_LIMBS]; + s[0] = shmem; +#pragma unroll + for(int limb = 1; limb < Params::LIMBS; limb++) { + s[limb] = s[limb - 1] + span; + } + + for(uint32_t i = t; i < span; i += p) { + uint32_t global_idx = base + i; + if(global_idx < n) { +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + s[limb][i] = data.limbs[limb][global_idx]; + } + } + } + __syncthreads(); + +#pragma unroll + for(int st = 0; st < TAIL_LOG; st++) { + int stage = stage_start + st; + uint32_t half = n >> (stage + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << stage; + + for(uint32_t bt = t; bt < butterflies_per_chunk; bt += p) { + uint32_t j = bt & half_mask; + uint32_t group_base = bt & ~half_mask; + uint32_t idx_a = (group_base << 1) | j; + uint32_t idx_b = idx_a + half; + uint32_t tw_idx = j * tw_stride; + + uint64_t a[Params::LIMBS], b[Params::LIMBS], w[Params::LIMBS]; + uint64_t sum[Params::LIMBS], diff[Params::LIMBS], prod[Params::LIMBS]; +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + a[limb] = s[limb][idx_a]; + b[limb] = s[limb][idx_b]; + } + load(w, twiddles, tw_idx); + + add(sum, a, b); + sub(diff, a, b); + mul(prod, diff, w); + +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + s[limb][idx_a] = sum[limb]; + s[limb][idx_b] = prod[limb]; + } + } + __syncthreads(); + } + + for(uint32_t i = t; i < span; i += p) { + uint32_t global_idx = base + i; + if(global_idx < n) { +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + data.limbs[limb][global_idx] = s[limb][i]; + } + } + } +} + +template +// ntt_dit_tail_fused_kernel: fused DIT inverse NTT tail stages in shared memory. +// Combines the last TAIL_LOG radix-2 DIT stages (with 1/n scaling on last stage). +// Reduces global memory traffic for the most frequent butterfly stages. +// Thread layout: 1024 threads per block, 1 block per span. +__global__ void __launch_bounds__(1024, 1) ntt_dit_tail_fused_kernel( + FrView data, ConstFrView twiddles, uint32_t n, int stage_start) { + + constexpr uint32_t span = 1u << TAIL_LOG; + constexpr uint32_t butterflies_per_chunk = span >> 1; + + uint32_t chunk = (uint32_t)blockIdx.x; + uint32_t base = chunk * span; + uint32_t t = threadIdx.x; + uint32_t p = blockDim.x; + + extern __shared__ uint64_t shmem[]; + uint64_t *s[MAX_FR_LIMBS]; + s[0] = shmem; +#pragma unroll + for(int limb = 1; limb < Params::LIMBS; limb++) { + s[limb] = s[limb - 1] + span; + } + + for(uint32_t i = t; i < span; i += p) { + uint32_t global_idx = base + i; + if(global_idx < n) { +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + s[limb][i] = data.limbs[limb][global_idx]; + } + } + } + __syncthreads(); + +#pragma unroll + for(int st = 0; st < TAIL_LOG; st++) { + int stage = stage_start - st; + uint32_t half = n >> (stage + 1); + uint32_t half_mask = half - 1; + uint32_t tw_stride = 1u << stage; + + for(uint32_t bt = t; bt < butterflies_per_chunk; bt += p) { + uint32_t j = bt & half_mask; + uint32_t group_base = bt & ~half_mask; + uint32_t idx_a = (group_base << 1) | j; + uint32_t idx_b = idx_a + half; + uint32_t tw_idx = j * tw_stride; + + uint64_t a[Params::LIMBS], b[Params::LIMBS], w[Params::LIMBS]; + uint64_t wb[Params::LIMBS], sum[Params::LIMBS], diff[Params::LIMBS]; +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + a[limb] = s[limb][idx_a]; + b[limb] = s[limb][idx_b]; + } + load(w, twiddles, tw_idx); + + mul(wb, b, w); + add(sum, a, wb); + sub(diff, a, wb); + +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + s[limb][idx_a] = sum[limb]; + s[limb][idx_b] = diff[limb]; + } + } + __syncthreads(); + } + + for(uint32_t i = t; i < span; i += p) { + uint32_t global_idx = base + i; + if(global_idx < n) { +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + data.limbs[limb][global_idx] = s[limb][i]; + } + } + } +} + +template +// scale_kernel: element-wise scalar multiplication v[i] *= scalar mod p. +// Used after inverse NTT to apply the 1/n normalization factor. +// Thread layout: 1D, one thread per element. +// scale_kernel: element-wise scalar multiplication v[i] *= scalar mod p. +// Used after inverse NTT to apply the 1/n normalization factor. +// Thread layout: 1D, one thread per element. +__global__ void scale_kernel(FrView data, const uint64_t *scalar, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(idx >= n) return; + uint64_t a[Params::LIMBS], out[Params::LIMBS]; + load(a, make_const(data), idx); + mul(out, a, scalar); + store(data, idx, out); +} + +template +__device__ __forceinline__ void pow_uint64( + uint64_t out[Params::LIMBS], const uint64_t base[Params::LIMBS], uint64_t exp) { + + uint64_t acc[Params::LIMBS], factor[Params::LIMBS]; + one(acc); + set(factor, base); + + while(exp != 0) { + if((exp & 1ULL) != 0) { + mul(acc, acc, factor); + } + exp >>= 1; + if(exp != 0) { + square(factor, factor); + } + } + + set(out, acc); +} + +template +// scale_by_powers_kernel: multiply v[i] *= g^i for i in [0,n). +// Used for coset FFT preparation: given poly p(X), computes p(g*X). +// Thread layout: 1D, one thread per element. g is the coset generator. +__global__ void scale_by_powers_kernel( + FrView data, ScalarArg generator_arg, const uint64_t *local_powers, size_t n) { + + __shared__ uint64_t block_base[Params::LIMBS]; + uint64_t generator[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) generator[i] = generator_arg.limbs[i]; + + size_t block_start = (size_t)blockIdx.x * blockDim.x; + + if(threadIdx.x == 0) { + pow_uint64(block_base, generator, (uint64_t)block_start); + } + __syncthreads(); + + size_t idx = block_start + threadIdx.x; + if(idx >= n) return; + + uint64_t local_power[Params::LIMBS], power[Params::LIMBS]; + uint64_t value[Params::LIMBS], out[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) { + local_power[i] = __ldg(local_powers + (size_t)threadIdx.x * Params::LIMBS + i); + } + mul(power, block_base, local_power); + load(value, make_const(data), idx); + mul(out, value, power); + store(data, idx, out); +} + +template +// local_power_table_kernel: build a local table of g^{local_idx} values. +// Used as a subproblem in scale_by_powers for large n (chunked approach). +// Thread layout: 1D, one thread per chunk element. +__global__ void local_power_table_kernel(ScalarArg generator_arg, uint64_t *local_powers) { + uint64_t generator[Params::LIMBS], power[Params::LIMBS]; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) generator[i] = generator_arg.limbs[i]; + + one(power); + for(unsigned i = 0; i < THREADS; i++) { +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + local_powers[(size_t)i * Params::LIMBS + limb] = power[limb]; + } + mul(power, power, generator); + } +} + +__device__ __forceinline__ size_t bit_reverse(size_t x, int log_n) { + uint32_t y = __brev((uint32_t)x); + return (size_t)(y >> (32 - log_n)); +} + +template +// bit_reverse_kernel: in-place bit-reversal permutation of n elements. +// Swaps element i with element bit_reverse(i, log2(n)) for all i < bit_reverse(i). +// Required to convert between natural and bit-reversed NTT orderings. +// Thread layout: 1D, n/2 threads (each thread handles one swap pair). +__global__ void bit_reverse_kernel(FrView data, size_t n, int log_n) { + size_t i = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if(i >= n) return; + + size_t j = bit_reverse(i, log_n); + if(j <= i) return; + +#pragma unroll + for(int limb = 0; limb < Params::LIMBS; limb++) { + uint64_t a = data.limbs[limb][i]; + uint64_t b = data.limbs[limb][j]; + data.limbs[limb][i] = b; + data.limbs[limb][j] = a; + } +} + +int log2_exact(size_t n) { + int log = 0; + while(n > 1) { + n >>= 1; + log++; + } + return log; +} + +template +int select_tail_log(int log_n) { + if(log_n < 10) return 0; + + int max_shmem = 0; + cudaDeviceGetAttribute(&max_shmem, cudaDevAttrMaxSharedMemoryPerBlockOptin, 0); + for(int candidate = 12; candidate >= 10; candidate--) { + size_t required = (size_t)Params::LIMBS * ((size_t)1 << candidate) * sizeof(uint64_t); + if(log_n > candidate && required <= (size_t)max_shmem) { + return candidate; + } + } + return 0; +} + +template +uint32_t radix8_min_n() { + return 1u << 18; +} + +template <> +uint32_t radix8_min_n() { + return 1u << 21; +} + +template +void set_dynamic_shared_memory(Kernel kernel, size_t shmem_bytes) { + cudaFuncSetAttribute(kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, (int)shmem_bytes); +} + +template +void launch_dif_tail_fixed(FrView data, ConstFrView twiddles, uint32_t n, + int stage_start, cudaStream_t stream) { + constexpr uint32_t span = 1u << TAIL_LOG; + unsigned threads = span > 1024 ? 1024 : span; + unsigned blocks = (n + span - 1) / span; + size_t shmem_bytes = (size_t)Params::LIMBS * span * sizeof(uint64_t); + set_dynamic_shared_memory(ntt_dif_tail_fused_kernel, shmem_bytes); + ntt_dif_tail_fused_kernel<<>>( + data, twiddles, n, stage_start); +} + +template +void launch_dif_tail(FrView data, ConstFrView twiddles, uint32_t n, + int stage_start, int tail_log, cudaStream_t stream) { + switch(tail_log) { + case 12: + launch_dif_tail_fixed(data, twiddles, n, stage_start, stream); + break; + case 11: + launch_dif_tail_fixed(data, twiddles, n, stage_start, stream); + break; + case 10: + launch_dif_tail_fixed(data, twiddles, n, stage_start, stream); + break; + default: + break; + } +} + +template +void launch_dit_tail_fixed(FrView data, ConstFrView twiddles, uint32_t n, + int stage_start, cudaStream_t stream) { + constexpr uint32_t span = 1u << TAIL_LOG; + unsigned threads = span > 1024 ? 1024 : span; + unsigned blocks = (n + span - 1) / span; + size_t shmem_bytes = (size_t)Params::LIMBS * span * sizeof(uint64_t); + set_dynamic_shared_memory(ntt_dit_tail_fused_kernel, shmem_bytes); + ntt_dit_tail_fused_kernel<<>>( + data, twiddles, n, stage_start); +} + +template +void launch_dit_tail(FrView data, ConstFrView twiddles, uint32_t n, + int stage_start, int tail_log, cudaStream_t stream) { + switch(tail_log) { + case 12: + launch_dit_tail_fixed(data, twiddles, n, stage_start, stream); + break; + case 11: + launch_dit_tail_fixed(data, twiddles, n, stage_start, stream); + break; + case 10: + launch_dit_tail_fixed(data, twiddles, n, stage_start, stream); + break; + default: + break; + } +} + +template +void launch_ntt_forward_typed(FrView data, ConstFrView twiddles, size_t n, + cudaStream_t stream) { + const int log_n = log2_exact(n); + const size_t butterflies = n >> 1; + unsigned blocks_r2 = (unsigned)((butterflies + NTT_THREADS - 1) / NTT_THREADS); + uint32_t n32 = (uint32_t)n; + uint32_t radix8_count = n32 >> 3; + unsigned blocks_r8 = (radix8_count + NTT_THREADS - 1) / NTT_THREADS; + + int tail_log = select_tail_log(log_n); + bool use_tail = tail_log > 0 && n >= NTT_FUSED_TAIL_MIN_N; + int regular_stages = use_tail ? log_n - tail_log : log_n; + + int stage = 0; + if(n >= radix8_min_n()) { + for(; stage + 2 < regular_stages; stage += 3) { + ntt_dif_radix8_kernel<<>>( + data, twiddles, n32, stage); + } + } + for(; stage < regular_stages; stage++) { + size_t half = n >> (stage + 1); + size_t tw_stride = (size_t)1 << stage; + ntt_dif_stage_kernel<<>>( + data, twiddles, butterflies, half, half - 1, tw_stride); + } + + if(use_tail) { + launch_dif_tail(data, twiddles, n32, regular_stages, tail_log, stream); + } +} + +template +void launch_ntt_inverse_typed(FrView data, ConstFrView twiddles, const uint64_t *inv_n, + size_t n, cudaStream_t stream) { + const int log_n = log2_exact(n); + const size_t butterflies = n >> 1; + unsigned blocks_r2 = (unsigned)((butterflies + NTT_THREADS - 1) / NTT_THREADS); + uint32_t n32 = (uint32_t)n; + uint32_t radix8_count = n32 >> 3; + unsigned blocks_r8 = (radix8_count + NTT_THREADS - 1) / NTT_THREADS; + + int tail_log = select_tail_log(log_n); + bool use_tail = tail_log > 0 && n >= NTT_FUSED_TAIL_MIN_N; + int stage = log_n - 1; + if(use_tail) { + launch_dit_tail(data, twiddles, n32, stage, tail_log, stream); + stage -= tail_log; + } + + bool scaled = false; + if(n >= radix8_min_n()) { + for(; stage - 2 >= 0; stage -= 3) { + if(stage < 3) { + ntt_dit_radix8_kernel<<>>( + data, twiddles, inv_n, n32, stage); + scaled = true; + } else { + ntt_dit_radix8_kernel<<>>( + data, twiddles, inv_n, n32, stage); + } + } + } + for(; stage >= 0; stage--) { + size_t half = n >> (stage + 1); + size_t tw_stride = (size_t)1 << stage; + if(stage == 0) { + ntt_dit_stage_scale_kernel<<>>( + data, twiddles, inv_n, butterflies, half, half - 1, tw_stride); + scaled = true; + } else { + ntt_dit_stage_kernel<<>>( + data, twiddles, butterflies, half, half - 1, tw_stride); + } + } + + if(!scaled) { + unsigned scale_blocks = (unsigned)((n + NTT_THREADS - 1) / NTT_THREADS); + scale_kernel<<>>(data, inv_n, n); + } +} + +} // namespace + +void launch_copy_aos_to_soa( + gnark_gpu_plonk2_curve_id_t curve, FrView dst, const uint64_t *src, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + copy_aos_to_soa_kernel<<>>(dst, src, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + copy_aos_to_soa_kernel<<>>(dst, src, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + copy_aos_to_soa_kernel<<>>(dst, src, n); + break; + default: + break; + } +} + +void launch_copy_soa_to_aos( + gnark_gpu_plonk2_curve_id_t curve, uint64_t *dst, ConstFrView src, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + copy_soa_to_aos_kernel<<>>(dst, src, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + copy_soa_to_aos_kernel<<>>(dst, src, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + copy_soa_to_aos_kernel<<>>(dst, src, n); + break; + default: + break; + } +} + +void launch_set_zero( + gnark_gpu_plonk2_curve_id_t curve, FrView v, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + set_zero_kernel<<>>(v, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + set_zero_kernel<<>>(v, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + set_zero_kernel<<>>(v, n); + break; + default: + break; + } +} + +void launch_add( + gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, ConstFrView b, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + add_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + add_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + add_kernel<<>>(out, a, b, n); + break; + default: + break; + } +} + +void launch_sub( + gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, ConstFrView b, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + sub_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + sub_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + sub_kernel<<>>(out, a, b, n); + break; + default: + break; + } +} + +void launch_mul( + gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, ConstFrView b, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + mul_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + mul_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + mul_kernel<<>>(out, a, b, n); + break; + default: + break; + } +} + +void launch_addmul( + gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, ConstFrView b, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + addmul_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + addmul_kernel<<>>(out, a, b, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + addmul_kernel<<>>(out, a, b, n); + break; + default: + break; + } +} + +void launch_scalar_mul( + gnark_gpu_plonk2_curve_id_t curve, FrView out, const uint64_t *scalar, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg scalar_arg = make_scalar_arg(curve, scalar); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + scalar_mul_kernel<<>>( + out, scalar_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + scalar_mul_kernel<<>>( + out, scalar_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + scalar_mul_kernel<<>>( + out, scalar_arg, n); + break; + default: + break; + } +} + +void launch_add_scalar_mul( + gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView a, + const uint64_t *scalar, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg scalar_arg = make_scalar_arg(curve, scalar); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + add_scalar_mul_kernel<<>>( + out, a, scalar_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + add_scalar_mul_kernel<<>>( + out, a, scalar_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + add_scalar_mul_kernel<<>>( + out, a, scalar_arg, n); + break; + default: + break; + } +} + +void launch_batch_invert( + gnark_gpu_plonk2_curve_id_t curve, FrView data, size_t n, + cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + invert_kernel<<>>(data, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + invert_kernel<<>>(data, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + invert_kernel<<>>(data, n); + break; + default: + break; + } +} + +void launch_butterfly4_inverse( + gnark_gpu_plonk2_curve_id_t curve, FrView b0, FrView b1, FrView b2, FrView b3, + const uint64_t *omega4_inv, const uint64_t *quarter, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg omega4_inv_arg = make_scalar_arg(curve, omega4_inv); + ScalarArg quarter_arg = make_scalar_arg(curve, quarter); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + butterfly4_inverse_kernel<<>>( + b0, b1, b2, b3, omega4_inv_arg, quarter_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + butterfly4_inverse_kernel<<>>( + b0, b1, b2, b3, omega4_inv_arg, quarter_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + butterfly4_inverse_kernel<<>>( + b0, b1, b2, b3, omega4_inv_arg, quarter_arg, n); + break; + default: + break; + } +} + +void launch_reduce_blinded_coset( + gnark_gpu_plonk2_curve_id_t curve, FrView dst, ConstFrView src, + const uint64_t *tail, size_t tail_len, const uint64_t *coset_pow_n, + size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg coset_pow_n_arg = make_scalar_arg(curve, coset_pow_n); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + reduce_blinded_coset_kernel<<>>( + dst, src, tail, coset_pow_n_arg, n, tail_len); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + reduce_blinded_coset_kernel<<>>( + dst, src, tail, coset_pow_n_arg, n, tail_len); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + reduce_blinded_coset_kernel<<>>( + dst, src, tail, coset_pow_n_arg, n, tail_len); + break; + default: + break; + } +} + +void launch_compute_l1_den( + gnark_gpu_plonk2_curve_id_t curve, FrView out, ConstFrView twiddles, + const uint64_t *coset_gen, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg coset_gen_arg = make_scalar_arg(curve, coset_gen); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + compute_l1_den_kernel<<>>( + out, twiddles, coset_gen_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + compute_l1_den_kernel<<>>( + out, twiddles, coset_gen_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + compute_l1_den_kernel<<>>( + out, twiddles, coset_gen_arg, n); + break; + default: + break; + } +} + +void launch_gate_accum( + gnark_gpu_plonk2_curve_id_t curve, FrView result, + ConstFrView ql, ConstFrView qr, ConstFrView qm, ConstFrView qo, ConstFrView qk, + ConstFrView l, ConstFrView r, ConstFrView o, + const uint64_t *zh_k_inv, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg zh_k_inv_arg = make_scalar_arg(curve, zh_k_inv); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + gate_accum_kernel<<>>( + result, ql, qr, qm, qo, qk, l, r, o, zh_k_inv_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + gate_accum_kernel<<>>( + result, ql, qr, qm, qo, qk, l, r, o, zh_k_inv_arg, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + gate_accum_kernel<<>>( + result, ql, qr, qm, qo, qk, l, r, o, zh_k_inv_arg, n); + break; + default: + break; + } +} + +void launch_linearize_static( + gnark_gpu_plonk2_curve_id_t curve, FrView result, + ConstFrView z, ConstFrView s3, + ConstFrView ql, ConstFrView qr, ConstFrView qm, ConstFrView qo, ConstFrView qk, + const uint64_t *scalars, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + int limbs = curve_limbs(curve); + ScalarArg combined_z = make_scalar_arg(curve, scalars); + ScalarArg s1 = make_scalar_arg(curve, scalars + limbs); + ScalarArg l = make_scalar_arg(curve, scalars + 2 * limbs); + ScalarArg r = make_scalar_arg(curve, scalars + 3 * limbs); + ScalarArg rl = make_scalar_arg(curve, scalars + 4 * limbs); + ScalarArg o = make_scalar_arg(curve, scalars + 5 * limbs); + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + linearize_static_kernel<<>>( + result, z, s3, ql, qr, qm, qo, qk, combined_z, s1, l, r, rl, o, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + linearize_static_kernel<<>>( + result, z, s3, ql, qr, qm, qo, qk, combined_z, s1, l, r, rl, o, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + linearize_static_kernel<<>>( + result, z, s3, ql, qr, qm, qo, qk, combined_z, s1, l, r, rl, o, n); + break; + default: + break; + } +} + +void launch_subtract_head( + gnark_gpu_plonk2_curve_id_t curve, FrView data, const uint64_t *tail, + size_t tail_len, cudaStream_t stream) { + + if(tail_len == 0) return; + unsigned blocks = (unsigned)((tail_len + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + subtract_head_kernel<<>>( + data, tail, tail_len); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + subtract_head_kernel<<>>( + data, tail, tail_len); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + subtract_head_kernel<<>>( + data, tail, tail_len); + break; + default: + break; + } +} + +void launch_perm_boundary( + gnark_gpu_plonk2_curve_id_t curve, FrView result, + ConstFrView l, ConstFrView r, ConstFrView o, ConstFrView z, + ConstFrView s1, ConstFrView s2, ConstFrView s3, ConstFrView l1_den_inv, + ConstFrView twiddles, const uint64_t *params, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + int limbs = curve_limbs(curve); + ScalarArg alpha = make_scalar_arg(curve, params); + ScalarArg beta = make_scalar_arg(curve, params + limbs); + ScalarArg gamma = make_scalar_arg(curve, params + 2 * limbs); + ScalarArg l1_scalar = make_scalar_arg(curve, params + 3 * limbs); + ScalarArg coset_shift = make_scalar_arg(curve, params + 4 * limbs); + ScalarArg coset_shift_sq = make_scalar_arg(curve, params + 5 * limbs); + ScalarArg coset_gen = make_scalar_arg(curve, params + 6 * limbs); + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + perm_boundary_kernel<<>>( + result, l, r, o, z, s1, s2, s3, l1_den_inv, twiddles, + alpha, beta, gamma, l1_scalar, coset_shift, coset_shift_sq, coset_gen, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + perm_boundary_kernel<<>>( + result, l, r, o, z, s1, s2, s3, l1_den_inv, twiddles, + alpha, beta, gamma, l1_scalar, coset_shift, coset_shift_sq, coset_gen, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + perm_boundary_kernel<<>>( + result, l, r, o, z, s1, s2, s3, l1_den_inv, twiddles, + alpha, beta, gamma, l1_scalar, coset_shift, coset_shift_sq, coset_gen, n); + break; + default: + break; + } +} + +void launch_z_compute_factors( + gnark_gpu_plonk2_curve_id_t curve, FrView l_inout, FrView r_inout, + ConstFrView o, const int64_t *perm, ConstFrView twiddles, + const uint64_t *params, size_t n, unsigned log2n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + int limbs = curve_limbs(curve); + ScalarArg beta = make_scalar_arg(curve, params); + ScalarArg gamma = make_scalar_arg(curve, params + limbs); + ScalarArg coset_shift = make_scalar_arg(curve, params + 2 * limbs); + ScalarArg coset_shift_sq = make_scalar_arg(curve, params + 3 * limbs); + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + z_compute_factors_kernel<<>>( + l_inout, r_inout, o, perm, twiddles, beta, gamma, + coset_shift, coset_shift_sq, n, log2n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + z_compute_factors_kernel<<>>( + l_inout, r_inout, o, perm, twiddles, beta, gamma, + coset_shift, coset_shift_sq, n, log2n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + z_compute_factors_kernel<<>>( + l_inout, r_inout, o, perm, twiddles, beta, gamma, + coset_shift, coset_shift_sq, n, log2n); + break; + default: + break; + } +} + +void launch_z_prefix_phase1( + gnark_gpu_plonk2_curve_id_t curve, FrView z, ConstFrView ratio, + uint64_t *chunk_products, size_t n, cudaStream_t stream) { + + size_t num_chunks = (n + Z_PREFIX_CHUNK_SIZE - 1) / Z_PREFIX_CHUNK_SIZE; + unsigned blocks = (unsigned)((num_chunks + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + z_prefix_local_kernel<<>>( + z, ratio, chunk_products, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + z_prefix_local_kernel<<>>( + z, ratio, chunk_products, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + z_prefix_local_kernel<<>>( + z, ratio, chunk_products, n); + break; + default: + break; + } +} + +void launch_z_prefix_phase3( + gnark_gpu_plonk2_curve_id_t curve, FrView z, FrView temp, + const uint64_t *scanned_prefixes, size_t num_chunks, size_t n, + cudaStream_t stream) { + + unsigned chunk_blocks = (unsigned)((num_chunks + THREADS - 1) / THREADS); + unsigned n_blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + z_prefix_fixup_kernel<<>>( + z, scanned_prefixes, n); + for(int i = 0; i < BN254FrParams::LIMBS; i++) { + cudaMemcpyAsync(temp.limbs[i], z.limbs[i], n * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, stream); + } + z_prefix_shift_right_kernel<<>>( + z, make_const(temp), n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + z_prefix_fixup_kernel<<>>( + z, scanned_prefixes, n); + for(int i = 0; i < BLS12377FrParams::LIMBS; i++) { + cudaMemcpyAsync(temp.limbs[i], z.limbs[i], n * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, stream); + } + z_prefix_shift_right_kernel<<>>( + z, make_const(temp), n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + z_prefix_fixup_kernel<<>>( + z, scanned_prefixes, n); + for(int i = 0; i < BW6761FrParams::LIMBS; i++) { + cudaMemcpyAsync(temp.limbs[i], z.limbs[i], n * sizeof(uint64_t), + cudaMemcpyDeviceToDevice, stream); + } + z_prefix_shift_right_kernel<<>>( + z, make_const(temp), n); + break; + default: + break; + } +} + +void launch_poly_eval_chunks( + gnark_gpu_plonk2_curve_id_t curve, ConstFrView coeffs, const uint64_t *z, + uint64_t *partials, size_t n, cudaStream_t stream) { + + size_t num_chunks = (n + POLY_EVAL_CHUNK_SIZE - 1) / POLY_EVAL_CHUNK_SIZE; + unsigned blocks = (unsigned)((num_chunks + THREADS - 1) / THREADS); + ScalarArg z_arg = make_scalar_arg(curve, z); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + poly_eval_chunks_kernel<<>>( + coeffs, z_arg, partials, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + poly_eval_chunks_kernel<<>>( + coeffs, z_arg, partials, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + poly_eval_chunks_kernel<<>>( + coeffs, z_arg, partials, n); + break; + default: + break; + } +} + +void launch_ntt_forward( + gnark_gpu_plonk2_curve_id_t curve, FrView data, ConstFrView twiddles, + size_t n, cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + launch_ntt_forward_typed(data, twiddles, n, stream); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + launch_ntt_forward_typed(data, twiddles, n, stream); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + launch_ntt_forward_typed(data, twiddles, n, stream); + break; + default: + break; + } +} + +void launch_ntt_inverse( + gnark_gpu_plonk2_curve_id_t curve, FrView data, ConstFrView twiddles, + const uint64_t *inv_n, size_t n, cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + launch_ntt_inverse_typed(data, twiddles, inv_n, n, stream); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + launch_ntt_inverse_typed(data, twiddles, inv_n, n, stream); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + launch_ntt_inverse_typed(data, twiddles, inv_n, n, stream); + break; + default: + break; + } +} + +void launch_scale_by_powers( + gnark_gpu_plonk2_curve_id_t curve, FrView data, const uint64_t *generator, + uint64_t *local_powers, size_t n, cudaStream_t stream) { + + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + ScalarArg generator_arg = make_scalar_arg(curve, generator); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + local_power_table_kernel<<<1, 1, 0, stream>>>( + generator_arg, local_powers); + scale_by_powers_kernel<<>>( + data, generator_arg, local_powers, n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + local_power_table_kernel<<<1, 1, 0, stream>>>( + generator_arg, local_powers); + scale_by_powers_kernel<<>>( + data, generator_arg, local_powers, n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + local_power_table_kernel<<<1, 1, 0, stream>>>( + generator_arg, local_powers); + scale_by_powers_kernel<<>>( + data, generator_arg, local_powers, n); + break; + default: + break; + } +} + +void launch_bit_reverse( + gnark_gpu_plonk2_curve_id_t curve, FrView data, size_t n, cudaStream_t stream) { + + int log_n = log2_exact(n); + unsigned blocks = (unsigned)((n + THREADS - 1) / THREADS); + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + bit_reverse_kernel<<>>(data, n, log_n); + break; + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + bit_reverse_kernel<<>>(data, n, log_n); + break; + case GNARK_GPU_PLONK2_CURVE_BW6_761: + bit_reverse_kernel<<>>(data, n, log_n); + break; + default: + break; + } +} + +} // namespace gnark_gpu::plonk2 diff --git a/prover/gpu/cuda/src/plonk2/mimc.cu b/prover/gpu/cuda/src/plonk2/mimc.cu new file mode 100644 index 00000000000..f672022f5cd --- /dev/null +++ b/prover/gpu/cuda/src/plonk2/mimc.cu @@ -0,0 +1,893 @@ +#include "gnark_gpu.h" +#include "field.cuh" + +#include +#include +#include +#include +#include +#include + +namespace { + +using Params = gnark_gpu::plonk2::BLS12377FrParams; + +constexpr unsigned THREADS = 256; +constexpr unsigned SIS_THREADS = 64; +constexpr unsigned SIS_COLS_PER_BLOCK = 2; +constexpr int MIMC_ROUNDS = 62; +constexpr int SIS_DEGREE = 64; +constexpr int SIS_LIMBS_PER_FIELD = 16; +constexpr int SIS_TWIDDLES_SIZE = 69; + +enum RowKind : uint8_t { + ROW_KIND_REGULAR = 0, + ROW_KIND_CONSTANT = 1, +}; + +__host__ __forceinline__ int grid(size_t n) { + return static_cast((n + THREADS - 1) / THREADS); +} + +using Clock = std::chrono::steady_clock; +using TimePoint = std::chrono::time_point; + +bool timing_enabled() { + static const bool enabled = [] { + const char *v = std::getenv("LINEA_PROVER_GPU_PI_VORTEX_TIMINGS"); + return v != nullptr && v[0] != '\0' && !(v[0] == '0' && v[1] == '\0'); + }(); + return enabled; +} + +TimePoint now_if(bool enabled) { + return enabled ? Clock::now() : TimePoint{}; +} + +double elapsed_ms(TimePoint start, TimePoint stop) { + return std::chrono::duration(stop - start).count(); +} + +void log_timing(const char *name, size_t rows, size_t cols, size_t elems, + double malloc_ms, double h2d_ms, double static_h2d_ms, + double leaf_ms, double tree_ms, double d2h_ms, double total_ms) { + std::fprintf(stderr, + "[gpu-pi-vortex] op=%s rows=%zu cols=%zu elems=%zu " + "malloc=%.3fms h2d=%.3fms static_h2d=%.3fms leaf=%.3fms " + "tree=%.3fms d2h=%.3fms total=%.3fms\n", + name, rows, cols, elems, malloc_ms, h2d_ms, static_h2d_ms, + leaf_ms, tree_ms, d2h_ms, total_ms); +} + +gnark_gpu_error_t check(cudaError_t err) { + if(err == cudaSuccess) return GNARK_GPU_SUCCESS; + if(err == cudaErrorMemoryAllocation) return GNARK_GPU_ERROR_OUT_OF_MEMORY; + return GNARK_GPU_ERROR_CUDA; +} + +__device__ __forceinline__ void load_aos(uint64_t out[Params::LIMBS], + const uint64_t *src, size_t idx) { + const uint64_t *p = src + idx * Params::LIMBS; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) out[i] = p[i]; +} + +__device__ __forceinline__ void store_aos(uint64_t *dst, size_t idx, + const uint64_t in[Params::LIMBS]) { + uint64_t *p = dst + idx * Params::LIMBS; +#pragma unroll + for(int i = 0; i < Params::LIMBS; i++) p[i] = in[i]; +} + +__device__ __forceinline__ size_t twiddle_offset(int stage) { + size_t off = 0; +#pragma unroll + for(int i = 0; i < 6; i++) { + if(i >= stage) break; + off += 1 + (SIS_DEGREE >> (i + 1)); + } + return off; +} + +__device__ __forceinline__ void butterfly(uint64_t a[Params::LIMBS], + uint64_t b[Params::LIMBS]) { + uint64_t t[Params::LIMBS]; + gnark_gpu::plonk2::set(t, a); + gnark_gpu::plonk2::add(a, a, b); + gnark_gpu::plonk2::sub(b, t, b); +} + +__device__ __forceinline__ void set_raw_u64(uint64_t out[Params::LIMBS], uint64_t v) { + out[0] = v; +#pragma unroll + for(int i = 1; i < Params::LIMBS; i++) out[i] = 0; +} + +__device__ __forceinline__ void from_montgomery( + uint64_t out[Params::LIMBS], + const uint64_t in[Params::LIMBS]) { + uint64_t raw_one[Params::LIMBS] = {1, 0, 0, 0}; + gnark_gpu::plonk2::mul(out, in, raw_one); +} + +__device__ __forceinline__ uint16_t limb16(const uint64_t raw[Params::LIMBS], int limb) { + return static_cast((raw[limb >> 2] >> ((limb & 3) * 16)) & 0xffffULL); +} + +__device__ __forceinline__ void fft_dif_coset_64_at( + uint64_t a[SIS_DEGREE][Params::LIMBS], + const uint64_t *twiddles, + const uint64_t *coset, + int tid) { + + uint64_t c[Params::LIMBS]; + load_aos(c, coset, tid); + gnark_gpu::plonk2::mul(a[tid], a[tid], c); + __syncthreads(); + +#pragma unroll + for(int stage = 0; stage < 6; stage++) { + const int half = SIS_DEGREE >> (stage + 1); + const int segment = half << 1; + const int local = tid & (segment - 1); + if(local < half) { + const int i = tid - local + local; + const int j = i + half; + butterfly(a[i], a[j]); + if(local != 0) { + uint64_t tw[Params::LIMBS]; + load_aos(tw, twiddles, twiddle_offset(stage) + local); + gnark_gpu::plonk2::mul(a[j], a[j], tw); + } + } + __syncthreads(); + } +} + +__device__ __forceinline__ void fft_dif_coset_64( + uint64_t a[SIS_DEGREE][Params::LIMBS], + const uint64_t *twiddles, + const uint64_t *coset) { + + fft_dif_coset_64_at(a, twiddles, coset, threadIdx.x); +} + +__device__ __forceinline__ void fft_inverse_dit_coset_64_at( + uint64_t a[SIS_DEGREE][Params::LIMBS], + const uint64_t *twiddles_inv, + const uint64_t *coset_inv, + const uint64_t *cardinality_inv, + int tid) { + +#pragma unroll + for(int stage = 5; stage >= 0; stage--) { + const int half = SIS_DEGREE >> (stage + 1); + const int segment = half << 1; + const int local = tid & (segment - 1); + if(local < half) { + const int i = tid - local + local; + const int j = i + half; + if(local != 0) { + uint64_t tw[Params::LIMBS]; + load_aos(tw, twiddles_inv, twiddle_offset(stage) + local); + gnark_gpu::plonk2::mul(a[j], a[j], tw); + } + butterfly(a[i], a[j]); + } + __syncthreads(); + } + + uint64_t c[Params::LIMBS], n_inv[Params::LIMBS]; + load_aos(c, coset_inv, tid); + load_aos(n_inv, cardinality_inv, 0); + gnark_gpu::plonk2::mul(a[tid], a[tid], c); + gnark_gpu::plonk2::mul(a[tid], a[tid], n_inv); + __syncthreads(); +} + +__device__ __forceinline__ void fft_inverse_dit_coset_64( + uint64_t a[SIS_DEGREE][Params::LIMBS], + const uint64_t *twiddles_inv, + const uint64_t *coset_inv, + const uint64_t *cardinality_inv) { + + fft_inverse_dit_coset_64_at(a, twiddles_inv, coset_inv, cardinality_inv, threadIdx.x); +} + +__device__ __forceinline__ void mimc_encrypt( + uint64_t out[Params::LIMBS], + const uint64_t message[Params::LIMBS], + const uint64_t key[Params::LIMBS], + const uint64_t *constants) { + + uint64_t m[Params::LIMBS]; + gnark_gpu::plonk2::set(m, message); + + for(int r = 0; r < MIMC_ROUNDS; r++) { + uint64_t c[Params::LIMBS], tmp[Params::LIMBS]; + load_aos(c, constants, static_cast(r)); + gnark_gpu::plonk2::add(tmp, m, key); + gnark_gpu::plonk2::add(tmp, tmp, c); + + // tmp^17 = ((((tmp^2)^2)^2)^2) * tmp. + gnark_gpu::plonk2::square(m, tmp); + gnark_gpu::plonk2::square(m, m); + gnark_gpu::plonk2::square(m, m); + gnark_gpu::plonk2::square(m, m); + gnark_gpu::plonk2::mul(m, m, tmp); + } + + gnark_gpu::plonk2::add(out, m, key); +} + +__device__ __forceinline__ void mimc_absorb( + uint64_t state[Params::LIMBS], + const uint64_t message[Params::LIMBS], + const uint64_t *constants) { + + uint64_t encrypted[Params::LIMBS], next[Params::LIMBS]; + mimc_encrypt(encrypted, message, state, constants); + gnark_gpu::plonk2::add(next, encrypted, state); + gnark_gpu::plonk2::add(state, next, message); +} + +__global__ void sis_leaf_kernel( + const uint64_t *__restrict__ col_hashes, + size_t chunk_size, + const uint64_t *__restrict__ constants, + uint64_t *__restrict__ nodes, + size_t num_leaves) { + + size_t leaf = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if(leaf >= num_leaves) return; + + uint64_t state[Params::LIMBS]; + gnark_gpu::plonk2::zero(state); + + size_t base = leaf * chunk_size; + for(size_t j = 0; j < chunk_size; j++) { + uint64_t msg[Params::LIMBS]; + load_aos(msg, col_hashes, base + j); + mimc_absorb(state, msg, constants); + } + + store_aos(nodes, leaf, state); +} + +__global__ void parent_kernel( + const uint64_t *__restrict__ nodes, + size_t prev_offset, + size_t next_offset, + const uint64_t *__restrict__ constants, + uint64_t *__restrict__ out_nodes, + size_t num_parents) { + + size_t parent = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if(parent >= num_parents) return; + + uint64_t state[Params::LIMBS], left[Params::LIMBS], right[Params::LIMBS]; + gnark_gpu::plonk2::zero(state); + load_aos(left, nodes, prev_offset + 2 * parent); + load_aos(right, nodes, prev_offset + 2 * parent + 1); + mimc_absorb(state, left, constants); + mimc_absorb(state, right, constants); + store_aos(out_nodes, next_offset + parent, state); +} + +__global__ void sis_mimc_leaf_kernel( + const uint64_t *__restrict__ rows, + const uint8_t *__restrict__ row_kinds, + const uint64_t *__restrict__ row_constants, + size_t num_rows, + size_t num_cols, + const uint64_t *__restrict__ ag, + size_t num_polys, + const uint64_t *__restrict__ twiddles, + const uint64_t *__restrict__ twiddles_inv, + const uint64_t *__restrict__ coset, + const uint64_t *__restrict__ coset_inv, + const uint64_t *__restrict__ cardinality_inv, + const uint64_t *__restrict__ mimc_constants, + uint64_t *__restrict__ out_col_hashes, + uint64_t *__restrict__ out_nodes) { + + const size_t col = static_cast(blockIdx.x); + if(col >= num_cols || threadIdx.x >= SIS_THREADS) return; + + const int tid = threadIdx.x; + __shared__ uint64_t k[SIS_DEGREE][Params::LIMBS]; + __shared__ uint64_t res[SIS_DEGREE][Params::LIMBS]; + __shared__ uint64_t raw_rows[4][Params::LIMBS]; + + gnark_gpu::plonk2::zero(res[tid]); + + for(size_t poly = 0; poly < num_polys; poly++) { + const int local_row = tid / SIS_LIMBS_PER_FIELD; + const int local_limb = tid - local_row * SIS_LIMBS_PER_FIELD; + const size_t row = poly * 4 + static_cast(local_row); + + if(local_limb == 0) { + if(row < num_rows) { + uint64_t mont[Params::LIMBS]; + if(row_kinds[row] == ROW_KIND_CONSTANT) { + load_aos(mont, row_constants, row); + } else { + load_aos(mont, rows, row * num_cols + col); + } + from_montgomery(raw_rows[local_row], mont); + } else { + gnark_gpu::plonk2::zero(raw_rows[local_row]); + } + } + __syncthreads(); + + const uint16_t l = limb16(raw_rows[local_row], local_limb); + set_raw_u64(k[tid], static_cast(l)); + __syncthreads(); + + fft_dif_coset_64(k, twiddles, coset); + + uint64_t a[Params::LIMBS], prod[Params::LIMBS]; + load_aos(a, ag, poly * SIS_DEGREE + static_cast(tid)); + gnark_gpu::plonk2::mul(prod, k[tid], a); + gnark_gpu::plonk2::add(res[tid], res[tid], prod); + __syncthreads(); + } + + fft_inverse_dit_coset_64(res, twiddles_inv, coset_inv, cardinality_inv); + + store_aos(out_col_hashes, col * SIS_DEGREE + static_cast(tid), res[tid]); + __syncthreads(); + + if(tid == 0) { + uint64_t state[Params::LIMBS]; + gnark_gpu::plonk2::zero(state); + for(int j = 0; j < SIS_DEGREE; j++) { + mimc_absorb(state, res[j], mimc_constants); + } + store_aos(out_nodes, col, state); + } +} + +__global__ void sis_mimc_leaf_kernel_tiled2( + const uint64_t *__restrict__ rows, + const uint8_t *__restrict__ row_kinds, + const uint64_t *__restrict__ row_constants, + size_t num_rows, + size_t num_cols, + const uint64_t *__restrict__ ag, + size_t num_polys, + const uint64_t *__restrict__ twiddles, + const uint64_t *__restrict__ twiddles_inv, + const uint64_t *__restrict__ coset, + const uint64_t *__restrict__ coset_inv, + const uint64_t *__restrict__ cardinality_inv, + const uint64_t *__restrict__ mimc_constants, + uint64_t *__restrict__ out_col_hashes, + uint64_t *__restrict__ out_nodes) { + + const int tile = threadIdx.x / SIS_THREADS; + const int tid = threadIdx.x - tile * SIS_THREADS; + const size_t col = static_cast(blockIdx.x) * SIS_COLS_PER_BLOCK + + static_cast(tile); + if(tile >= SIS_COLS_PER_BLOCK || col >= num_cols) return; + + __shared__ uint64_t k[SIS_COLS_PER_BLOCK][SIS_DEGREE][Params::LIMBS]; + __shared__ uint64_t res[SIS_COLS_PER_BLOCK][SIS_DEGREE][Params::LIMBS]; + __shared__ uint64_t raw_rows[SIS_COLS_PER_BLOCK][4][Params::LIMBS]; + + gnark_gpu::plonk2::zero(res[tile][tid]); + + for(size_t poly = 0; poly < num_polys; poly++) { + const int local_row = tid / SIS_LIMBS_PER_FIELD; + const int local_limb = tid - local_row * SIS_LIMBS_PER_FIELD; + const size_t row = poly * 4 + static_cast(local_row); + + if(local_limb == 0) { + if(row < num_rows) { + uint64_t mont[Params::LIMBS]; + if(row_kinds[row] == ROW_KIND_CONSTANT) { + load_aos(mont, row_constants, row); + } else { + load_aos(mont, rows, row * num_cols + col); + } + from_montgomery(raw_rows[tile][local_row], mont); + } else { + gnark_gpu::plonk2::zero(raw_rows[tile][local_row]); + } + } + __syncthreads(); + + const uint16_t l = limb16(raw_rows[tile][local_row], local_limb); + set_raw_u64(k[tile][tid], static_cast(l)); + __syncthreads(); + + fft_dif_coset_64_at(k[tile], twiddles, coset, tid); + + uint64_t a[Params::LIMBS], prod[Params::LIMBS]; + load_aos(a, ag, poly * SIS_DEGREE + static_cast(tid)); + gnark_gpu::plonk2::mul(prod, k[tile][tid], a); + gnark_gpu::plonk2::add(res[tile][tid], res[tile][tid], prod); + __syncthreads(); + } + + fft_inverse_dit_coset_64_at(res[tile], twiddles_inv, coset_inv, cardinality_inv, tid); + + store_aos(out_col_hashes, col * SIS_DEGREE + static_cast(tid), res[tile][tid]); + __syncthreads(); + + if(tid == 0) { + uint64_t state[Params::LIMBS]; + gnark_gpu::plonk2::zero(state); + for(int j = 0; j < SIS_DEGREE; j++) { + mimc_absorb(state, res[tile][j], mimc_constants); + } + store_aos(out_nodes, col, state); + } +} + +bool is_power_of_two(size_t n) { + return n != 0 && (n & (n - 1)) == 0; +} + +} // namespace + +extern "C" gnark_gpu_error_t gnark_gpu_bls12377_mimc_sis_tree( + gnark_gpu_context_t ctx, + const uint64_t *col_hashes, + size_t num_leaves, + size_t chunk_size, + const uint64_t *constants, + uint64_t *out_nodes) { + + if(ctx == nullptr || col_hashes == nullptr || constants == nullptr || out_nodes == nullptr) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if(!is_power_of_two(num_leaves) || chunk_size == 0) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + const size_t input_elems = num_leaves * chunk_size; + if(input_elems / chunk_size != num_leaves) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + const size_t total_nodes = 2 * num_leaves - 1; + if(total_nodes < num_leaves) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + uint64_t *d_input = nullptr; + uint64_t *d_constants = nullptr; + uint64_t *d_nodes = nullptr; + const bool timed = timing_enabled(); + const auto t_total_start = now_if(timed); + auto t_phase = t_total_start; + double malloc_ms = 0; + double h2d_ms = 0; + double static_h2d_ms = 0; + double leaf_ms = 0; + double tree_ms = 0; + double d2h_ms = 0; + + auto cleanup = [&]() { + if(d_input != nullptr) cudaFree(d_input); + if(d_constants != nullptr) cudaFree(d_constants); + if(d_nodes != nullptr) cudaFree(d_nodes); + }; + + cudaError_t err = cudaMalloc(&d_input, input_elems * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_constants, MIMC_ROUNDS * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_nodes, total_nodes * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + const auto t_now = Clock::now(); + malloc_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + err = cudaMemcpy(d_input, col_hashes, input_elems * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + const auto t_now = Clock::now(); + h2d_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + err = cudaMemcpy(d_constants, constants, MIMC_ROUNDS * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + const auto t_now = Clock::now(); + static_h2d_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + sis_leaf_kernel<<>>(d_input, chunk_size, d_constants, d_nodes, num_leaves); + err = cudaGetLastError(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + err = cudaDeviceSynchronize(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + const auto t_now = Clock::now(); + leaf_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + size_t prev_offset = 0; + size_t next_offset = num_leaves; + size_t level_size = num_leaves; + while(level_size > 1) { + size_t parents = level_size / 2; + parent_kernel<<>>( + d_nodes, prev_offset, next_offset, d_constants, d_nodes, parents); + err = cudaGetLastError(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + prev_offset = next_offset; + next_offset += parents; + level_size = parents; + } + if(timed) { + err = cudaDeviceSynchronize(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + const auto t_now = Clock::now(); + tree_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + err = cudaMemcpy(out_nodes, d_nodes, total_nodes * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaDeviceSynchronize(); + if(timed) { + const auto t_now = Clock::now(); + d2h_ms = elapsed_ms(t_phase, t_now); + log_timing("mimc_tree", chunk_size, num_leaves, input_elems, + malloc_ms, h2d_ms, static_h2d_ms, leaf_ms, tree_ms, d2h_ms, + elapsed_ms(t_total_start, t_now)); + } + cleanup(); + return check(err); +} + +extern "C" gnark_gpu_error_t gnark_gpu_bls12377_sis_mimc_tree( + gnark_gpu_context_t ctx, + const uintptr_t *row_ptrs, + const uint8_t *row_kinds, + const uint64_t *row_constants, + size_t num_rows, + size_t num_cols, + const uint64_t *ag, + size_t num_polys, + const uint64_t *twiddles, + const uint64_t *twiddles_inv, + const uint64_t *coset, + const uint64_t *coset_inv, + const uint64_t *cardinality_inv, + const uint64_t *mimc_constants, + uint64_t *out_col_hashes, + uint64_t *out_nodes) { + + if(ctx == nullptr || row_ptrs == nullptr || row_kinds == nullptr || + row_constants == nullptr || ag == nullptr || twiddles == nullptr || + twiddles_inv == nullptr || coset == nullptr || coset_inv == nullptr || + cardinality_inv == nullptr || mimc_constants == nullptr || + out_col_hashes == nullptr || out_nodes == nullptr) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if(num_rows == 0 || num_cols == 0 || !is_power_of_two(num_cols) || num_polys == 0) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + if(num_polys != (num_rows * SIS_LIMBS_PER_FIELD + SIS_DEGREE - 1) / SIS_DEGREE) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + const size_t row_elems = num_rows * num_cols; + const size_t col_hash_elems = num_cols * SIS_DEGREE; + const size_t total_nodes = 2 * num_cols - 1; + if(row_elems / num_cols != num_rows || col_hash_elems / SIS_DEGREE != num_cols || + total_nodes < num_cols) { + return GNARK_GPU_ERROR_INVALID_ARG; + } + + uint64_t *d_rows = nullptr; + uint8_t *d_row_kinds = nullptr; + uint64_t *d_row_constants = nullptr; + uint64_t *d_ag = nullptr; + uint64_t *d_twiddles = nullptr; + uint64_t *d_twiddles_inv = nullptr; + uint64_t *d_coset = nullptr; + uint64_t *d_coset_inv = nullptr; + uint64_t *d_cardinality_inv = nullptr; + uint64_t *d_mimc_constants = nullptr; + uint64_t *d_col_hashes = nullptr; + uint64_t *d_nodes = nullptr; + const bool timed = timing_enabled(); + const auto t_total_start = now_if(timed); + auto t_phase = t_total_start; + double malloc_ms = 0; + double h2d_ms = 0; + double static_h2d_ms = 0; + double leaf_ms = 0; + double tree_ms = 0; + double d2h_ms = 0; + + auto cleanup = [&]() { + if(d_rows != nullptr) cudaFree(d_rows); + if(d_row_kinds != nullptr) cudaFree(d_row_kinds); + if(d_row_constants != nullptr) cudaFree(d_row_constants); + if(d_ag != nullptr) cudaFree(d_ag); + if(d_twiddles != nullptr) cudaFree(d_twiddles); + if(d_twiddles_inv != nullptr) cudaFree(d_twiddles_inv); + if(d_coset != nullptr) cudaFree(d_coset); + if(d_coset_inv != nullptr) cudaFree(d_coset_inv); + if(d_cardinality_inv != nullptr) cudaFree(d_cardinality_inv); + if(d_mimc_constants != nullptr) cudaFree(d_mimc_constants); + if(d_col_hashes != nullptr) cudaFree(d_col_hashes); + if(d_nodes != nullptr) cudaFree(d_nodes); + }; + + cudaError_t err = cudaMalloc(&d_rows, row_elems * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_row_kinds, num_rows * sizeof(uint8_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_row_constants, num_rows * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_ag, num_polys * SIS_DEGREE * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_twiddles, SIS_TWIDDLES_SIZE * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_twiddles_inv, SIS_TWIDDLES_SIZE * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_coset, SIS_DEGREE * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_coset_inv, SIS_DEGREE * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_cardinality_inv, Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_mimc_constants, MIMC_ROUNDS * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_col_hashes, col_hash_elems * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMalloc(&d_nodes, total_nodes * Params::LIMBS * sizeof(uint64_t)); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + const auto t_now = Clock::now(); + malloc_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + for(size_t row = 0; row < num_rows; row++) { + if(row_kinds[row] != ROW_KIND_REGULAR && row_kinds[row] != ROW_KIND_CONSTANT) { + cleanup(); + return GNARK_GPU_ERROR_INVALID_ARG; + } + if(row_kinds[row] != ROW_KIND_REGULAR) { + continue; + } + const auto *src = reinterpret_cast(row_ptrs[row]); + if(src == nullptr) { + cleanup(); + return GNARK_GPU_ERROR_INVALID_ARG; + } + err = cudaMemcpy(d_rows + row * num_cols * Params::LIMBS, src, + num_cols * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + } + if(timed) { + const auto t_now = Clock::now(); + h2d_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + err = cudaMemcpy(d_row_kinds, row_kinds, num_rows * sizeof(uint8_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_row_constants, row_constants, num_rows * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_ag, ag, num_polys * SIS_DEGREE * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_twiddles, twiddles, SIS_TWIDDLES_SIZE * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_twiddles_inv, twiddles_inv, + SIS_TWIDDLES_SIZE * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_coset, coset, SIS_DEGREE * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_coset_inv, coset_inv, SIS_DEGREE * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_cardinality_inv, cardinality_inv, Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(d_mimc_constants, mimc_constants, MIMC_ROUNDS * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyHostToDevice); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + const auto t_now = Clock::now(); + static_h2d_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + if(num_cols % SIS_COLS_PER_BLOCK == 0) { + sis_mimc_leaf_kernel_tiled2<<(num_cols / SIS_COLS_PER_BLOCK), + SIS_COLS_PER_BLOCK * SIS_THREADS>>>( + d_rows, d_row_kinds, d_row_constants, num_rows, num_cols, d_ag, num_polys, + d_twiddles, d_twiddles_inv, d_coset, d_coset_inv, d_cardinality_inv, + d_mimc_constants, d_col_hashes, d_nodes); + } else { + sis_mimc_leaf_kernel<<(num_cols), SIS_THREADS>>>( + d_rows, d_row_kinds, d_row_constants, num_rows, num_cols, d_ag, num_polys, + d_twiddles, d_twiddles_inv, d_coset, d_coset_inv, d_cardinality_inv, + d_mimc_constants, d_col_hashes, d_nodes); + } + err = cudaGetLastError(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + if(timed) { + err = cudaDeviceSynchronize(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + const auto t_now = Clock::now(); + leaf_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + size_t prev_offset = 0; + size_t next_offset = num_cols; + size_t level_size = num_cols; + while(level_size > 1) { + size_t parents = level_size / 2; + parent_kernel<<>>( + d_nodes, prev_offset, next_offset, d_mimc_constants, d_nodes, parents); + err = cudaGetLastError(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + prev_offset = next_offset; + next_offset += parents; + level_size = parents; + } + if(timed) { + err = cudaDeviceSynchronize(); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + const auto t_now = Clock::now(); + tree_ms = elapsed_ms(t_phase, t_now); + t_phase = t_now; + } + + err = cudaMemcpy(out_col_hashes, d_col_hashes, + col_hash_elems * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaMemcpy(out_nodes, d_nodes, total_nodes * Params::LIMBS * sizeof(uint64_t), + cudaMemcpyDeviceToHost); + if(err != cudaSuccess) { + cleanup(); + return check(err); + } + err = cudaDeviceSynchronize(); + if(timed) { + const auto t_now = Clock::now(); + d2h_ms = elapsed_ms(t_phase, t_now); + log_timing("sis_mimc_tree", num_rows, num_cols, row_elems, + malloc_ms, h2d_ms, static_h2d_ms, leaf_ms, tree_ms, d2h_ms, + elapsed_ms(t_total_start, t_now)); + } + cleanup(); + return check(err); +} diff --git a/prover/gpu/cuda/src/plonk2/msm.cu b/prover/gpu/cuda/src/plonk2/msm.cu new file mode 100644 index 00000000000..aef741f9997 --- /dev/null +++ b/prover/gpu/cuda/src/plonk2/msm.cu @@ -0,0 +1,1117 @@ +// Curve-generic Pippenger MSM for gpu/plonk2. +// +// This backend deliberately uses gnark-crypto's short-Weierstrass affine input +// layout for every curve. It keeps the PlonK commitment surface independent of +// the BLS12-377 twisted-Edwards specialization in gpu/plonk while retaining the +// same high-level pipeline: +// +// scalar windows -> CUB radix sort -> bucket boundaries -> bucket sums +// -> per-window reductions -> Horner combination +// +// Bucket accumulation remains intentionally simple. Window reduction is +// parallelized without exposing extra tuning knobs to the Go API. + +#include "ec.cuh" + +#include +#include + +#include +#include + +namespace gnark_gpu::plonk2 { + +namespace { + +static constexpr int MSM_THREADS = 256; +static constexpr int ACCUM_PARALLEL_THREADS = 128; +static constexpr int ACCUM_SEQ_CAP = 256; +static constexpr int REDUCE_THREADS_PER_WINDOW = 128; +static constexpr int FINALIZE_THREADS = 32; + +template +__device__ __forceinline__ void load_affine(AffinePoint &p, + const uint64_t *raw) { +#pragma unroll + for(int i = 0; i < Fp::LIMBS; i++) { + p.x[i] = raw[i]; + p.y[i] = raw[Fp::LIMBS + i]; + } +} + +template +__device__ __forceinline__ void load_affine_at(AffinePoint &p, + const uint64_t *raw, + size_t idx) { + load_affine(p, raw + idx * (2 * Fp::LIMBS)); +} + +template +__device__ __forceinline__ void store_jacobian(const JacobianPoint &p, + uint64_t *raw) { +#pragma unroll + for(int i = 0; i < Fp::LIMBS; i++) { + raw[i] = p.x[i]; + raw[Fp::LIMBS + i] = p.y[i]; + raw[2 * Fp::LIMBS + i] = p.z[i]; + } +} + +template +__device__ __forceinline__ uint32_t scalar_window(const uint64_t *scalars, + size_t idx, int bit_offset, + int window_bits) { + const uint64_t *scalar = scalars + idx * Fr::LIMBS; + const int limb_idx = bit_offset >> 6; + const int bit_shift = bit_offset & 63; + if(limb_idx >= Fr::LIMBS) return 0; + + uint64_t digit = scalar[limb_idx] >> bit_shift; + if(bit_shift + window_bits > 64 && limb_idx + 1 < Fr::LIMBS) { + digit |= scalar[limb_idx + 1] << (64 - bit_shift); + } + const uint64_t mask = (uint64_t{1} << window_bits) - 1; + return static_cast(digit & mask); +} + +template +__global__ void __launch_bounds__(MSM_THREADS) build_pairs_kernel( + const uint64_t *__restrict__ scalars, + uint32_t *__restrict__ keys, + uint32_t *__restrict__ vals, + size_t count, + int window_bits, + int num_windows, + int num_buckets, + int total_buckets) { + + const size_t idx = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if(idx >= count) return; + + uint32_t carry = 0; + const uint32_t point_idx = static_cast(idx); + const uint32_t window_mask = (uint32_t{1} << window_bits) - 1; + + for(int w = 0; w < num_windows; w++) { + uint32_t digit = scalar_window(scalars, idx, w * window_bits, + window_bits); + digit = (digit & window_mask) + carry; + + carry = digit > static_cast(num_buckets) ? 1u : 0u; + const uint32_t neg_digit = (uint32_t{1} << window_bits) - digit; + uint32_t bucket = carry != 0 ? neg_digit : digit; + uint32_t sign = carry; + + const uint32_t is_overflow = (bucket == 0 && sign != 0) ? 1u : 0u; + carry |= is_overflow; + sign &= ~is_overflow; + + const size_t out_idx = idx * static_cast(num_windows) + + static_cast(w); + keys[out_idx] = bucket == 0 + ? static_cast(total_buckets) + : static_cast(w * num_buckets + bucket - 1); + vals[out_idx] = point_idx | (sign << 31); + } +} + +__global__ void __launch_bounds__(MSM_THREADS) detect_bucket_boundaries_kernel( + const uint32_t *__restrict__ sorted_keys, + uint32_t *__restrict__ bucket_offsets, + uint32_t *__restrict__ bucket_ends, + size_t assignments, + int total_buckets) { + + const size_t i = static_cast(blockIdx.x) * blockDim.x + threadIdx.x; + if(i >= assignments) return; + + const uint32_t key = sorted_keys[i]; + if(key >= static_cast(total_buckets)) return; + + if(i == 0 || sorted_keys[i - 1] != key) { + bucket_offsets[key] = static_cast(i); + } + if(i == assignments - 1 || sorted_keys[i + 1] != key) { + bucket_ends[key] = static_cast(i + 1); + } +} + +template +__global__ void __launch_bounds__(MSM_THREADS, 2) accumulate_buckets_kernel( + const uint64_t *__restrict__ points, + const uint32_t *__restrict__ point_indices, + const uint32_t *__restrict__ bucket_offsets, + const uint32_t *__restrict__ bucket_ends, + JacobianPoint *__restrict__ buckets, + int total_buckets, + bool add_to_existing, + int cap, + uint32_t *__restrict__ overflow_buckets, + uint32_t *__restrict__ overflow_count) { + + const int bucket_flat = blockIdx.x * blockDim.x + threadIdx.x; + if(bucket_flat >= total_buckets) return; + + JacobianPoint acc, tmp; + if(add_to_existing) { + acc = buckets[bucket_flat]; + } else { + jacobian_set_infinity(acc); + } + + const uint32_t start = bucket_offsets[bucket_flat]; + const uint32_t full_end = bucket_ends[bucket_flat]; + uint32_t end = full_end; + if(cap > 0 && full_end > start + static_cast(cap)) { + end = start + static_cast(cap); + if(overflow_buckets && overflow_count) { + const uint32_t slot = atomicAdd(overflow_count, 1u); + overflow_buckets[slot] = static_cast(bucket_flat); + } + } + + for(uint32_t i = start; i < end; i++) { + const uint32_t packed = point_indices[i]; + AffinePoint p; + load_affine_at(p, points, packed & 0x7fffffffu); + if((packed >> 31) != 0) { + neg(p.y, p.y); + } + jacobian_add_jacobian_affine(tmp, acc, p); + set(acc.x, tmp.x); + set(acc.y, tmp.y); + set(acc.z, tmp.z); + } + + buckets[bucket_flat] = acc; +} + +template +__device__ __noinline__ void jacobian_add_value(JacobianPoint &out, + const JacobianPoint &a, + const JacobianPoint &b) { + jacobian_add(out, a, b); +} + +template +__device__ __noinline__ void jacobian_double_value(JacobianPoint &out, + const JacobianPoint &a) { + jacobian_double(out, a); +} + +template +__global__ void __launch_bounds__(ACCUM_PARALLEL_THREADS, 2) +accumulate_buckets_parallel_kernel( + const uint64_t *__restrict__ points, + const uint32_t *__restrict__ point_indices, + const uint32_t *__restrict__ bucket_offsets, + const uint32_t *__restrict__ bucket_ends, + const uint32_t *__restrict__ overflow_buckets, + JacobianPoint *__restrict__ buckets, + bool add_to_existing, + uint32_t start_offset) { + + const int bucket_flat = overflow_buckets + ? static_cast(overflow_buckets[blockIdx.x]) + : static_cast(blockIdx.x); + const int tid = threadIdx.x; + const uint32_t start = bucket_offsets[bucket_flat] + start_offset; + const uint32_t end = bucket_ends[bucket_flat]; + if(start >= end) return; + + JacobianPoint acc, tmp; + jacobian_set_infinity(acc); + for(uint32_t i = start + static_cast(tid); i < end; + i += ACCUM_PARALLEL_THREADS) { + const uint32_t packed = point_indices[i]; + AffinePoint p; + load_affine_at(p, points, packed & 0x7fffffffu); + if((packed >> 31) != 0) { + neg(p.y, p.y); + } + jacobian_add_jacobian_affine(tmp, acc, p); + acc = tmp; + } + + extern __shared__ unsigned char shared_raw[]; + JacobianPoint *shared = + reinterpret_cast *>(shared_raw); + shared[tid] = acc; + __syncthreads(); + + for(int stride = ACCUM_PARALLEL_THREADS / 2; stride > 0; stride >>= 1) { + if(tid < stride) { + jacobian_add_value(tmp, shared[tid], shared[tid + stride]); + shared[tid] = tmp; + } + __syncthreads(); + } + + if(tid == 0) { + if(add_to_existing) { + jacobian_add_value(tmp, buckets[bucket_flat], shared[0]); + buckets[bucket_flat] = tmp; + } else { + buckets[bucket_flat] = shared[0]; + } + } +} + +template +__device__ __forceinline__ void jacobian_mul_small(JacobianPoint &out, + const JacobianPoint &in, + int k) { + jacobian_set_infinity(out); + if(k <= 0 || jacobian_is_infinity(in)) return; + + JacobianPoint base = in; + while(k > 0) { + if((k & 1) != 0) { + JacobianPoint tmp; + jacobian_add_value(tmp, out, base); + out = tmp; + } + k >>= 1; + if(k > 0) { + JacobianPoint tmp; + jacobian_double_value(tmp, base); + base = tmp; + } + } +} + +template +__global__ void __launch_bounds__(REDUCE_THREADS_PER_WINDOW, 2) +reduce_windows_partial_kernel( + const JacobianPoint *__restrict__ buckets, + JacobianPoint *__restrict__ partial_totals, + JacobianPoint *__restrict__ partial_sums, + int num_windows, + int num_buckets, + int blocks_per_window) { + + const int block_flat = blockIdx.x; + const int w = block_flat / blocks_per_window; + const int part = block_flat % blocks_per_window; + if(w >= num_windows) return; + + const int tid = threadIdx.x; + const int range_size = (num_buckets + blocks_per_window - 1) / + blocks_per_window; + const int high = num_buckets - 1 - part * range_size; + const int out_idx = w * blocks_per_window + part; + + if(high < 0) { + if(tid == 0) { + jacobian_set_infinity(partial_totals[out_idx]); + jacobian_set_infinity(partial_sums[out_idx]); + } + return; + } + + int low = high - range_size + 1; + if(low < 0) low = 0; + const int range_len = high - low + 1; + const int chunk_size = + (range_len + REDUCE_THREADS_PER_WINDOW - 1) / REDUCE_THREADS_PER_WINDOW; + int chunk_high = high - tid * chunk_size; + int chunk_low = chunk_high - chunk_size + 1; + if(chunk_low < low) chunk_low = low; + if(chunk_high > high) chunk_high = high; + const bool has_work = chunk_high >= low; + + JacobianPoint local_running, local_total, tmp; + jacobian_set_infinity(local_running); + jacobian_set_infinity(local_total); + int local_len = 0; + + if(has_work) { + for(int b = chunk_high; b >= chunk_low; b--) { + jacobian_add_value(tmp, local_running, + buckets[w * num_buckets + b]); + local_running = tmp; + + jacobian_add_value(tmp, local_total, local_running); + local_total = tmp; + local_len++; + } + } + + __shared__ JacobianPoint shared[REDUCE_THREADS_PER_WINDOW]; + shared[tid] = local_running; + __syncthreads(); + + for(int d = 1; d < REDUCE_THREADS_PER_WINDOW; d <<= 1) { + JacobianPoint addend; + const bool do_add = tid >= d; + if(do_add) addend = shared[tid - d]; + __syncthreads(); + if(do_add) { + jacobian_add_value(tmp, shared[tid], addend); + shared[tid] = tmp; + } + __syncthreads(); + } + + if(tid == 0) { + partial_sums[out_idx] = shared[REDUCE_THREADS_PER_WINDOW - 1]; + } + + JacobianPoint exclusive; + if(tid == 0) { + jacobian_set_infinity(exclusive); + } else { + exclusive = shared[tid - 1]; + } + __syncthreads(); + shared[tid] = exclusive; + __syncthreads(); + + JacobianPoint correction; + jacobian_mul_small(correction, shared[tid], local_len); + jacobian_add_value(tmp, local_total, correction); + local_total = tmp; + + shared[tid] = local_total; + __syncthreads(); + for(int stride = REDUCE_THREADS_PER_WINDOW / 2; stride > 0; stride >>= 1) { + if(tid < stride) { + jacobian_add_value(tmp, shared[tid], shared[tid + stride]); + shared[tid] = tmp; + } + __syncthreads(); + } + if(tid == 0) partial_totals[out_idx] = shared[0]; +} + +template +__global__ void reduce_windows_finalize_kernel( + const JacobianPoint *__restrict__ partial_totals, + const JacobianPoint *__restrict__ partial_sums, + JacobianPoint *__restrict__ window_results, + int num_windows, + int num_buckets, + int blocks_per_window) { + + const int w = blockIdx.x; + if(w >= num_windows) return; + + const int tid = threadIdx.x; + const int range_size = (num_buckets + blocks_per_window - 1) / + blocks_per_window; + + extern __shared__ unsigned char raw_shared[]; + JacobianPoint *shared = + reinterpret_cast *>(raw_shared); + + JacobianPoint my_total, my_sum, tmp; + int my_len = 0; + if(tid < blocks_per_window) { + const int high = num_buckets - 1 - tid * range_size; + if(high >= 0) { + int low = high - range_size + 1; + if(low < 0) low = 0; + my_len = high - low + 1; + my_total = partial_totals[w * blocks_per_window + tid]; + my_sum = partial_sums[w * blocks_per_window + tid]; + } else { + jacobian_set_infinity(my_total); + jacobian_set_infinity(my_sum); + } + } else { + jacobian_set_infinity(my_total); + jacobian_set_infinity(my_sum); + } + + shared[tid] = my_sum; + __syncthreads(); + for(int d = 1; d < FINALIZE_THREADS; d <<= 1) { + JacobianPoint addend; + const bool do_add = tid >= d && tid < blocks_per_window; + if(do_add) addend = shared[tid - d]; + __syncthreads(); + if(do_add) { + jacobian_add_value(tmp, shared[tid], addend); + shared[tid] = tmp; + } + __syncthreads(); + } + + JacobianPoint exclusive; + if(tid == 0 || tid >= blocks_per_window) { + jacobian_set_infinity(exclusive); + } else { + exclusive = shared[tid - 1]; + } + __syncthreads(); + shared[tid] = exclusive; + __syncthreads(); + + if(tid < blocks_per_window && my_len > 0) { + JacobianPoint correction; + jacobian_mul_small(correction, shared[tid], my_len); + jacobian_add_value(tmp, my_total, correction); + my_total = tmp; + } + + shared[tid] = my_total; + __syncthreads(); + for(int stride = FINALIZE_THREADS / 2; stride > 0; stride >>= 1) { + if(tid < stride) { + jacobian_add_value(tmp, shared[tid], shared[tid + stride]); + shared[tid] = tmp; + } + __syncthreads(); + } + if(tid == 0) window_results[w] = shared[0]; +} + +static int reduce_blocks_per_window(int num_windows, int num_buckets) { + int max_bpw = num_buckets / REDUCE_THREADS_PER_WINDOW; + int target_bpw = 752 / num_windows; + int bpw = max_bpw < target_bpw ? max_bpw : target_bpw; + if(bpw < 1) bpw = 1; + if(bpw > FINALIZE_THREADS) bpw = FINALIZE_THREADS; + return bpw; +} + +template +__global__ void finalize_msm_kernel( + const JacobianPoint *__restrict__ window_results, + int num_windows, + int window_bits, + uint64_t *__restrict__ out_raw) { + + if(blockIdx.x != 0 || threadIdx.x != 0) return; + + JacobianPoint acc, tmp; + jacobian_set_infinity(acc); + + for(int w = num_windows - 1; w >= 0; w--) { + if(w != num_windows - 1) { + for(int i = 0; i < window_bits; i++) { + jacobian_double(tmp, acc); + set(acc.x, tmp.x); + set(acc.y, tmp.y); + set(acc.z, tmp.z); + } + } + jacobian_add(tmp, acc, window_results[w]); + set(acc.x, tmp.x); + set(acc.y, tmp.y); + set(acc.z, tmp.z); + } + + store_jacobian(acc, out_raw); +} + +static int signed_window_count(int scalar_bits, int window_bits) { + return (scalar_bits + 1 + window_bits - 1) / window_bits; +} + +static int sort_key_bits(int total_buckets) { + int bits = 1; + while((1u << bits) <= static_cast(total_buckets)) bits++; + return bits; +} + +static int accumulation_seq_cap(size_t assignments, int total_buckets) { + size_t avg = assignments / static_cast(total_buckets); + size_t cap = 2 * avg + 64; + if(cap < static_cast(ACCUM_SEQ_CAP)) cap = ACCUM_SEQ_CAP; + if(cap > 4096) cap = 4096; + return static_cast(cap); +} + +template +cudaError_t run_msm_pippenger_core( + const uint64_t *points, + bool points_are_device_resident, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + cudaStream_t stream) { + + if(window_bits <= 1 || window_bits > 24) return cudaErrorInvalidValue; + if(count == 0 || count > static_cast(std::numeric_limits::max())) { + return cudaErrorInvalidValue; + } + + const int num_windows = signed_window_count(Fr::BITS, window_bits); + const int num_buckets = 1 << (window_bits - 1); + const int total_buckets = num_windows * num_buckets; + const int reduce_bpw = reduce_blocks_per_window(num_windows, num_buckets); + const int total_partials = num_windows * reduce_bpw; + const size_t assignments = count * static_cast(num_windows); + if(assignments > static_cast(std::numeric_limits::max())) { + return cudaErrorInvalidValue; + } + + const uint64_t *d_points = nullptr; + uint64_t *owned_d_points = nullptr; + uint64_t *d_scalars = nullptr; + uint64_t *d_out = nullptr; + uint32_t *d_keys_in = nullptr; + uint32_t *d_keys_out = nullptr; + uint32_t *d_vals_in = nullptr; + uint32_t *d_vals_out = nullptr; + uint32_t *d_bucket_offsets = nullptr; + uint32_t *d_bucket_ends = nullptr; + uint32_t *d_overflow_buckets = nullptr; + uint32_t *d_overflow_count = nullptr; + JacobianPoint *d_buckets = nullptr; + JacobianPoint *d_window_results = nullptr; + JacobianPoint *d_partial_totals = nullptr; + JacobianPoint *d_partial_sums = nullptr; + void *d_sort_temp = nullptr; + size_t sort_temp_bytes = 0; + + const size_t point_words = count * 2 * Fp::LIMBS; + const size_t scalar_words = count * Fr::LIMBS; + constexpr size_t output_words = 3 * Fp::LIMBS; + + cudaError_t err = cudaSuccess; + if(points_are_device_resident) { + d_points = points; + } else { + err = cudaMalloc(&owned_d_points, point_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + d_points = owned_d_points; + } + err = cudaMalloc(&d_scalars, scalar_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_out, output_words * sizeof(uint64_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_keys_in, assignments * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_keys_out, assignments * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_vals_in, assignments * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_vals_out, assignments * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_bucket_offsets, total_buckets * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_bucket_ends, total_buckets * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_overflow_buckets, total_buckets * sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_overflow_count, sizeof(uint32_t)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_buckets, + total_buckets * sizeof(JacobianPoint)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_window_results, + num_windows * sizeof(JacobianPoint)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_partial_totals, + total_partials * sizeof(JacobianPoint)); + if(err != cudaSuccess) goto done; + err = cudaMalloc(&d_partial_sums, + total_partials * sizeof(JacobianPoint)); + if(err != cudaSuccess) goto done; + + if(!points_are_device_resident) { + err = cudaMemcpyAsync(owned_d_points, points, + point_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + } + err = cudaMemcpyAsync(d_scalars, scalars, scalar_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) goto done; + + { + const unsigned blocks = + static_cast((count + MSM_THREADS - 1) / MSM_THREADS); + build_pairs_kernel<<>>( + d_scalars, d_keys_in, d_vals_in, count, window_bits, num_windows, + num_buckets, total_buckets); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + } + + err = cub::DeviceRadixSort::SortPairs( + nullptr, sort_temp_bytes, d_keys_in, d_keys_out, d_vals_in, d_vals_out, + assignments, 0, sort_key_bits(total_buckets), stream); + if(err != cudaSuccess) goto done; + + err = cudaMalloc(&d_sort_temp, sort_temp_bytes); + if(err != cudaSuccess) goto done; + + err = cub::DeviceRadixSort::SortPairs( + d_sort_temp, sort_temp_bytes, d_keys_in, d_keys_out, d_vals_in, + d_vals_out, assignments, 0, sort_key_bits(total_buckets), stream); + if(err != cudaSuccess) goto done; + + err = cudaMemsetAsync(d_bucket_offsets, 0, + total_buckets * sizeof(uint32_t), stream); + if(err != cudaSuccess) goto done; + err = cudaMemsetAsync(d_bucket_ends, 0, + total_buckets * sizeof(uint32_t), stream); + if(err != cudaSuccess) goto done; + + { + const unsigned blocks = static_cast( + (assignments + MSM_THREADS - 1) / MSM_THREADS); + detect_bucket_boundaries_kernel<<>>( + d_keys_out, d_bucket_offsets, d_bucket_ends, assignments, + total_buckets); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + } + + { + const int cap = accumulation_seq_cap(assignments, total_buckets); + const unsigned blocks = + static_cast((total_buckets + MSM_THREADS - 1) / + MSM_THREADS); + err = cudaMemsetAsync(d_overflow_count, 0, sizeof(uint32_t), stream); + if(err != cudaSuccess) goto done; + accumulate_buckets_kernel<<>>( + d_points, d_vals_out, d_bucket_offsets, d_bucket_ends, d_buckets, + total_buckets, false, cap, d_overflow_buckets, d_overflow_count); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + uint32_t overflow_count = 0; + err = cudaMemcpyAsync(&overflow_count, d_overflow_count, + sizeof(uint32_t), cudaMemcpyDeviceToHost, + stream); + if(err != cudaSuccess) goto done; + err = cudaStreamSynchronize(stream); + if(err != cudaSuccess) goto done; + if(overflow_count > 0) { + const size_t smem = + ACCUM_PARALLEL_THREADS * sizeof(JacobianPoint); + accumulate_buckets_parallel_kernel + <<>>( + d_points, d_vals_out, d_bucket_offsets, d_bucket_ends, + d_overflow_buckets, d_buckets, true, + static_cast(cap)); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + } + } + + reduce_windows_partial_kernel + <<>>( + d_buckets, d_partial_totals, d_partial_sums, num_windows, + num_buckets, reduce_bpw); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + reduce_windows_finalize_kernel + <<), stream>>>( + d_partial_totals, d_partial_sums, d_window_results, num_windows, + num_buckets, reduce_bpw); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + finalize_msm_kernel<<<1, 1, 0, stream>>>( + d_window_results, num_windows, window_bits, d_out); + err = cudaGetLastError(); + if(err != cudaSuccess) goto done; + + err = cudaMemcpyAsync(out, d_out, output_words * sizeof(uint64_t), + cudaMemcpyDeviceToHost, stream); + if(err != cudaSuccess) goto done; + err = cudaStreamSynchronize(stream); + +done: + if(owned_d_points) cudaFree(owned_d_points); + if(d_scalars) cudaFree(d_scalars); + if(d_out) cudaFree(d_out); + if(d_keys_in) cudaFree(d_keys_in); + if(d_keys_out) cudaFree(d_keys_out); + if(d_vals_in) cudaFree(d_vals_in); + if(d_vals_out) cudaFree(d_vals_out); + if(d_bucket_offsets) cudaFree(d_bucket_offsets); + if(d_bucket_ends) cudaFree(d_bucket_ends); + if(d_overflow_buckets) cudaFree(d_overflow_buckets); + if(d_overflow_count) cudaFree(d_overflow_count); + if(d_buckets) cudaFree(d_buckets); + if(d_window_results) cudaFree(d_window_results); + if(d_partial_totals) cudaFree(d_partial_totals); + if(d_partial_sums) cudaFree(d_partial_sums); + if(d_sort_temp) cudaFree(d_sort_temp); + return err; +} + +template +cudaError_t run_msm_pippenger( + const uint64_t *points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + cudaStream_t stream) { + + return run_msm_pippenger_core( + points, false, scalars, count, window_bits, out, stream); +} + +template +cudaError_t run_msm_pippenger_device_points( + const uint64_t *d_points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + cudaStream_t stream) { + + return run_msm_pippenger_core( + d_points, true, scalars, count, window_bits, out, stream); +} + +template +cudaError_t sort_temp_bytes_for(size_t count, int window_bits, + size_t *temp_bytes) { + if(!temp_bytes || window_bits <= 1 || window_bits > 24 || count == 0) { + return cudaErrorInvalidValue; + } + const int num_windows = signed_window_count(Fr::BITS, window_bits); + const int num_buckets = 1 << (window_bits - 1); + const int total_buckets = num_windows * num_buckets; + const size_t assignments = count * static_cast(num_windows); + if(assignments > static_cast(std::numeric_limits::max())) { + return cudaErrorInvalidValue; + } + + uint32_t *keys_in = nullptr; + uint32_t *keys_out = nullptr; + uint32_t *vals_in = nullptr; + uint32_t *vals_out = nullptr; + return cub::DeviceRadixSort::SortPairs( + nullptr, *temp_bytes, keys_in, keys_out, vals_in, vals_out, + assignments, 0, sort_key_bits(total_buckets), nullptr); +} + +template +cudaError_t run_msm_pippenger_prealloc_core( + const uint64_t *d_points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + uint64_t *d_scalars, + uint64_t *d_out, + uint32_t *d_keys_in, + uint32_t *d_keys_out, + uint32_t *d_vals_in, + uint32_t *d_vals_out, + uint32_t *d_bucket_offsets, + uint32_t *d_bucket_ends, + uint32_t *d_overflow_buckets, + uint32_t *d_overflow_count, + void *d_buckets_raw, + void *d_window_results_raw, + void *d_partial_totals_raw, + void *d_partial_sums_raw, + void *d_sort_temp, + size_t sort_temp_bytes, + cudaEvent_t *phase_events, + float *phase_timings_ms, + cudaStream_t stream) { + + if(window_bits <= 1 || window_bits > 24) return cudaErrorInvalidValue; + if(count == 0 || count > static_cast(std::numeric_limits::max())) { + return cudaErrorInvalidValue; + } + + const int num_windows = signed_window_count(Fr::BITS, window_bits); + const int num_buckets = 1 << (window_bits - 1); + const int total_buckets = num_windows * num_buckets; + const int reduce_bpw = reduce_blocks_per_window(num_windows, num_buckets); + const int total_partials = num_windows * reduce_bpw; + const size_t assignments = count * static_cast(num_windows); + if(assignments > static_cast(std::numeric_limits::max())) { + return cudaErrorInvalidValue; + } + + if(!d_points || !scalars || !out || !d_scalars || !d_out || !d_keys_in || + !d_keys_out || !d_vals_in || !d_vals_out || !d_bucket_offsets || + !d_bucket_ends || !d_overflow_buckets || !d_overflow_count || + !d_buckets_raw || !d_window_results_raw || !d_partial_totals_raw || + !d_partial_sums_raw || !d_sort_temp) { + return cudaErrorInvalidValue; + } + if(sort_temp_bytes == 0) return cudaErrorInvalidValue; + + JacobianPoint *d_buckets = + reinterpret_cast *>(d_buckets_raw); + JacobianPoint *d_window_results = + reinterpret_cast *>(d_window_results_raw); + JacobianPoint *d_partial_totals = + reinterpret_cast *>(d_partial_totals_raw); + JacobianPoint *d_partial_sums = + reinterpret_cast *>(d_partial_sums_raw); + + const size_t scalar_words = count * Fr::LIMBS; + constexpr size_t output_words = 3 * Fp::LIMBS; + + auto record_phase = [&](int idx) { + if(phase_events) cudaEventRecord(phase_events[idx], stream); + }; + record_phase(0); + + cudaError_t err = cudaMemcpyAsync( + d_scalars, scalars, scalar_words * sizeof(uint64_t), + cudaMemcpyHostToDevice, stream); + if(err != cudaSuccess) return err; + record_phase(1); + + { + const unsigned blocks = + static_cast((count + MSM_THREADS - 1) / MSM_THREADS); + build_pairs_kernel<<>>( + d_scalars, d_keys_in, d_vals_in, count, window_bits, num_windows, + num_buckets, total_buckets); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + } + record_phase(2); + + err = cub::DeviceRadixSort::SortPairs( + d_sort_temp, sort_temp_bytes, d_keys_in, d_keys_out, d_vals_in, + d_vals_out, assignments, 0, sort_key_bits(total_buckets), stream); + if(err != cudaSuccess) return err; + record_phase(3); + + err = cudaMemsetAsync(d_bucket_offsets, 0, + total_buckets * sizeof(uint32_t), stream); + if(err != cudaSuccess) return err; + err = cudaMemsetAsync(d_bucket_ends, 0, + total_buckets * sizeof(uint32_t), stream); + if(err != cudaSuccess) return err; + + { + const unsigned blocks = static_cast( + (assignments + MSM_THREADS - 1) / MSM_THREADS); + detect_bucket_boundaries_kernel<<>>( + d_keys_out, d_bucket_offsets, d_bucket_ends, assignments, + total_buckets); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + } + record_phase(4); + + { + const int cap = accumulation_seq_cap(assignments, total_buckets); + const unsigned blocks = + static_cast((total_buckets + MSM_THREADS - 1) / + MSM_THREADS); + err = cudaMemsetAsync(d_overflow_count, 0, sizeof(uint32_t), stream); + if(err != cudaSuccess) return err; + accumulate_buckets_kernel<<>>( + d_points, d_vals_out, d_bucket_offsets, d_bucket_ends, d_buckets, + total_buckets, false, cap, d_overflow_buckets, d_overflow_count); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + } + record_phase(5); + + { + uint32_t overflow_count = 0; + err = cudaMemcpyAsync(&overflow_count, d_overflow_count, + sizeof(uint32_t), cudaMemcpyDeviceToHost, + stream); + if(err != cudaSuccess) return err; + err = cudaStreamSynchronize(stream); + if(err != cudaSuccess) return err; + if(overflow_count > 0) { + const int cap = accumulation_seq_cap(assignments, total_buckets); + const size_t smem = + ACCUM_PARALLEL_THREADS * sizeof(JacobianPoint); + accumulate_buckets_parallel_kernel + <<>>( + d_points, d_vals_out, d_bucket_offsets, d_bucket_ends, + d_overflow_buckets, d_buckets, true, + static_cast(cap)); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + } + } + record_phase(6); + + reduce_windows_partial_kernel + <<>>( + d_buckets, d_partial_totals, d_partial_sums, num_windows, + num_buckets, reduce_bpw); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + record_phase(7); + + reduce_windows_finalize_kernel + <<), stream>>>( + d_partial_totals, d_partial_sums, d_window_results, num_windows, + num_buckets, reduce_bpw); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + record_phase(8); + + finalize_msm_kernel<<<1, 1, 0, stream>>>( + d_window_results, num_windows, window_bits, d_out); + err = cudaGetLastError(); + if(err != cudaSuccess) return err; + + err = cudaMemcpyAsync(out, d_out, output_words * sizeof(uint64_t), + cudaMemcpyDeviceToHost, stream); + if(err != cudaSuccess) return err; + record_phase(9); + + err = cudaStreamSynchronize(stream); + if(err != cudaSuccess) return err; + + if(phase_events && phase_timings_ms) { + auto elapsed = [&](int from, int to) -> float { + float ms = 0.0f; + if(cudaEventElapsedTime(&ms, phase_events[from], phase_events[to]) + != cudaSuccess) { + cudaGetLastError(); + ms = 0.0f; + } + return ms; + }; + phase_timings_ms[0] = elapsed(0, 1); + phase_timings_ms[1] = elapsed(1, 2); + phase_timings_ms[2] = elapsed(2, 3); + phase_timings_ms[3] = elapsed(3, 4); + phase_timings_ms[4] = elapsed(4, 5); + phase_timings_ms[5] = elapsed(5, 6); + phase_timings_ms[6] = elapsed(6, 7); + phase_timings_ms[7] = elapsed(7, 8); + phase_timings_ms[8] = elapsed(8, 9); + } + return cudaSuccess; +} + +} // namespace + +cudaError_t msm_pippenger_sort_temp_bytes( + gnark_gpu_plonk2_curve_id_t curve, + size_t count, + int window_bits, + size_t *temp_bytes) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return sort_temp_bytes_for( + count, window_bits, temp_bytes); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return sort_temp_bytes_for( + count, window_bits, temp_bytes); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return sort_temp_bytes_for( + count, window_bits, temp_bytes); + default: + return cudaErrorInvalidValue; + } +} + +cudaError_t msm_pippenger_run( + gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return run_msm_pippenger( + points, scalars, count, window_bits, out, stream); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return run_msm_pippenger( + points, scalars, count, window_bits, out, stream); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return run_msm_pippenger( + points, scalars, count, window_bits, out, stream); + default: + return cudaErrorInvalidValue; + } +} + +cudaError_t msm_pippenger_device_points_run( + gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *d_points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return run_msm_pippenger_device_points( + d_points, scalars, count, window_bits, out, stream); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return run_msm_pippenger_device_points( + d_points, scalars, count, window_bits, out, stream); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return run_msm_pippenger_device_points( + d_points, scalars, count, window_bits, out, stream); + default: + return cudaErrorInvalidValue; + } +} + +cudaError_t msm_pippenger_device_points_prealloc_run( + gnark_gpu_plonk2_curve_id_t curve, + const uint64_t *d_points, + const uint64_t *scalars, + size_t count, + int window_bits, + uint64_t *out, + uint64_t *d_scalars, + uint64_t *d_out, + uint32_t *d_keys_in, + uint32_t *d_keys_out, + uint32_t *d_vals_in, + uint32_t *d_vals_out, + uint32_t *d_bucket_offsets, + uint32_t *d_bucket_ends, + uint32_t *d_overflow_buckets, + uint32_t *d_overflow_count, + void *d_buckets, + void *d_window_results, + void *d_partial_totals, + void *d_partial_sums, + void *d_sort_temp, + size_t sort_temp_bytes, + cudaEvent_t *phase_events, + float *phase_timings_ms, + cudaStream_t stream) { + + switch(curve) { + case GNARK_GPU_PLONK2_CURVE_BN254: + return run_msm_pippenger_prealloc_core( + d_points, scalars, count, window_bits, out, d_scalars, d_out, + d_keys_in, d_keys_out, d_vals_in, d_vals_out, d_bucket_offsets, + d_bucket_ends, d_overflow_buckets, d_overflow_count, d_buckets, + d_window_results, d_partial_totals, d_partial_sums, d_sort_temp, + sort_temp_bytes, phase_events, phase_timings_ms, stream); + case GNARK_GPU_PLONK2_CURVE_BLS12_377: + return run_msm_pippenger_prealloc_core( + d_points, scalars, count, window_bits, out, d_scalars, d_out, + d_keys_in, d_keys_out, d_vals_in, d_vals_out, d_bucket_offsets, + d_bucket_ends, d_overflow_buckets, d_overflow_count, d_buckets, + d_window_results, d_partial_totals, d_partial_sums, d_sort_temp, + sort_temp_bytes, phase_events, phase_timings_ms, stream); + case GNARK_GPU_PLONK2_CURVE_BW6_761: + return run_msm_pippenger_prealloc_core( + d_points, scalars, count, window_bits, out, d_scalars, d_out, + d_keys_in, d_keys_out, d_vals_in, d_vals_out, d_bucket_offsets, + d_bucket_ends, d_overflow_buckets, d_overflow_count, d_buckets, + d_window_results, d_partial_totals, d_partial_sums, d_sort_temp, + sort_temp_bytes, phase_events, phase_timings_ms, stream); + default: + return cudaErrorInvalidValue; + } +} + +} // namespace gnark_gpu::plonk2 diff --git a/prover/gpu/cuda/src/vortex/kb.cu b/prover/gpu/cuda/src/vortex/kb.cu new file mode 100644 index 00000000000..a9490bc88a1 --- /dev/null +++ b/prover/gpu/cuda/src/vortex/kb.cu @@ -0,0 +1,2424 @@ +// KoalaBear GPU kernels — vector ops, NTT, Poseidon2, SIS hash, Merkle, linear combination +// +// All operations on KoalaBear field P = 0x7f000001 (31-bit, 1-limb Montgomery). +// Vectors are flat uint32_t arrays; no SoA decomposition needed. +// +// ┌────────────────── Vortex commit pipeline (GPU) ──────────────────┐ +// │ │ +// │ stream_xfer stream_compute │ +// │ ─────────── ────────────── │ +// │ For each chunk (~32 rows): │ +// │ H2D chunk → d_work ──event──▶ scatter even cols → d_encoded_col │ +// │ iFFT(DIF) + scale + FFT(DIT) │ +// │ scatter odd cols → d_encoded_col │ +// │ │ +// │ After all chunks: │ +// │ kern_sis_hash (1 block/column, shared-mem NTT-512) │ +// │ kern_p2_sponge → d_leaves │ +// │ kern_merkle_level (bottom-up) → d_tree │ +// │ D2H tree → h_tree (host) │ +// └────────────────────────────────────────────────────────────────────┘ + +#include "kb_field.cuh" +#include +#include +#include +#include + +// ═════════════════════════════════════════════════════════════════════════════ +// Internal types +// ═════════════════════════════════════════════════════════════════════════════ + +struct KBVec { + uint32_t *d_data; + size_t n; +}; + +struct KBNtt { + uint32_t *d_fwd_tw; // n/2 forward twiddles + uint32_t *d_inv_tw; // n/2 inverse twiddles + size_t n; + int log_n; +}; + +struct KBPoseidon2 { + uint32_t *d_round_keys; // flat [nb_rounds × key_width] + uint32_t *d_diag; // internal MDS diagonal (width elements, Montgomery form) + int width; + int nb_full_rounds; + int nb_partial_rounds; +}; + +// Ring-SIS hash context: pre-NTT'd keys + NTT domain for degree-d cyclotomic ring +struct KBSis { + uint32_t *d_ag; // [n_polys × degree] pre-NTT'd keys (bit-reversed, coset domain) + uint32_t *d_fwd_tw; // [degree/2] forward twiddles (ωⁱ) + uint32_t *d_inv_tw; // [degree/2] inverse twiddles (ω⁻ⁱ) + uint32_t *d_coset_table; // [degree] shift^j (natural order) + uint32_t *d_coset_inv; // [degree] shift^{-j} · (1/degree) (natural order) + int degree; + int log_degree; + int n_polys; + int log_two_bound; // limb bit-width (e.g. 16) +}; + +// Pre-allocated device buffers for the full Vortex commit pipeline. +// Created once in pipeline_init, reused across Commit() calls. +// +// GPU RS encode (rate=2) eliminates CPU RS + halves H2D data: +// +// h_input (pinned) d_work d_encoded_col (col-major) +// [nR × nC] ─H2D─▶ [nR × nC] ──transpose──▶ even cols [0,2,4,..] +// host, row-major │ (original) +// ▼ iFFT(DIF, inv_tw) +// │ scale(cosetBR × cardInv) +// ▼ FFT(DIT, fwd_tw) +// └──transpose──▶ odd cols [1,3,5,..] +// +// d_encoded_col ──SIS──▶ d_sis ──sponge──▶ d_leaves ──merkle──▶ d_tree +// +// ┌───────────────────── memory layout ──────────────────────────┐ +// │ d_work [max_n_rows × n_cols] H2D + NTT workspace │ +// │ d_encoded_col [scw × max_n_rows] column-major matrix │ +// │ d_rs_fwd_tw [n_cols / 2] RS forward twiddles │ +// │ d_rs_inv_tw [n_cols / 2] RS inverse twiddles │ +// │ d_scaled_coset [n_cols] cosetBR × cardInv │ +// │ d_sis [scw × degree] SIS hash output │ +// │ d_leaves [scw × 8] Poseidon2 digests │ +// │ d_tree [2·np × 8] Merkle heap │ +// └──────────────────────────────────────────────────────────────┘ +struct KBVortexPipeline { + KBSis *sis; // not owned + KBPoseidon2 *p2_sponge; // not owned + KBPoseidon2 *p2_compress; // not owned + // RS encode buffers + uint32_t *d_work; // [max_n_rows × n_cols], H2D staging + NTT workspace + uint32_t *d_rs_fwd_tw; // [n_cols / 2] forward twiddles for RS domain + uint32_t *d_rs_inv_tw; // [n_cols / 2] inverse twiddles for RS domain + uint32_t *d_scaled_coset; // [n_cols] = CosetTableBitReverse × cardinalityInv (rate=2) + // Multi-coset RS encode (rate > 2) + uint32_t *d_coeffs; // [max_n_rows × n_cols], IFFT partial-state backup + uint32_t *d_coset_tables; // [(rate-1) × n_cols], coset scaling tables (bit-reversed) + // Encoded matrix + downstream + uint32_t *d_encoded_col; // [scw × max_n_rows], column-major + uint32_t *d_sis; // [scw × degree] + uint32_t *d_leaves; // [scw × 8] + uint32_t *d_tree; // [2 × tree_np × 8], heap layout + // Async extraction buffers (overlap D2H with SIS+P2+Merkle compute) + uint32_t *d_enc_rowmajor; // [max_n_rows × scw], GPU transpose buffer + uint32_t *h_enc_pinned; // [max_n_rows × scw], pinned host for encoded matrix + uint32_t *h_sis_pinned; // [scw × degree], pinned host for SIS hashes + uint32_t *h_leaves_pinned; // [scw × 8], pinned host for leaf hashes + cudaEvent_t ev_rs_done; // RS encoding complete → start D2H encoded + cudaEvent_t ev_sis_done; // SIS hashing complete → start D2H SIS + cudaEvent_t ev_p2_done; // P2 hashing complete → start D2H leaves + // Pinned host buffers + uint32_t *h_input; // [max_n_rows × n_cols], cudaMallocHost (pinned) + uint32_t *h_tree; // [(2·tree_np − 1) × 8], cudaMallocHost (pinned) + // Streams for pipelined H2D + compute overlap + cudaStream_t stream_xfer; // H2D transfers + async D2H extraction + cudaStream_t stream_compute; // RS encode + transpose + cudaEvent_t h2d_event; // signals H2D chunk completion + // Dimensions + size_t max_n_rows; + size_t n_cols; + size_t size_codeword; // n_cols × rate + size_t tree_np; + int rate; + int log_n_cols; + int degree; // cached from sis +}; + +// ═════════════════════════════════════════════════════════════════════════════ +// Helpers +// ═════════════════════════════════════════════════════════════════════════════ + +static constexpr int KB_BLOCK = 256; + +static inline int kb_grid(size_t n) { + return (int)((n + KB_BLOCK - 1) / KB_BLOCK); +} + +static inline int ilog2(size_t n) { + int r = 0; + while ((1ULL << r) < n) r++; + return r; +} + +static inline size_t next_pow2(size_t n) { + size_t v = 1; + while (v < n) v <<= 1; + return v; +} + +#define CUDA_CHECK(call) do { \ + cudaError_t err = (call); \ + if (err != cudaSuccess) return KB_ERROR_CUDA; \ +} while (0) + +// ═════════════════════════════════════════════════════════════════════════════ +// Vector element-wise kernels +// ═════════════════════════════════════════════════════════════════════════════ + +__global__ void kern_kb_add(uint32_t *c, const uint32_t *a, const uint32_t *b, size_t n) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) c[i] = kb_add(a[i], b[i]); +} + +__global__ void kern_kb_sub(uint32_t *c, const uint32_t *a, const uint32_t *b, size_t n) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) c[i] = kb_sub(a[i], b[i]); +} + +__global__ void kern_kb_mul(uint32_t *c, const uint32_t *a, const uint32_t *b, size_t n) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) c[i] = kb_mul(a[i], b[i]); +} + +__global__ void kern_kb_scale(uint32_t *v, uint32_t s, size_t n) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i < n) v[i] = kb_mul(v[i], s); +} + +// v[i] *= gⁱ — each thread computes gⁱ by repeated squaring +__global__ void kern_kb_scale_by_powers(uint32_t *v, uint32_t g, size_t n) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + uint32_t pow = KB_ONE; + uint32_t base = g; + size_t exp = i; + while (exp > 0) { + if (exp & 1) pow = kb_mul(pow, base); + base = kb_sqr(base); + exp >>= 1; + } + v[i] = kb_mul(v[i], pow); +} + +// Batch version of scale-by-powers: +// for each row r and index j, data[r][j] *= g^j +__global__ void kern_batch_scale_by_powers(uint32_t *data, uint32_t g, + size_t n_rows, size_t row_stride, + size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t total = n_rows * n; + if (idx >= total) return; + + size_t row = idx / n; + size_t j = idx % n; + uint32_t *rd = data + row * row_stride; + + uint32_t pow = KB_ONE; + uint32_t base = g; + size_t exp = j; + while (exp > 0) { + if (exp & 1) pow = kb_mul(pow, base); + base = kb_sqr(base); + exp >>= 1; + } + rd[j] = kb_mul(rd[j], pow); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// NTT kernels — DIF (forward) and DIT (inverse) +// ═════════════════════════════════════════════════════════════════════════════ +// +// Butterfly: +// DIF: a' = a + b, b' = (a − b) · ω +// DIT: a' = a + ω·b, b' = a − ω·b +// +// Twiddles: flat array tw[0..n/2), where tw[k] = ωᵏ in Montgomery form. +// Stage s: distance half = n >> (s+1), twiddle index = j << s. + +__global__ void kern_ntt_dif(uint32_t *data, const uint32_t *tw, + size_t n, int stage) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t half = n >> (stage + 1); + size_t pairs = n >> 1; + if (tid >= pairs) return; + + size_t group = tid / half; + size_t j = tid % half; + size_t ia = group * (2 * half) + j; + size_t ib = ia + half; + size_t tw_idx = j * (1u << stage); + + uint32_t a = data[ia]; + uint32_t b = data[ib]; + uint32_t w = tw[tw_idx]; + + data[ia] = kb_add(a, b); + data[ib] = kb_mul(kb_sub(a, b), w); +} + +__global__ void kern_ntt_dit(uint32_t *data, const uint32_t *tw, + size_t n, int stage) { + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + size_t half = n >> (stage + 1); + size_t pairs = n >> 1; + if (tid >= pairs) return; + + size_t group = tid / half; + size_t j = tid % half; + size_t ia = group * (2 * half) + j; + size_t ib = ia + half; + size_t tw_idx = j * (1u << stage); + + uint32_t a = data[ia]; + uint32_t wb = kb_mul(data[ib], tw[tw_idx]); + + data[ia] = kb_add(a, wb); + data[ib] = kb_sub(a, wb); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Batch NTT — process n_rows independent NTTs of size n in parallel +// ═════════════════════════════════════════════════════════════════════════════ +// +// One thread per (row, butterfly-pair). Row data at data[row * row_stride]. +// Launch grid: total = n_rows × (n/2), one stage per kernel launch. + +__global__ void kern_batch_ntt_dif(uint32_t *data, const uint32_t *tw, + size_t n, int stage, + size_t n_rows, size_t row_stride) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t pairs_per_row = n >> 1; + size_t total = n_rows * pairs_per_row; + if (idx >= total) return; + + size_t row = idx / pairs_per_row; + size_t pair = idx % pairs_per_row; + uint32_t *rd = data + row * row_stride; + + size_t half = n >> (stage + 1); + size_t group = pair / half; + size_t j = pair % half; + size_t ia = group * 2 * half + j; + size_t ib = ia + half; + size_t tw_idx = j * (1u << stage); + + uint32_t a = rd[ia], b = rd[ib], w = tw[tw_idx]; + rd[ia] = kb_add(a, b); + rd[ib] = kb_mul(kb_sub(a, b), w); +} + +__global__ void kern_batch_ntt_dit(uint32_t *data, const uint32_t *tw, + size_t n, int stage, + size_t n_rows, size_t row_stride) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t pairs_per_row = n >> 1; + size_t total = n_rows * pairs_per_row; + if (idx >= total) return; + + size_t row = idx / pairs_per_row; + size_t pair = idx % pairs_per_row; + uint32_t *rd = data + row * row_stride; + + size_t half = n >> (stage + 1); + size_t group = pair / half; + size_t j = pair % half; + size_t ia = group * 2 * half + j; + size_t ib = ia + half; + size_t tw_idx = j * (1u << stage); + + uint32_t a = rd[ia]; + uint32_t wb = kb_mul(rd[ib], tw[tw_idx]); + rd[ia] = kb_add(a, wb); + rd[ib] = kb_sub(a, wb); +} + +// Batch element-wise multiply each row by a shared vector: data[row][j] *= vec[j] +__global__ void kern_batch_mul_vec(uint32_t *data, const uint32_t *vec, + size_t n, size_t n_rows, size_t row_stride) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t total = n_rows * n; + if (idx >= total) return; + size_t row = idx / n; + size_t j = idx % n; + data[row * row_stride + j] = kb_mul(data[row * row_stride + j], vec[j]); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Fused batch NTT: DIF tail + scale + DIT head in shared memory +// ═════════════════════════════════════════════════════════════════════════════ +// +// After s_cut global DIF stages, data partitions into independent tiles of +// size T = 2^(log_n − s_cut). This kernel loads one tile, performs: +// +// ┌─── shared memory (T × 4 bytes) ───────────────────────────┐ +// │ Load from global │ +// │ DIF stages s_cut .. log_n−1 (log_n−s_cut stages) │ +// │ Scale × cosetBR·cardInv (element-wise) │ +// │ DIT stages log_n−1 .. s_cut (log_n−s_cut stages) │ +// │ Store to global │ +// └────────────────────────────────────────────────────────────┘ +// +// Replaces 2·(log_n−s_cut)+1 separate kernel launches with one load/store. +// Grid: (tiles_per_row × n_rows) blocks. Threads: min(T/2, 1024). + +__global__ void kern_batch_ntt_fused( + uint32_t *data, + const uint32_t *inv_tw, // DIF (inverse) twiddles [n/2] + const uint32_t *fwd_tw, // DIT (forward) twiddles [n/2] + const uint32_t *scale_vec, // cosetBR × cardInv [n] + int log_n, + int s_cut, // first local DIF stage + size_t n_rows, + size_t row_stride) +{ + extern __shared__ uint32_t tile[]; + + const int k = log_n - s_cut; + const int tile_size = 1 << k; + const int half_tile = tile_size >> 1; + + int tiles_per_row = 1 << s_cut; + size_t row = blockIdx.x / tiles_per_row; + int tidx = blockIdx.x % tiles_per_row; + if (row >= n_rows) return; + + uint32_t *rd = data + row * row_stride + tidx * tile_size; + const uint32_t *sv = scale_vec + tidx * tile_size; + int tid = threadIdx.x; + + // Load tile + for (int i = tid; i < tile_size; i += blockDim.x) + tile[i] = rd[i]; + __syncthreads(); + + // DIF local stages: s = s_cut, s_cut+1, ..., log_n−1 + for (int s = s_cut; s < log_n; s++) { + int half = tile_size >> (s - s_cut + 1); + for (int pid = tid; pid < half_tile; pid += blockDim.x) { + int g = pid / half; + int j = pid % half; + int ia = g * 2 * half + j; + int ib = ia + half; + uint32_t a = tile[ia], b = tile[ib], w = inv_tw[j << s]; + tile[ia] = kb_add(a, b); + tile[ib] = kb_mul(kb_sub(a, b), w); + } + __syncthreads(); + } + + // Scale + for (int i = tid; i < tile_size; i += blockDim.x) + tile[i] = kb_mul(tile[i], sv[i]); + __syncthreads(); + + // DIT local stages: s = log_n−1, log_n−2, ..., s_cut + for (int s = log_n - 1; s >= s_cut; s--) { + int half = tile_size >> (s - s_cut + 1); + for (int pid = tid; pid < half_tile; pid += blockDim.x) { + int g = pid / half; + int j = pid % half; + int ia = g * 2 * half + j; + int ib = ia + half; + uint32_t a = tile[ia]; + uint32_t wb = kb_mul(tile[ib], fwd_tw[j << s]); + tile[ia] = kb_add(a, wb); + tile[ib] = kb_sub(a, wb); + } + __syncthreads(); + } + + // Store tile + for (int i = tid; i < tile_size; i += blockDim.x) + rd[i] = tile[i]; +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Poseidon2 — device functions +// ═════════════════════════════════════════════════════════════════════════════ +// +// Width-16 (Merkle compression) and Width-24 (sponge hash). +// +// Round structure: +// matMulExternal(state) // initial +// for i in 0..rF/2: addRC → sBox_full → matMulExternal +// for i in 0..rP: addRC[0] → sBox(0) → matMulInternal +// for i in 0..rF/2: addRC → sBox_full → matMulExternal +// +// S-box: x³ + +// M4 = circ(2,3,1,1) via addition chain +__device__ __forceinline__ void p2_matmul_m4(uint32_t &s0, uint32_t &s1, + uint32_t &s2, uint32_t &s3) { + uint32_t t01 = kb_add(s0, s1); + uint32_t t23 = kb_add(s2, s3); + uint32_t t0123 = kb_add(t01, t23); + uint32_t t01123 = kb_add(t0123, s1); + uint32_t t01233 = kb_add(t0123, s3); + s3 = kb_add(kb_dbl(s0), t01233); + s1 = kb_add(kb_dbl(s2), t01123); + s0 = kb_add(t01, t01123); + s2 = kb_add(t23, t01233); +} + +// External MDS: circ(2M4, M4, .., M4) +__device__ __forceinline__ void p2_matmul_external(uint32_t *s, int width) { + for (int i = 0; i < width; i += 4) + p2_matmul_m4(s[i], s[i+1], s[i+2], s[i+3]); + uint32_t sum[4] = {0, 0, 0, 0}; + for (int i = 0; i < width; i += 4) { + sum[0] = kb_add(sum[0], s[i]); + sum[1] = kb_add(sum[1], s[i+1]); + sum[2] = kb_add(sum[2], s[i+2]); + sum[3] = kb_add(sum[3], s[i+3]); + } + for (int i = 0; i < width; i += 4) { + s[i] = kb_add(s[i], sum[0]); + s[i+1] = kb_add(s[i+1], sum[1]); + s[i+2] = kb_add(s[i+2], sum[2]); + s[i+3] = kb_add(s[i+3], sum[3]); + } +} + +// Internal MDS: state[i] = sum + dᵢ · state[i] +__device__ __forceinline__ void p2_matmul_internal(uint32_t *s, int width, + const uint32_t *diag) { + uint32_t sum = s[0]; + for (int i = 1; i < width; i++) sum = kb_add(sum, s[i]); + for (int i = 0; i < width; i++) { + s[i] = kb_add(sum, kb_mul(s[i], diag[i])); + } +} + +// S-box: x ↦ x³ +__device__ __forceinline__ uint32_t p2_sbox(uint32_t x) { + return kb_mul(kb_sqr(x), x); +} + +// Full Poseidon2 permutation +__device__ void p2_permutation(uint32_t *state, int width, + int rf, int rp, + const uint32_t *round_keys, + const uint32_t *diag) { + int half_rf = rf / 2; + int rk_off = 0; + + p2_matmul_external(state, width); + + for (int r = 0; r < half_rf; r++) { + for (int j = 0; j < width; j++) + state[j] = kb_add(state[j], round_keys[rk_off + j]); + rk_off += width; + for (int j = 0; j < width; j++) + state[j] = p2_sbox(state[j]); + p2_matmul_external(state, width); + } + + for (int r = 0; r < rp; r++) { + state[0] = kb_add(state[0], round_keys[rk_off]); + rk_off += 1; + state[0] = p2_sbox(state[0]); + p2_matmul_internal(state, width, diag); + } + + for (int r = 0; r < half_rf; r++) { + for (int j = 0; j < width; j++) + state[j] = kb_add(state[j], round_keys[rk_off + j]); + rk_off += width; + for (int j = 0; j < width; j++) + state[j] = p2_sbox(state[j]); + p2_matmul_external(state, width); + } +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Poseidon2 batch kernels +// ═════════════════════════════════════════════════════════════════════════════ + +// Batch compress (width=16): one thread per pair → hash +// Feed-forward: hash[j] = permuted_state[8+j] + right[j] +// +// Shared-mem diag: each block loads the 16-element diag vector into +// shared memory once, so the 21 × 2 = 42 partial-round reads per +// permutation hit shared memory (~zero latency) instead of L2/global. +__global__ void __launch_bounds__(KB_BLOCK, 4) +kern_p2_compress(const uint32_t *input, uint32_t *output, + const uint32_t *round_keys, + const uint32_t *diag, + size_t count) { + __shared__ uint32_t s_diag[16]; + if (threadIdx.x < 16) s_diag[threadIdx.x] = diag[threadIdx.x]; + __syncthreads(); + + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + + uint32_t state[16]; + const uint32_t *pair = input + tid * 16; + for (int j = 0; j < 16; j++) state[j] = pair[j]; + + uint32_t ff[8]; + for (int j = 0; j < 8; j++) ff[j] = state[8 + j]; + + p2_permutation(state, 16, 6, 21, round_keys, s_diag); + + uint32_t *out = output + tid * 8; + for (int j = 0; j < 8; j++) + out[j] = kb_add(state[8 + j], ff[j]); +} + +// Batch sponge (width=24): one thread per input → 8-element digest +// Absorb: overwrite state[8..23] with input block, permute. Squeeze: state[0..7]. +__global__ void __launch_bounds__(KB_BLOCK, 4) +kern_p2_sponge(const uint32_t *input, size_t input_len, + uint32_t *output, + const uint32_t *round_keys, + const uint32_t *diag, + size_t count) { + // width=24 sponge — diag has 24 elements; cache in shared mem. + __shared__ uint32_t s_diag[24]; + if (threadIdx.x < 24) s_diag[threadIdx.x] = diag[threadIdx.x]; + __syncthreads(); + + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + + const int rate = 16; + uint32_t state[24]; + for (int j = 0; j < 24; j++) state[j] = 0; + + const uint32_t *inp = input + tid * input_len; + + for (size_t off = 0; off < input_len; off += rate) { + size_t chunk = (off + rate <= input_len) ? rate : input_len - off; + for (size_t j = 0; j < chunk; j++) + state[8 + j] = inp[off + j]; + p2_permutation(state, 24, 6, 21, round_keys, s_diag); + } + + uint32_t *out = output + tid * 8; + for (int j = 0; j < 8; j++) out[j] = state[j]; +} + +// Batch Merkle-Damgard hash (width=16, rate=8): one thread per column → 8-element digest. +// Matches CPU CompressPoseidon2x16: iterative Davies-Meyer with width-16 Poseidon2. +// state[0..7] = running hash (capacity, initially zero) +// state[8..15] = message block (8 input elements per step) +// After permutation: state[j] = P(state)[8+j] + input[j] for j=0..7 +// Davies-Meyer SIS leaf-hash. Each thread processes one column's full +// SIS digest of length input_len = degree (typically 512). With 21 +// partial rounds × ⌈input_len/8⌉ permutations per thread × 16 diag +// reads per partial round, caching diag in shared mem saves a lot of +// L2 traffic. +__global__ void __launch_bounds__(KB_BLOCK, 4) +kern_p2_md_hash(const uint32_t *input, size_t input_len, + uint32_t *output, + const uint32_t *round_keys, + const uint32_t *diag, + size_t count) { + __shared__ uint32_t s_diag[16]; + if (threadIdx.x < 16) s_diag[threadIdx.x] = diag[threadIdx.x]; + __syncthreads(); + + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= count) return; + + const int rate = 8; + uint32_t state[16]; + for (int j = 0; j < 16; j++) state[j] = 0; + + const uint32_t *inp = input + tid * input_len; + + for (size_t off = 0; off < input_len; off += rate) { + // Load message block into state[8..15] + for (int j = 0; j < rate; j++) + state[8 + j] = (off + j < input_len) ? inp[off + j] : 0; + + p2_permutation(state, 16, 6, 21, round_keys, s_diag); + + // Davies-Meyer feed-forward: state[j] = P(state)[8+j] + input[j] + for (int j = 0; j < 8; j++) { + uint32_t m = (off + j < input_len) ? inp[off + j] : 0; + state[j] = kb_add(state[8 + j], m); + } + } + + uint32_t *out = output + tid * 8; + for (int j = 0; j < 8; j++) out[j] = state[j]; +} + +// ═════════════════════════════════════════════════════════════════════════════ +// SIS hash kernel +// ═════════════════════════════════════════════════════════════════════════════ +// +// Ring-SIS: H(column) = IFFT_coset( Σᵢ FFT_coset(limbs_i) ⊙ Ag[i] ) +// +// One thread-block per column. 256 threads cooperate on NTT-512 in shared memory. +// Shared memory: 2 × degree uint32 (work buffer + accumulator). +// +// Limb decomposition (logTwoBound=16): +// element (Montgomery) → canonical (kb_from_mont) → [lo16, hi16] (raw, not Montgomery) + +__global__ void kern_sis_hash( + const uint32_t *d_encoded, // [sizeCodeWord × nRows], column-major, Montgomery + int n_rows, + int size_codeword, // unused (kept for ABI compat) + const uint32_t *d_ag, // [nPolys × degree], pre-NTT'd keys (bit-reversed) + int n_polys, + int degree, + int log_degree, + const uint32_t *d_fwd_tw, // [degree/2] forward twiddles + const uint32_t *d_inv_tw, // [degree/2] inverse twiddles + const uint32_t *d_coset_table, // [degree] shift^j, natural order + const uint32_t *d_coset_inv, // [degree] shift^{-j} · (1/degree), natural order + uint32_t *d_sis_out) // [sizeCodeWord × degree] +{ + int col = blockIdx.x; + int tid = threadIdx.x; + int bdim = blockDim.x; + + extern __shared__ uint32_t shared[]; + uint32_t *work = shared; // [degree] + uint32_t *acc = shared + degree; // [degree] + + // Zero accumulator + for (int j = tid; j < degree; j += bdim) acc[j] = 0; + __syncthreads(); + + for (int poly = 0; poly < n_polys; poly++) { + // ── 1. Extract limbs → work[0..degree-1] ──────────────────────── + // Each element gives 2 limbs (16-bit). limb_idx = poly*degree + j. + int limb_base = poly * degree; + for (int j = tid; j < degree; j += bdim) { + int limb_idx = limb_base + j; + int elem_idx = limb_idx >> 1; + int limb_half = limb_idx & 1; + + uint32_t val = 0; + if (elem_idx < n_rows) { + // Column-major: d_encoded[col * n_rows + row]. Coalesced when + // consecutive threads read consecutive rows (elem_idx). + // gnark-crypto extracts LE uint16 limbs from canonical form. + uint32_t canonical = kb_from_mont(d_encoded[(size_t)col * n_rows + elem_idx]); + val = limb_half == 0 ? (canonical & 0xFFFFu) : (canonical >> 16); + } + work[j] = val; + } + __syncthreads(); + + // ── 2. Coset shift: work[j] *= shift^j ────────────────────────── + for (int j = tid; j < degree; j += bdim) + work[j] = kb_mul(work[j], d_coset_table[j]); + __syncthreads(); + + // ── 3. Forward DIF NTT in shared memory ───────────────────────── + for (int s = 0; s < log_degree; s++) { + int half = degree >> (s + 1); + int pairs = degree >> 1; + for (int t = tid; t < pairs; t += bdim) { + int group = t / half; + int jj = t % half; + int ia = group * 2 * half + jj; + int ib = ia + half; + int tw_idx = jj * (1 << s); + + uint32_t a = work[ia], b = work[ib]; + uint32_t w = d_fwd_tw[tw_idx]; + work[ia] = kb_add(a, b); + work[ib] = kb_mul(kb_sub(a, b), w); + } + __syncthreads(); + } + + // ── 4. Pointwise mul by Ag[poly] + accumulate ─────────────────── + const uint32_t *ag_poly = d_ag + poly * degree; + for (int j = tid; j < degree; j += bdim) + acc[j] = kb_add(acc[j], kb_mul(work[j], ag_poly[j])); + __syncthreads(); + } + + // ── 5. Inverse DIT NTT on accumulator ──────────────────────────────── + for (int j = tid; j < degree; j += bdim) work[j] = acc[j]; + __syncthreads(); + + for (int s = log_degree - 1; s >= 0; s--) { + int half = degree >> (s + 1); + int pairs = degree >> 1; + for (int t = tid; t < pairs; t += bdim) { + int group = t / half; + int jj = t % half; + int ia = group * 2 * half + jj; + int ib = ia + half; + int tw_idx = jj * (1 << s); + + uint32_t a = work[ia]; + uint32_t wb = kb_mul(work[ib], d_inv_tw[tw_idx]); + work[ia] = kb_add(a, wb); + work[ib] = kb_sub(a, wb); + } + __syncthreads(); + } + + // ── 6. Inverse coset shift + scale by 1/n ─────────────────────────── + for (int j = tid; j < degree; j += bdim) + d_sis_out[col * degree + j] = kb_mul(work[j], d_coset_inv[j]); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Scatter-transpose kernel — row-major → column-major with optional stride +// ═════════════════════════════════════════════════════════════════════════════ +// +// Transposes rows into column-major output. Supports column stride/offset +// for rate-2 interleaving: col_stride=2, col_offset=0 writes even columns, +// col_stride=2, col_offset=1 writes odd columns. +// +// Uses shared-memory tiled transpose (32×32, +1 pad) for coalesced R+W. +// +// dst[(src_col * col_stride + col_offset) * total_rows + row] = src[row * n_src_cols + src_col] + +__global__ void kern_scatter_transpose(const uint32_t * __restrict__ src, + uint32_t * __restrict__ dst, + size_t n_rows, size_t n_src_cols, + size_t total_rows, + int col_stride, int col_offset) { + __shared__ uint32_t tile[32][33]; + unsigned bx = blockIdx.x * 32; // source-column tile + unsigned by = blockIdx.y * 32; // row tile + + // Load tile: coalesced read from row-major src + for (int i = 0; i < 32; i += 8) { + unsigned r = by + threadIdx.y + i; + unsigned c = bx + threadIdx.x; + if (r < n_rows && c < n_src_cols) + tile[threadIdx.y + i][threadIdx.x] = src[(size_t)r * n_src_cols + c]; + } + __syncthreads(); + + // Store tile: coalesced write to column-major dst + for (int i = 0; i < 32; i += 8) { + unsigned local_row = by + threadIdx.x; + unsigned src_col = bx + threadIdx.y + i; + if (local_row < n_rows && src_col < n_src_cols) { + size_t dst_col = (size_t)src_col * col_stride + col_offset; + dst[dst_col * total_rows + local_row] = + tile[threadIdx.x][threadIdx.y + i]; + } + } +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Merkle tree kernel +// ═════════════════════════════════════════════════════════════════════════════ +// +// Bottom-up: hash pairs of sibling hashes using Poseidon2 compression (width=16). +// Heap layout: tree[1]=root, tree[2i]=left child, tree[2i+1]=right child. + +// Merkle tree node hash: hashLR(left, right) using Merkle-Damgard Poseidon2. +// +// Matches the CPU poseidon2_koalabear.MDHasher which processes 16 input +// elements in TWO 8-element blocks with zero initial state: +// Block 1 (left): state = CompressPoseidon2([0,...,0], left) +// Block 2 (right): state = CompressPoseidon2(state, right) +// Merkle tree compression: hash(left, right) using Poseidon2 MD hash (width=16). +// Matches smt_koalabear.hashLR which calls MDHasher on left[8]||right[8]: +// h1 = CompressPoseidon2(zero, left) +// h2 = CompressPoseidon2(h1, right) +// output = h2 +// Merkle tree level compress: each thread compresses one (left, right) +// pair into one parent hash via two Poseidon2 permutations + Davies- +// Meyer feed-forward. +// +// Shared-mem diag avoids 2 × 21 × 16 = 672 L2 reads per thread +// (every partial round's matmul_internal reads the full diag vector). +__global__ void __launch_bounds__(KB_BLOCK, 4) +kern_merkle_level(const uint32_t *children, uint32_t *parents, + const uint32_t *round_keys, + const uint32_t *diag, + size_t n_pairs) { + __shared__ uint32_t s_diag[16]; + if (threadIdx.x < 16) s_diag[threadIdx.x] = diag[threadIdx.x]; + __syncthreads(); + + size_t tid = blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n_pairs) return; + + uint32_t state[16]; + // Initialize state to zero (MD IV) + for (int j = 0; j < 16; j++) state[j] = 0; + + // ── Block 1: left child ────────────────────────────────────────────── + for (int j = 0; j < 8; j++) + state[8 + j] = children[(2 * tid) * 8 + j]; + + uint32_t ff[8]; + for (int j = 0; j < 8; j++) ff[j] = state[8 + j]; + + p2_permutation(state, 16, 6, 21, round_keys, s_diag); + + // Feed-forward → new capacity (state[0:8]) + for (int j = 0; j < 8; j++) + state[j] = kb_add(state[8 + j], ff[j]); + + // ── Block 2: right child ───────────────────────────────────────────── + for (int j = 0; j < 8; j++) + state[8 + j] = children[(2 * tid + 1) * 8 + j]; + + for (int j = 0; j < 8; j++) ff[j] = state[8 + j]; + + p2_permutation(state, 16, 6, 21, round_keys, s_diag); + + // Feed-forward → output hash + uint32_t *out = parents + tid * 8; + for (int j = 0; j < 8; j++) + out[j] = kb_add(state[8 + j], ff[j]); +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Linear combination: UAlpha[j] = Σᵢ αⁱ · row[i][j] (result ∈ E4) +// ═════════════════════════════════════════════════════════════════════════════ + +__global__ void kern_lincomb_e4(const uint32_t * const *rows, + size_t n_rows, size_t n_cols, + E4 alpha, + uint32_t *result) { + size_t j = blockIdx.x * blockDim.x + threadIdx.x; + if (j >= n_cols) return; + + E4 acc = e4_zero(); + E4 alpha_pow = {{KB_ONE, 0}, {0, 0}}; + + for (size_t i = 0; i < n_rows; i++) { + uint32_t val = rows[i][j]; + e4_mulacc(acc, val, alpha_pow); + alpha_pow = e4_mul(alpha_pow, alpha); + } + + result[j * 4 + 0] = acc.b0.a0; + result[j * 4 + 1] = acc.b0.a1; + result[j * 4 + 2] = acc.b1.a0; + result[j * 4 + 3] = acc.b1.a1; +} + +// Linear combination on column-major encoded matrix (for GPU Prove). +// UAlpha[j] = Σᵢ αⁱ · d_encoded_col[j * n_rows + i], result ∈ E4^scw +__global__ void kern_lincomb_e4_colmajor(const uint32_t *d_encoded_col, + size_t n_rows, size_t scw, + E4 alpha, uint32_t *result) { + size_t j = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + if (j >= scw) return; + + const uint32_t *col = d_encoded_col + j * n_rows; + E4 acc = e4_zero(); + E4 alpha_pow = {{KB_ONE, 0}, {0, 0}}; + + for (size_t i = 0; i < n_rows; i++) { + e4_mulacc(acc, col[i], alpha_pow); + alpha_pow = e4_mul(alpha_pow, alpha); + } + + result[j * 4 + 0] = acc.b0.a0; + result[j * 4 + 1] = acc.b0.a1; + result[j * 4 + 2] = acc.b1.a0; + result[j * 4 + 3] = acc.b1.a1; +} + +// ═════════════════════════════════════════════════════════════════════════════ +// C ABI implementations +// ═════════════════════════════════════════════════════════════════════════════ + +#include "gnark_gpu_kb.h" + +// ── Vector lifecycle ──────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_vec_alloc(gnark_gpu_context_t, size_t n, kb_vec_t *out) { + auto *v = new(std::nothrow) KBVec; + if (!v) return KB_ERROR_OOM; + cudaError_t err = cudaMalloc(&v->d_data, n * sizeof(uint32_t)); + if (err != cudaSuccess) { delete v; return KB_ERROR_CUDA; } + v->n = n; + *out = v; + return KB_SUCCESS; +} + +extern "C" void kb_vec_free(kb_vec_t v) { + if (v) { cudaFree(v->d_data); delete v; } +} + +extern "C" size_t kb_vec_len(kb_vec_t v) { + return v ? v->n : 0; +} + +extern "C" kb_error_t kb_vec_h2d(gnark_gpu_context_t, kb_vec_t dst, + const uint32_t *src, size_t n) { + if (!dst || n > dst->n) return KB_ERROR_INVALID; + CUDA_CHECK(cudaMemcpy(dst->d_data, src, n * sizeof(uint32_t), cudaMemcpyHostToDevice)); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_d2h(gnark_gpu_context_t, uint32_t *dst, + kb_vec_t src, size_t n) { + if (!src || n > src->n) return KB_ERROR_INVALID; + CUDA_CHECK(cudaMemcpy(dst, src->d_data, n * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_d2d(gnark_gpu_context_t, kb_vec_t dst, kb_vec_t src) { + if (!dst || !src || dst->n != src->n) return KB_ERROR_INVALID; + CUDA_CHECK(cudaMemcpyAsync(dst->d_data, src->d_data, src->n * sizeof(uint32_t), cudaMemcpyDeviceToDevice, 0)); + return KB_SUCCESS; +} + +// D2D copy with raw pointers (async on default stream). +extern "C" kb_error_t kb_vec_d2d_offset(gnark_gpu_context_t, uint32_t *dst, + const uint32_t *src, size_t n) { + CUDA_CHECK(cudaMemcpyAsync(dst, src, n * sizeof(uint32_t), cudaMemcpyDeviceToDevice, 0)); + return KB_SUCCESS; +} + +// D2H with raw pointers. +extern "C" kb_error_t kb_vec_d2h_raw(gnark_gpu_context_t, uint32_t *dst, + const uint32_t *src, size_t n) { + CUDA_CHECK(cudaMemcpy(dst, src, n * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + return KB_SUCCESS; +} + +// Synchronize the default CUDA stream (wait for all queued ops to complete). +extern "C" kb_error_t kb_sync(gnark_gpu_context_t) { + CUDA_CHECK(cudaStreamSynchronize(0)); + return KB_SUCCESS; +} + +// Bulk H2D from pre-pinned host memory (cudaMallocHost). +// src must point into a buffer allocated by kb_pinned_alloc. +extern "C" kb_error_t kb_vec_h2d_pinned(gnark_gpu_context_t, kb_vec_t dst, + const uint32_t *src, size_t n) { + if (!dst || n > dst->n) return KB_ERROR_INVALID; + size_t bytes = n * sizeof(uint32_t); + CUDA_CHECK(cudaMemcpyAsync(dst->d_data, src, bytes, cudaMemcpyHostToDevice, 0)); + CUDA_CHECK(cudaStreamSynchronize(0)); + return KB_SUCCESS; +} + +// Allocate page-locked host memory for fast H2D. +extern "C" kb_error_t kb_pinned_alloc(size_t bytes, uint32_t **out) { + CUDA_CHECK(cudaMallocHost(out, bytes)); + return KB_SUCCESS; +} + +extern "C" void kb_pinned_free(uint32_t *ptr) { + if (ptr) cudaFreeHost(ptr); +} + +// ── Vector arithmetic ─────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_vec_add(gnark_gpu_context_t, kb_vec_t c, kb_vec_t a, kb_vec_t b) { + if (!c || !a || !b) return KB_ERROR_INVALID; + size_t n = c->n; + kern_kb_add<<>>(c->d_data, a->d_data, b->d_data, n); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_sub(gnark_gpu_context_t, kb_vec_t c, kb_vec_t a, kb_vec_t b) { + if (!c || !a || !b) return KB_ERROR_INVALID; + size_t n = c->n; + kern_kb_sub<<>>(c->d_data, a->d_data, b->d_data, n); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_mul(gnark_gpu_context_t, kb_vec_t c, kb_vec_t a, kb_vec_t b) { + if (!c || !a || !b) return KB_ERROR_INVALID; + size_t n = c->n; + kern_kb_mul<<>>(c->d_data, a->d_data, b->d_data, n); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_scale(gnark_gpu_context_t, kb_vec_t v, uint32_t scalar) { + if (!v) return KB_ERROR_INVALID; + kern_kb_scale<<n), KB_BLOCK>>>(v->d_data, scalar, v->n); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_scale_by_powers(gnark_gpu_context_t, kb_vec_t v, uint32_t g) { + if (!v) return KB_ERROR_INVALID; + kern_kb_scale_by_powers<<n), KB_BLOCK>>>(v->d_data, g, v->n); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_batch_invert(gnark_gpu_context_t, kb_vec_t, kb_vec_t) { + return KB_ERROR_INVALID; // TODO +} + +// ── NTT ───────────────────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_ntt_init(gnark_gpu_context_t, size_t n, + const uint32_t *fwd_tw, + const uint32_t *inv_tw, + kb_ntt_t *out) { + if (!fwd_tw || !inv_tw || n == 0 || (n & (n-1)) != 0) return KB_ERROR_INVALID; + auto *d = new(std::nothrow) KBNtt; + if (!d) return KB_ERROR_OOM; + + size_t half = n / 2; + size_t bytes = half * sizeof(uint32_t); + if (cudaMalloc(&d->d_fwd_tw, bytes) != cudaSuccess || + cudaMalloc(&d->d_inv_tw, bytes) != cudaSuccess) { + delete d; + return KB_ERROR_CUDA; + } + cudaMemcpy(d->d_fwd_tw, fwd_tw, bytes, cudaMemcpyHostToDevice); + cudaMemcpy(d->d_inv_tw, inv_tw, bytes, cudaMemcpyHostToDevice); + d->n = n; + d->log_n = ilog2(n); + *out = d; + return KB_SUCCESS; +} + +extern "C" void kb_ntt_free(kb_ntt_t d) { + if (d) { + cudaFree(d->d_fwd_tw); + cudaFree(d->d_inv_tw); + delete d; + } +} + +// Bit-reversal permutation: data[i] ↔ data[bitrev(i, log_n)]. +// In-place, handles n elements. Only swaps when bitrev(i) > i to avoid double-swap. +__global__ void kern_bitrev(uint32_t *data, int log_n, size_t n) { + size_t i = blockIdx.x * blockDim.x + threadIdx.x; + if (i >= n) return; + size_t j = __brev((unsigned int)i) >> (32 - log_n); + if (j > i) { + uint32_t tmp = data[i]; + data[i] = data[j]; + data[j] = tmp; + } +} + +// Bit-reversal for a batch of row-major vectors. +__global__ void kern_batch_bitrev(uint32_t *data, int log_n, + size_t n_rows, size_t row_stride, size_t n) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t total = n_rows * n; + if (idx >= total) return; + + size_t row = idx / n; + size_t i = idx % n; + size_t j = __brev((unsigned int)i) >> (32 - log_n); + if (j > i) { + uint32_t *rd = data + row * row_stride; + uint32_t tmp = rd[i]; + rd[i] = rd[j]; + rd[j] = tmp; + } +} + +extern "C" kb_error_t kb_vec_bitrev(gnark_gpu_context_t, kb_vec_t v) { + if (!v) return KB_ERROR_INVALID; + int log_n = __builtin_ctz((unsigned int)v->n); + kern_bitrev<<n), KB_BLOCK>>>(v->d_data, log_n, v->n); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_ntt_fwd(gnark_gpu_context_t, kb_ntt_t d, kb_vec_t v) { + if (!d || !v || v->n != d->n) return KB_ERROR_INVALID; + size_t pairs = d->n >> 1; + for (int s = 0; s < d->log_n; s++) + kern_ntt_dif<<>>(v->d_data, d->d_fwd_tw, d->n, s); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_ntt_inv(gnark_gpu_context_t, kb_ntt_t d, kb_vec_t v) { + if (!d || !v || v->n != d->n) return KB_ERROR_INVALID; + size_t pairs = d->n >> 1; + for (int s = d->log_n - 1; s >= 0; s--) + kern_ntt_dit<<>>(v->d_data, d->d_inv_tw, d->n, s); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_ntt_coset_fwd(gnark_gpu_context_t ctx, kb_ntt_t d, + kb_vec_t v, uint32_t g) { + if (!d || !v || v->n != d->n) return KB_ERROR_INVALID; + kern_kb_scale_by_powers<<n), KB_BLOCK>>>(v->d_data, g, v->n); + return kb_ntt_fwd(ctx, d, v); +} + +// ── Batch NTT (operates on packed contiguous vectors) ──────────────────────── + +// Batch coset forward NTT: for each of `batch` vectors of size `n`, +// apply scale-by-powers(g) then DIF NTT then bit-reversal. +// data layout: [vec0[0..n-1], vec1[0..n-1], ..., vec_{batch-1}[0..n-1]] +// Single C call for all vectors → avoids per-vector CGO overhead. +extern "C" kb_error_t kb_ntt_batch_coset_fwd_bitrev( + gnark_gpu_context_t ctx, kb_ntt_t d, + uint32_t *data, size_t n, size_t batch, uint32_t g) +{ + if (!d || !data || n != d->n || batch == 0) return KB_ERROR_INVALID; + int log_n = d->log_n; + size_t pairs = n >> 1; + size_t total_pairs = batch * pairs; + + // Scale all vectors by the same coset power table in one launch. + kern_batch_scale_by_powers<<>>(data, g, batch, n, n); + + // DIF stages over all rows. + for (int s = 0; s < log_n; s++) + kern_batch_ntt_dif<<>>(data, d->d_fwd_tw, n, s, batch, n); + + // Natural-order output. + kern_batch_bitrev<<>>(data, log_n, batch, n, n); + return KB_SUCCESS; +} + +// Batch IFFT + scale(1/n): for each of `batch` vectors, +// apply bit-reversal then DIT inverse NTT then scale by nInv. +extern "C" kb_error_t kb_ntt_batch_ifft_scale( + gnark_gpu_context_t ctx, kb_ntt_t d, + uint32_t *data, size_t n, size_t batch, uint32_t nInv) +{ + if (!d || !data || n != d->n || batch == 0) return KB_ERROR_INVALID; + int log_n = d->log_n; + size_t pairs = n >> 1; + size_t total_pairs = batch * pairs; + size_t total = batch * n; + + // Natural-order evaluations -> bit-reversed order for DIT inverse path. + kern_batch_bitrev<<>>(data, log_n, batch, n, n); + + // DIT inverse stages over all rows. + for (int s = log_n - 1; s >= 0; s--) + kern_batch_ntt_dit<<>>(data, d->d_inv_tw, n, s, batch, n); + + // Global scaling by 1/n. + kern_kb_scale<<>>(data, nInv, total); + return KB_SUCCESS; +} + +// ── Raw pointer variants (for selective per-root operations) ───────────────── + +extern "C" kb_error_t kb_ntt_coset_fwd_raw(gnark_gpu_context_t ctx, kb_ntt_t d, + uint32_t *data, uint32_t g) { + if (!d) return KB_ERROR_INVALID; + size_t n = d->n; + kern_kb_scale_by_powers<<>>(data, g, n); + size_t pairs = n >> 1; + for (int s = 0; s < d->log_n; s++) + kern_ntt_dif<<>>(data, d->d_fwd_tw, n, s); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_vec_bitrev_raw(gnark_gpu_context_t, uint32_t *data, size_t n) { + int log_n = __builtin_ctz((unsigned int)n); + kern_bitrev<<>>(data, log_n, n); + return KB_SUCCESS; +} + +// ── Poseidon2 ─────────────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_p2_init(gnark_gpu_context_t, int width, + int nb_full_rounds, int nb_partial_rounds, + const uint32_t *round_keys, + const uint32_t *diag, + kb_p2_t *out) { + if ((width != 16 && width != 24) || !round_keys || !diag) return KB_ERROR_INVALID; + auto *p = new(std::nothrow) KBPoseidon2; + if (!p) return KB_ERROR_OOM; + + // Round keys + int half_rf = nb_full_rounds / 2; + size_t nkeys = (size_t)half_rf * width + nb_partial_rounds + (size_t)half_rf * width; + size_t rk_bytes = nkeys * sizeof(uint32_t); + + if (cudaMalloc(&p->d_round_keys, rk_bytes) != cudaSuccess) { + delete p; + return KB_ERROR_CUDA; + } + cudaMemcpy(p->d_round_keys, round_keys, rk_bytes, cudaMemcpyHostToDevice); + + // Diagonal + size_t diag_bytes = width * sizeof(uint32_t); + if (cudaMalloc(&p->d_diag, diag_bytes) != cudaSuccess) { + cudaFree(p->d_round_keys); + delete p; + return KB_ERROR_CUDA; + } + cudaMemcpy(p->d_diag, diag, diag_bytes, cudaMemcpyHostToDevice); + + p->width = width; + p->nb_full_rounds = nb_full_rounds; + p->nb_partial_rounds = nb_partial_rounds; + *out = p; + return KB_SUCCESS; +} + +extern "C" void kb_p2_free(kb_p2_t p) { + if (p) { + cudaFree(p->d_round_keys); + cudaFree(p->d_diag); + delete p; + } +} + +extern "C" kb_error_t kb_p2_compress_batch(gnark_gpu_context_t, kb_p2_t p, + const uint32_t *input, uint32_t *output, + size_t count) { + if (!p || p->width != 16) return KB_ERROR_INVALID; + + uint32_t *d_in, *d_out; + CUDA_CHECK(cudaMalloc(&d_in, count * 16 * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&d_out, count * 8 * sizeof(uint32_t))); + CUDA_CHECK(cudaMemcpy(d_in, input, count * 16 * sizeof(uint32_t), cudaMemcpyHostToDevice)); + + kern_p2_compress<<>>(d_in, d_out, p->d_round_keys, p->d_diag, count); + + CUDA_CHECK(cudaMemcpy(output, d_out, count * 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaFree(d_in); + cudaFree(d_out); + return KB_SUCCESS; +} + +extern "C" kb_error_t kb_p2_sponge_batch(gnark_gpu_context_t, kb_p2_t p, + const uint32_t *input, size_t input_len, + uint32_t *output, size_t count) { + if (!p || p->width != 24) return KB_ERROR_INVALID; + + uint32_t *d_in, *d_out; + CUDA_CHECK(cudaMalloc(&d_in, count * input_len * sizeof(uint32_t))); + CUDA_CHECK(cudaMalloc(&d_out, count * 8 * sizeof(uint32_t))); + CUDA_CHECK(cudaMemcpy(d_in, input, count * input_len * sizeof(uint32_t), cudaMemcpyHostToDevice)); + + kern_p2_sponge<<>>(d_in, input_len, d_out, + p->d_round_keys, p->d_diag, count); + + CUDA_CHECK(cudaMemcpy(output, d_out, count * 8 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaFree(d_in); + cudaFree(d_out); + return KB_SUCCESS; +} + +// ── SIS ───────────────────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_sis_init(gnark_gpu_context_t, + int degree, int n_polys, int log_two_bound, + const uint32_t *ag, + const uint32_t *fwd_tw, + const uint32_t *inv_tw, + const uint32_t *coset_table, + const uint32_t *coset_inv, + kb_sis_t *out) { + if (degree <= 0 || (degree & (degree-1)) != 0) return KB_ERROR_INVALID; + if (!ag || !fwd_tw || !inv_tw || !coset_table || !coset_inv) return KB_ERROR_INVALID; + + auto *s = new(std::nothrow) KBSis; + if (!s) return KB_ERROR_OOM; + + size_t deg_bytes = degree * sizeof(uint32_t); + size_t half_bytes = (degree / 2) * sizeof(uint32_t); + size_t ag_bytes = (size_t)n_polys * deg_bytes; + + #define SIS_ALLOC(ptr, sz) if (cudaMalloc(&ptr, sz) != cudaSuccess) { delete s; return KB_ERROR_CUDA; } + SIS_ALLOC(s->d_ag, ag_bytes); + SIS_ALLOC(s->d_fwd_tw, half_bytes); + SIS_ALLOC(s->d_inv_tw, half_bytes); + SIS_ALLOC(s->d_coset_table, deg_bytes); + SIS_ALLOC(s->d_coset_inv, deg_bytes); + #undef SIS_ALLOC + + cudaMemcpy(s->d_ag, ag, ag_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(s->d_fwd_tw, fwd_tw, half_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(s->d_inv_tw, inv_tw, half_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(s->d_coset_table, coset_table, deg_bytes, cudaMemcpyHostToDevice); + cudaMemcpy(s->d_coset_inv, coset_inv, deg_bytes, cudaMemcpyHostToDevice); + + s->degree = degree; + s->log_degree = ilog2(degree); + s->n_polys = n_polys; + s->log_two_bound = log_two_bound; + *out = s; + return KB_SUCCESS; +} + +extern "C" void kb_sis_free(kb_sis_t s) { + if (s) { + cudaFree(s->d_ag); + cudaFree(s->d_fwd_tw); + cudaFree(s->d_inv_tw); + cudaFree(s->d_coset_table); + cudaFree(s->d_coset_inv); + delete s; + } +} + +// ── Vortex pipeline (pre-allocated device buffers) ─────────────────────────── + +static bool kb_timing_enabled() { + static int val = -1; + if (val < 0) val = getenv("KB_VORTEX_TIMING") ? 1 : 0; + return val == 1; +} + +extern "C" void kb_vortex_pipeline_free(kb_vortex_pipeline_t p) { + if (!p) return; + cudaFree(p->d_work); + cudaFree(p->d_rs_fwd_tw); + cudaFree(p->d_rs_inv_tw); + cudaFree(p->d_scaled_coset); + cudaFree(p->d_coeffs); + cudaFree(p->d_coset_tables); + cudaFree(p->d_encoded_col); + cudaFree(p->d_sis); + cudaFree(p->d_leaves); + cudaFree(p->d_tree); + // Async extraction buffers + cudaFree(p->d_enc_rowmajor); + cudaFreeHost(p->h_enc_pinned); + cudaFreeHost(p->h_sis_pinned); + cudaFreeHost(p->h_leaves_pinned); + if (p->ev_rs_done) cudaEventDestroy(p->ev_rs_done); + if (p->ev_sis_done) cudaEventDestroy(p->ev_sis_done); + if (p->ev_p2_done) cudaEventDestroy(p->ev_p2_done); + // Original buffers + cudaFreeHost(p->h_input); + cudaFreeHost(p->h_tree); + if (p->stream_xfer) cudaStreamDestroy(p->stream_xfer); + if (p->stream_compute) cudaStreamDestroy(p->stream_compute); + if (p->h2d_event) cudaEventDestroy(p->h2d_event); + delete p; +} + +extern "C" kb_error_t kb_vortex_pipeline_init(gnark_gpu_context_t, + kb_sis_t sis, + kb_p2_t p2_sponge, + kb_p2_t p2_compress, + size_t max_n_rows, + size_t n_cols, + int rate, + const uint32_t *rs_fwd_tw, + const uint32_t *rs_inv_tw, + const uint32_t *scaled_coset_br, + kb_vortex_pipeline_t *out) { + if (!sis || !p2_sponge || !p2_compress || !max_n_rows || !n_cols) + return KB_ERROR_INVALID; + if (!rs_fwd_tw || !rs_inv_tw || !scaled_coset_br) + return KB_ERROR_INVALID; + + auto *p = new(std::nothrow) KBVortexPipeline{}; + if (!p) return KB_ERROR_OOM; + + p->sis = sis; + p->p2_sponge = p2_sponge; + p->p2_compress = p2_compress; + p->max_n_rows = max_n_rows; + p->n_cols = n_cols; + p->rate = rate; + p->size_codeword = n_cols * rate; + p->tree_np = next_pow2(p->size_codeword); + p->log_n_cols = ilog2(n_cols); + p->degree = sis->degree; + + size_t scw = p->size_codeword; + int degree = p->degree; + size_t np = p->tree_np; + size_t half_n = n_cols / 2; + + // Device buffers + #define PIPE_ALLOC(ptr, nbytes) \ + if (cudaMalloc(&(ptr), (nbytes)) != cudaSuccess) { \ + kb_vortex_pipeline_free(p); return KB_ERROR_CUDA; } + PIPE_ALLOC(p->d_work, max_n_rows * n_cols * sizeof(uint32_t)); + PIPE_ALLOC(p->d_rs_fwd_tw, half_n * sizeof(uint32_t)); + PIPE_ALLOC(p->d_rs_inv_tw, half_n * sizeof(uint32_t)); + PIPE_ALLOC(p->d_scaled_coset, n_cols * sizeof(uint32_t)); + PIPE_ALLOC(p->d_encoded_col, scw * max_n_rows * sizeof(uint32_t)); + PIPE_ALLOC(p->d_sis, scw * degree * sizeof(uint32_t)); + PIPE_ALLOC(p->d_leaves, scw * 8 * sizeof(uint32_t)); + PIPE_ALLOC(p->d_tree, 2 * np * 8 * sizeof(uint32_t)); + // d_enc_rowmajor: lazy-allocated on first use (commit_and_extract / extract_all_rowmajor). + // Not needed by the CommitDirect + SnapshotEncoded path, saving ~scw*maxRows*4 bytes. + p->d_enc_rowmajor = nullptr; + #undef PIPE_ALLOC + + // Upload RS domain data (one-time) + cudaMemcpy(p->d_rs_fwd_tw, rs_fwd_tw, half_n * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(p->d_rs_inv_tw, rs_inv_tw, half_n * sizeof(uint32_t), cudaMemcpyHostToDevice); + cudaMemcpy(p->d_scaled_coset, scaled_coset_br, n_cols * sizeof(uint32_t), cudaMemcpyHostToDevice); + + // Pinned host buffers + #define PIN_ALLOC(ptr, nbytes) \ + if (cudaMallocHost(&(ptr), (nbytes)) != cudaSuccess) { \ + kb_vortex_pipeline_free(p); return KB_ERROR_CUDA; } + PIN_ALLOC(p->h_input, max_n_rows * n_cols * sizeof(uint32_t)); + PIN_ALLOC(p->h_tree, (2 * np - 1) * 8 * sizeof(uint32_t)); + // h_sis_pinned: always allocated — used by commit's overlapped SIS D2H. + PIN_ALLOC(p->h_sis_pinned, scw * degree * sizeof(uint32_t)); + // h_enc_pinned, h_leaves_pinned: lazy (only for commit_and_extract path). + p->h_enc_pinned = nullptr; + p->h_leaves_pinned = nullptr; + #undef PIN_ALLOC + + // Streams for pipelined H2D + compute + CUDA_CHECK(cudaStreamCreate(&p->stream_xfer)); + CUDA_CHECK(cudaStreamCreate(&p->stream_compute)); + CUDA_CHECK(cudaEventCreateWithFlags(&p->h2d_event, cudaEventDisableTiming)); + // Events for async extraction overlap + CUDA_CHECK(cudaEventCreateWithFlags(&p->ev_rs_done, cudaEventDisableTiming)); + CUDA_CHECK(cudaEventCreateWithFlags(&p->ev_sis_done, cudaEventDisableTiming)); + CUDA_CHECK(cudaEventCreateWithFlags(&p->ev_p2_done, cudaEventDisableTiming)); + + *out = p; + return KB_SUCCESS; +} + +// Accessors for pinned host buffers (Go wraps these as zero-copy slices) +extern "C" uint32_t *kb_vortex_pipeline_input_buf(kb_vortex_pipeline_t p) { return p ? p->h_input : nullptr; } +extern "C" uint32_t *kb_vortex_pipeline_tree_buf(kb_vortex_pipeline_t p) { return p ? p->h_tree : nullptr; } + +// Set coset scaling tables for rate > 2 RS encoding (multi-coset NTT). +// coset_tables: [(rate-1) × n_cols] flat array, table k at offset k*n_cols. +// Each table: coset_k_br[j] = (Ω^k)^{bitrev(j)} / n (bit-reversed, normalized). +extern "C" kb_error_t kb_vortex_pipeline_set_coset_tables( + kb_vortex_pipeline_t p, + const uint32_t *coset_tables, + size_t n_tables) { + if (!p || !coset_tables) return KB_ERROR_INVALID; + if ((int)n_tables != p->rate - 1) return KB_ERROR_SIZE; + + size_t nc = p->n_cols; + size_t table_bytes = n_tables * nc * sizeof(uint32_t); + size_t coeffs_bytes = p->max_n_rows * nc * sizeof(uint32_t); + + // Allocate coset tables on device + if (!p->d_coset_tables) { + if (cudaMalloc(&p->d_coset_tables, table_bytes) != cudaSuccess) + return KB_ERROR_CUDA; + } + cudaMemcpy(p->d_coset_tables, coset_tables, table_bytes, cudaMemcpyHostToDevice); + + // Allocate coefficients backup buffer + if (!p->d_coeffs) { + if (cudaMalloc(&p->d_coeffs, coeffs_bytes) != cudaSuccess) + return KB_ERROR_CUDA; + } + + return KB_SUCCESS; +} + +// ── Merkle tree ───────────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_merkle_build(gnark_gpu_context_t, kb_p2_t p, + const uint32_t *leaves, size_t n_leaves, + uint32_t *tree_buf) { + if (!p || p->width != 16) return KB_ERROR_INVALID; + + size_t np = next_pow2(n_leaves); + size_t total_nodes = 2 * np; + size_t hash_bytes = 8 * sizeof(uint32_t); + + uint32_t *d_tree; + CUDA_CHECK(cudaMalloc(&d_tree, total_nodes * hash_bytes)); + CUDA_CHECK(cudaMemset(d_tree, 0, total_nodes * hash_bytes)); + + // Copy leaves into bottom level (indices np .. np+n_leaves-1) + CUDA_CHECK(cudaMemcpy(d_tree + np * 8, leaves, n_leaves * hash_bytes, cudaMemcpyHostToDevice)); + + // Build bottom-up + for (size_t level_size = np; level_size > 1; level_size >>= 1) { + size_t parent_start = level_size / 2; + size_t n_pairs = level_size / 2; + kern_merkle_level<<>>( + d_tree + level_size * 8, + d_tree + parent_start * 8, + p->d_round_keys, p->d_diag, + n_pairs); + } + + // Copy tree back (skip index 0; root at index 1) + CUDA_CHECK(cudaMemcpy(tree_buf, d_tree + 8, (total_nodes - 1) * hash_bytes, cudaMemcpyDeviceToHost)); + cudaFree(d_tree); + return KB_SUCCESS; +} + +// ── Linear combination ────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_lincomb_e4(gnark_gpu_context_t, + kb_vec_t *rows, size_t n_rows, size_t n_cols, + const uint32_t alpha_raw[4], uint32_t *result) { + if (!rows || n_rows == 0 || n_cols == 0) return KB_ERROR_INVALID; + + const uint32_t **h_ptrs = new const uint32_t*[n_rows]; + for (size_t i = 0; i < n_rows; i++) { + if (!rows[i]) { delete[] h_ptrs; return KB_ERROR_INVALID; } + h_ptrs[i] = rows[i]->d_data; + } + + const uint32_t **d_ptrs; + CUDA_CHECK(cudaMalloc(&d_ptrs, n_rows * sizeof(uint32_t*))); + CUDA_CHECK(cudaMemcpy((void*)d_ptrs, h_ptrs, n_rows * sizeof(uint32_t*), cudaMemcpyHostToDevice)); + delete[] h_ptrs; + + uint32_t *d_result; + CUDA_CHECK(cudaMalloc(&d_result, n_cols * 4 * sizeof(uint32_t))); + + E4 alpha = {{alpha_raw[0], alpha_raw[1]}, {alpha_raw[2], alpha_raw[3]}}; + kern_lincomb_e4<<>>(d_ptrs, n_rows, n_cols, alpha, d_result); + + CUDA_CHECK(cudaMemcpy(result, d_result, n_cols * 4 * sizeof(uint32_t), cudaMemcpyDeviceToHost)); + cudaFree((void*)d_ptrs); + cudaFree(d_result); + return KB_SUCCESS; +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Vortex commit pipeline — GPU RS encode + SIS + Merkle +// ═════════════════════════════════════════════════════════════════════════════ +// +// RS encode (rate=2) runs entirely on GPU via batch NTT, eliminating CPU RS +// and halving H2D data volume (only raw rows, not encoded codewords). +// +// raw_rows [nR × nC] d_encoded_col [scw × nR] +// host, pinned ──H2D──▶ d_work +// │ scatter_transpose → even cols +// │ iFFT_DIF(inv_tw) + scale(cosetBR·cardInv) + FFT_DIT(fwd_tw) +// └─scatter_transpose → odd cols +// +// d_encoded_col ──SIS──▶ d_sis ──sponge──▶ d_leaves ──merkle──▶ d_tree +// +// Set KB_VORTEX_TIMING=1 for per-phase CUDA event timing on stderr. + +// Helper: RS encode chunk_rows rows starting at d_chunk in d_work, +// scatter even/odd columns into d_encoded_col offset by row_off. +static void rs_encode_chunk(KBVortexPipeline *p, uint32_t *d_chunk, + size_t chunk_rows, size_t total_rows, + size_t row_off, cudaStream_t s) { + size_t nc = p->n_cols; + int log_nc = p->log_n_cols; + + // Transpose original → even columns (d_encoded_col + row_off) + { + dim3 blk(32, 8); + dim3 grd(((unsigned)nc + 31) / 32, ((unsigned)chunk_rows + 31) / 32); + kern_scatter_transpose<<>>( + d_chunk, p->d_encoded_col + row_off, + chunk_rows, nc, total_rows, 2, 0); + } + + // RS encode (fused NTT: global DIF → fused tile → global DIT) + { + size_t pairs_per_row = nc >> 1; + size_t total_pairs = chunk_rows * pairs_per_row; + int grid1d = kb_grid(total_pairs); + + int tile_log = log_nc < 13 ? log_nc : 13; + int s_cut = log_nc - tile_log; + + for (int st = 0; st < s_cut; st++) + kern_batch_ntt_dif<<>>( + d_chunk, p->d_rs_inv_tw, nc, st, chunk_rows, nc); + + { + int tile_size = 1 << tile_log; + int tiles_per_row = (int)(nc >> tile_log); + int n_blocks = (int)((size_t)tiles_per_row * chunk_rows); + int threads = tile_size >> 1; + if (threads > 1024) threads = 1024; + size_t fused_smem = tile_size * sizeof(uint32_t); + + kern_batch_ntt_fused<<>>( + d_chunk, p->d_rs_inv_tw, p->d_rs_fwd_tw, p->d_scaled_coset, + log_nc, s_cut, chunk_rows, nc); + } + + for (int st = s_cut - 1; st >= 0; st--) + kern_batch_ntt_dit<<>>( + d_chunk, p->d_rs_fwd_tw, nc, st, chunk_rows, nc); + } + + // Transpose FFT result → odd columns + { + dim3 blk(32, 8); + dim3 grd(((unsigned)nc + 31) / 32, ((unsigned)chunk_rows + 31) / 32); + kern_scatter_transpose<<>>( + d_chunk, p->d_encoded_col + row_off, + chunk_rows, nc, total_rows, 2, 1); + } +} + +// RS encode for rate > 2 via multi-coset NTT. +// +// For rate ρ, the codeword evaluates f on ρ cosets of the small domain: +// coset k = Ω^k · {ω⁰, ω¹, ..., ω^{n-1}}, k = 0..ρ-1 +// +// Algorithm per chunk: +// 1. Scatter original values → d_encoded_col at stride=ρ, offset=0 +// 2. Partial IFFT (s_cut global DIF stages) on d_chunk +// 3. Save partial state → d_coeffs +// 4. For k = 1..ρ-1: +// a. Restore d_coeffs → d_chunk +// b. Fused IFFT tail + scale(coset_k_br) + FFT head +// c. Global DIT stages +// d. Scatter → d_encoded_col at stride=ρ, offset=k +static void rs_encode_chunk_general(KBVortexPipeline *p, uint32_t *d_chunk, + size_t chunk_rows, size_t total_rows, + size_t row_off, cudaStream_t s) { + size_t nc = p->n_cols; + int log_nc = p->log_n_cols; + int rate = p->rate; + + // 1. Scatter original values → coset 0 (stride=ρ, offset=0) + { + dim3 blk(32, 8); + dim3 grd(((unsigned)nc + 31) / 32, ((unsigned)chunk_rows + 31) / 32); + kern_scatter_transpose<<>>( + d_chunk, p->d_encoded_col + row_off, + chunk_rows, nc, total_rows, rate, 0); + } + + // 2. Partial IFFT: global DIF stages [0..s_cut) + size_t pairs_per_row = nc >> 1; + size_t total_pairs = chunk_rows * pairs_per_row; + int grid1d = kb_grid(total_pairs); + + int tile_log = log_nc < 13 ? log_nc : 13; + int s_cut = log_nc - tile_log; + + for (int st = 0; st < s_cut; st++) + kern_batch_ntt_dif<<>>( + d_chunk, p->d_rs_inv_tw, nc, st, chunk_rows, nc); + + // 3. Save partial IFFT state + cudaMemcpyAsync(p->d_coeffs, d_chunk, + chunk_rows * nc * sizeof(uint32_t), + cudaMemcpyDeviceToDevice, s); + + // 4. For each coset k = 1..rate-1 + int tile_size = 1 << tile_log; + int tiles_per_row = (int)(nc >> tile_log); + int n_blocks = (int)((size_t)tiles_per_row * chunk_rows); + int threads = tile_size >> 1; + if (threads > 1024) threads = 1024; + size_t fused_smem = tile_size * sizeof(uint32_t); + + for (int k = 1; k < rate; k++) { + // 4a. Restore partial state + cudaMemcpyAsync(d_chunk, p->d_coeffs, + chunk_rows * nc * sizeof(uint32_t), + cudaMemcpyDeviceToDevice, s); + + // 4b. Fused: complete IFFT + scale by coset_k + start FFT + const uint32_t *coset_k = p->d_coset_tables + (size_t)(k - 1) * nc; + kern_batch_ntt_fused<<>>( + d_chunk, p->d_rs_inv_tw, p->d_rs_fwd_tw, coset_k, + log_nc, s_cut, chunk_rows, nc); + + // 4c. Complete FFT: global DIT stages [s_cut-1..0] + for (int st = s_cut - 1; st >= 0; st--) + kern_batch_ntt_dit<<>>( + d_chunk, p->d_rs_fwd_tw, nc, st, chunk_rows, nc); + + // 4d. Scatter → d_encoded_col at stride=ρ, offset=k + dim3 blk(32, 8); + dim3 grd(((unsigned)nc + 31) / 32, ((unsigned)chunk_rows + 31) / 32); + kern_scatter_transpose<<>>( + d_chunk, p->d_encoded_col + row_off, + chunk_rows, nc, total_rows, rate, k); + } +} + +// Forward declaration for transpose kernel (defined later with kb_vortex_extract_all_rowmajor). +__global__ void kern_transpose_col_to_row(const uint32_t *__restrict__ col_major, + uint32_t *__restrict__ row_major, + size_t n_rows, size_t scw); + +extern "C" kb_error_t kb_vortex_commit(kb_vortex_pipeline_t p, + const uint32_t *raw_rows, + size_t n_rows) { + if (!p || !raw_rows) return KB_ERROR_INVALID; + if (n_rows > p->max_n_rows) return KB_ERROR_SIZE; + + size_t nc = p->n_cols; + size_t scw = p->size_codeword; + int degree = p->degree; + size_t np = p->tree_np; + + cudaStream_t sx = p->stream_xfer; + cudaStream_t sc = p->stream_compute; + + // ── Optional CUDA event timing ─────────────────────────────────────── + cudaEvent_t t[6]; + bool timing = kb_timing_enabled(); + if (timing) { + for (int i = 0; i < 6; i++) cudaEventCreate(&t[i]); + cudaEventRecord(t[0], sc); + } + + // ── 1-4. Pipelined H2D + RS encode (2-chunk overlap) ──────────────── + // + // stream_xfer: [H2D chunk0]──event──[H2D chunk1]──event── + // stream_compute: [RS encode chunk0] [RS encode chunk1] + // + // Chunk 0 occupies d_work[0 .. c0*nc), chunk 1 occupies d_work[c0*nc ..]. + // Row offset into d_encoded_col ensures correct global row positions. + { + // Chunk rows for H2D overlap + L2 cache locality. + // Sweet spot: ~32 rows/chunk at nc=2^19 (64 MB, fits 96 MB L2). + int N_CHUNKS = (int)n_rows / 32; + if (N_CHUNKS < 1) N_CHUNKS = 1; + size_t chunk_size = n_rows / N_CHUNKS; + for (int k = 0; k < N_CHUNKS; k++) { + size_t c_rows = (k < N_CHUNKS - 1) ? chunk_size : n_rows - k * chunk_size; + size_t row_off = k * chunk_size; + uint32_t *d_chunk = p->d_work + row_off * nc; + + // Async H2D on transfer stream + CUDA_CHECK(cudaMemcpyAsync(d_chunk, raw_rows + row_off * nc, + c_rows * nc * sizeof(uint32_t), + cudaMemcpyHostToDevice, sx)); + cudaEventRecord(p->h2d_event, sx); + + // Compute stream waits for this chunk's H2D + cudaStreamWaitEvent(sc, p->h2d_event); + + // RS encode this chunk on compute stream + if (p->rate == 2) + rs_encode_chunk(p, d_chunk, c_rows, n_rows, row_off, sc); + else + rs_encode_chunk_general(p, d_chunk, c_rows, n_rows, row_off, sc); + } + } + if (timing) cudaEventRecord(t[1], sc); + + // ── 5. SIS hash (all columns, needs full d_encoded_col) ────────────── + size_t smem = 2 * degree * sizeof(uint32_t); + kern_sis_hash<<<(int)scw, KB_BLOCK, smem, sc>>>( + p->d_encoded_col, (int)n_rows, (int)scw, + p->sis->d_ag, p->sis->n_polys, degree, p->sis->log_degree, + p->sis->d_fwd_tw, p->sis->d_inv_tw, + p->sis->d_coset_table, p->sis->d_coset_inv, + p->d_sis); + if (timing) cudaEventRecord(t[2], sc); + + // ── 6. Poseidon2 MD hash (width=16): SIS hashes → leaf hashes ───── + // Matches CPU CompressPoseidon2x16: iterative Davies-Meyer, width=16. + kern_p2_md_hash<<>>( + p->d_sis, (size_t)degree, p->d_leaves, + p->p2_compress->d_round_keys, p->p2_compress->d_diag, scw); + if (timing) cudaEventRecord(t[3], sc); + + // ── 7. Overlap: SIS D2H on transfer stream while Merkle builds on compute ─ + // After P2 hash, d_sis is no longer read — safe to D2H on sx. + cudaEventRecord(p->h2d_event, sc); // mark P2 hash completion + cudaStreamWaitEvent(sx, p->h2d_event); + CUDA_CHECK(cudaMemcpyAsync(p->h_sis_pinned, p->d_sis, + scw * degree * sizeof(uint32_t), + cudaMemcpyDeviceToHost, sx)); + + // ── 8. Merkle tree (bottom-up Poseidon2 compression) on compute stream ─ + CUDA_CHECK(cudaMemsetAsync(p->d_tree, 0, 2 * np * 8 * sizeof(uint32_t), sc)); + CUDA_CHECK(cudaMemcpyAsync(p->d_tree + np * 8, p->d_leaves, + scw * 8 * sizeof(uint32_t), + cudaMemcpyDeviceToDevice, sc)); + for (size_t level = np; level > 1; level >>= 1) { + size_t n_pairs = level / 2; + kern_merkle_level<<>>( + p->d_tree + level * 8, + p->d_tree + (level / 2) * 8, + p->p2_compress->d_round_keys, p->p2_compress->d_diag, n_pairs); + } + if (timing) cudaEventRecord(t[4], sc); + + // ── 9. D2H tree → pinned host buffer ──────────────────────────────── + CUDA_CHECK(cudaMemcpyAsync(p->h_tree, p->d_tree + 8, + (2 * np - 1) * 8 * sizeof(uint32_t), + cudaMemcpyDeviceToHost, sc)); + CUDA_CHECK(cudaStreamSynchronize(sc)); + CUDA_CHECK(cudaStreamSynchronize(sx)); // ensure SIS D2H is done too + if (timing) cudaEventRecord(t[5], sc); + + // ── Print timing ───────────────────────────────────────────────────── + if (timing) { + cudaDeviceSynchronize(); + static const char *labels[] = { + "H2D+RS encode", "SIS hash", "sponge", "merkle", "D2H tree" + }; + fprintf(stderr, "vortex_commit (n_rows=%zu, nc=%zu, scw=%zu):\n", + n_rows, nc, scw); + for (int i = 0; i < 5; i++) { + float ms; + cudaEventElapsedTime(&ms, t[i], t[i + 1]); + fprintf(stderr, " %-16s %8.2f ms\n", labels[i], ms); + } + float total; + cudaEventElapsedTime(&total, t[0], t[5]); + fprintf(stderr, " %-16s %8.2f ms\n", "TOTAL", total); + for (int i = 0; i < 6; i++) cudaEventDestroy(t[i]); + } + + return KB_SUCCESS; +} + +// ═════════════════════════════════════════════════════════════════════════════ +// Async commit + extract: overlaps D2H with SIS/P2/Merkle compute +// ═════════════════════════════════════════════════════════════════════════════ +// +// Timeline: +// stream_compute: [H2D+RS encode] → ev_rs_done → [SIS] → ev_sis_done → [P2] → ev_p2_done → [Merkle] → [D2H tree] +// stream_xfer: wait(ev_rs) → [transpose] → [D2H enc] +// wait(ev_sis) → [D2H sis] +// wait(ev_p2) → [D2H leaves] +// +// After both streams sync, pinned host buffers contain: +// h_enc_pinned: row-major encoded matrix [n_rows × scw] +// h_sis_pinned: flat SIS hashes [scw × degree] +// h_leaves_pinned: leaf hashes [scw × 8] +// h_tree: Merkle tree (heap layout) + +extern "C" kb_error_t kb_vortex_commit_and_extract(kb_vortex_pipeline_t p, + const uint32_t *raw_rows, + size_t n_rows) { + if (!p || !raw_rows) return KB_ERROR_INVALID; + if (n_rows > p->max_n_rows) return KB_ERROR_SIZE; + + size_t nc = p->n_cols; + size_t scw = p->size_codeword; + int degree = p->degree; + size_t np = p->tree_np; + + // Lazy-allocate buffers only needed by commit_and_extract path + if (!p->d_enc_rowmajor) { + if (cudaMalloc(&p->d_enc_rowmajor, scw * p->max_n_rows * sizeof(uint32_t)) != cudaSuccess) + return KB_ERROR_CUDA; + } + if (!p->h_enc_pinned) { + if (cudaMallocHost(&p->h_enc_pinned, scw * p->max_n_rows * sizeof(uint32_t)) != cudaSuccess) + return KB_ERROR_CUDA; + } + if (!p->h_sis_pinned) { + if (cudaMallocHost(&p->h_sis_pinned, scw * degree * sizeof(uint32_t)) != cudaSuccess) + return KB_ERROR_CUDA; + } + if (!p->h_leaves_pinned) { + if (cudaMallocHost(&p->h_leaves_pinned, scw * 8 * sizeof(uint32_t)) != cudaSuccess) + return KB_ERROR_CUDA; + } + + cudaStream_t sx = p->stream_xfer; + cudaStream_t sc = p->stream_compute; + + bool timing = (getenv("KB_VORTEX_TIMING") && atoi(getenv("KB_VORTEX_TIMING"))); + cudaEvent_t t0, t1, t2, t3, t4, t5; + if (timing) { + cudaEventCreate(&t0); cudaEventCreate(&t1); cudaEventCreate(&t2); + cudaEventCreate(&t3); cudaEventCreate(&t4); cudaEventCreate(&t5); + CUDA_CHECK(cudaDeviceSynchronize()); + cudaEventRecord(t0, sc); + } + + // ── 1-4. Pipelined H2D + RS encode (same as kb_vortex_commit) ──────── + { + int N_CHUNKS = (int)n_rows / 32; + if (N_CHUNKS < 1) N_CHUNKS = 1; + size_t chunk_size = n_rows / N_CHUNKS; + for (int k = 0; k < N_CHUNKS; k++) { + size_t c_rows = (k < N_CHUNKS - 1) ? chunk_size : n_rows - k * chunk_size; + size_t row_off = k * chunk_size; + uint32_t *d_chunk = p->d_work + row_off * nc; + + CUDA_CHECK(cudaMemcpyAsync(d_chunk, raw_rows + row_off * nc, + c_rows * nc * sizeof(uint32_t), + cudaMemcpyHostToDevice, sx)); + cudaEventRecord(p->h2d_event, sx); + cudaStreamWaitEvent(sc, p->h2d_event); + + if (p->rate == 2) + rs_encode_chunk(p, d_chunk, c_rows, n_rows, row_off, sc); + else + rs_encode_chunk_general(p, d_chunk, c_rows, n_rows, row_off, sc); + } + } + + // ── RS encoding complete: signal xfer stream to start D2H ──────────── + if (timing) cudaEventRecord(t1, sc); + cudaEventRecord(p->ev_rs_done, sc); + + // ── stream_xfer: transpose + async D2H of encoded matrix ───────────── + cudaStreamWaitEvent(sx, p->ev_rs_done); + size_t enc_total = scw * n_rows; + kern_transpose_col_to_row<<>>( + p->d_encoded_col, p->d_enc_rowmajor, n_rows, scw); + CUDA_CHECK(cudaMemcpyAsync(p->h_enc_pinned, p->d_enc_rowmajor, + enc_total * sizeof(uint32_t), + cudaMemcpyDeviceToHost, sx)); + + // ── stream_compute: SIS hash ───────────────────────────────────────── + if (timing) cudaEventRecord(t2, sc); + size_t smem = 2 * degree * sizeof(uint32_t); + kern_sis_hash<<<(int)scw, KB_BLOCK, smem, sc>>>( + p->d_encoded_col, (int)n_rows, (int)scw, + p->sis->d_ag, p->sis->n_polys, degree, p->sis->log_degree, + p->sis->d_fwd_tw, p->sis->d_inv_tw, + p->sis->d_coset_table, p->sis->d_coset_inv, + p->d_sis); + cudaEventRecord(p->ev_sis_done, sc); + + // ── stream_xfer: async D2H of SIS hashes (after SIS compute done) ─── + cudaStreamWaitEvent(sx, p->ev_sis_done); + CUDA_CHECK(cudaMemcpyAsync(p->h_sis_pinned, p->d_sis, + scw * degree * sizeof(uint32_t), + cudaMemcpyDeviceToHost, sx)); + + // ── stream_compute: Poseidon2 MD hash (width=16) ─────────────────── + if (timing) cudaEventRecord(t3, sc); + kern_p2_md_hash<<>>( + p->d_sis, (size_t)degree, p->d_leaves, + p->p2_compress->d_round_keys, p->p2_compress->d_diag, scw); + cudaEventRecord(p->ev_p2_done, sc); + + // ── stream_xfer: async D2H of leaves (after P2 compute done) ──────── + cudaStreamWaitEvent(sx, p->ev_p2_done); + CUDA_CHECK(cudaMemcpyAsync(p->h_leaves_pinned, p->d_leaves, + scw * 8 * sizeof(uint32_t), + cudaMemcpyDeviceToHost, sx)); + + // ── stream_compute: Merkle tree + D2H tree ────────────────────────── + if (timing) cudaEventRecord(t4, sc); + CUDA_CHECK(cudaMemsetAsync(p->d_tree, 0, 2 * np * 8 * sizeof(uint32_t), sc)); + CUDA_CHECK(cudaMemcpyAsync(p->d_tree + np * 8, p->d_leaves, + scw * 8 * sizeof(uint32_t), + cudaMemcpyDeviceToDevice, sc)); + for (size_t level = np; level > 1; level >>= 1) { + size_t n_pairs = level / 2; + kern_merkle_level<<>>( + p->d_tree + level * 8, + p->d_tree + (level / 2) * 8, + p->p2_compress->d_round_keys, p->p2_compress->d_diag, n_pairs); + } + CUDA_CHECK(cudaMemcpyAsync(p->h_tree, p->d_tree + 8, + (2 * np - 1) * 8 * sizeof(uint32_t), + cudaMemcpyDeviceToHost, sc)); + + // ── Sync both streams ──────────────────────────────────────────────── + CUDA_CHECK(cudaStreamSynchronize(sc)); + CUDA_CHECK(cudaStreamSynchronize(sx)); + + if (timing) { + cudaEventRecord(t5, sc); + cudaEventSynchronize(t5); + float ms01, ms12, ms23, ms34, ms45, ms05; + cudaEventElapsedTime(&ms01, t0, t1); + cudaEventElapsedTime(&ms12, t1, t2); + cudaEventElapsedTime(&ms23, t2, t3); + cudaEventElapsedTime(&ms34, t3, t4); + cudaEventElapsedTime(&ms45, t4, t5); + cudaEventElapsedTime(&ms05, t0, t5); + fprintf(stderr, "vortex_commit_and_extract (n_rows=%zu, nc=%zu, scw=%zu):\n" + " H2D+RS encode %8.2f ms\n" + " xfer setup %8.2f ms\n" + " SIS hash %8.2f ms\n" + " P2 MD hash %8.2f ms\n" + " Merkle+D2H+sync %8.2f ms\n" + " TOTAL %8.2f ms\n", + n_rows, nc, scw, ms01, ms12, ms23, ms34, ms45, ms05); + cudaEventDestroy(t0); cudaEventDestroy(t1); cudaEventDestroy(t2); + cudaEventDestroy(t3); cudaEventDestroy(t4); cudaEventDestroy(t5); + } + + return KB_SUCCESS; +} + +// Accessors for async extraction pinned host buffers. +extern "C" uint32_t *kb_vortex_h_enc_pinned(kb_vortex_pipeline_t p) { return p ? p->h_enc_pinned : nullptr; } +extern "C" uint32_t *kb_vortex_h_sis_pinned(kb_vortex_pipeline_t p) { return p ? p->h_sis_pinned : nullptr; } +extern "C" uint32_t *kb_vortex_h_leaves_pinned(kb_vortex_pipeline_t p) { return p ? p->h_leaves_pinned : nullptr; } + +// ═════════════════════════════════════════════════════════════════════════════ +// Symbolic expression evaluator — GPU bytecode VM +// ═════════════════════════════════════════════════════════════════════════════ +// +// One thread per element, every thread executes the same bytecode → zero warp +// divergence. Slots live in per-thread local memory (L1-cached). +// +// thread i: +// E4 slots[num_slots] +// for pc in program: +// OP_CONST → slots[dst] = consts[ci] +// OP_INPUT → slots[dst] = read_input(inputs[id], i, n) +// OP_MUL → slots[dst] = Π slots[sₖ]^eₖ +// OP_LINCOMB → slots[dst] = Σ cₖ · slots[sₖ] +// OP_POLYEVAL→ slots[dst] = Horner(x, c₀..cₘ) +// out[i] = slots[result_slot] + +struct KBSymProgram { + uint32_t *d_program; + uint32_t *d_consts; + uint32_t pgm_len; + uint32_t num_consts; + uint32_t num_slots; + uint32_t result_slot; + + // Per-program reusable buffers, sized once on first call and reused on + // every subsequent call. Eliminates two cudaMalloc + cudaFree pairs + // per kb_sym_eval, which used to be a measurable fraction of the + // quotient hot path (especially at small n where the launch + alloc + // costs dominate the actual symbolic eval kernel). + SymInputDesc *d_inputs_pool; + size_t d_inputs_capacity; // in elements + uint32_t *d_out_pool; + size_t d_out_capacity; // in uint32_t (== 4 × n_elements_E4) +}; + +__device__ E4 sym_read_input(const SymInputDesc *desc, uint32_t i, uint32_t n) { + switch (desc->tag) { + case 0: return e4_from_kb(desc->d_ptr[i]); // KB column + case 1: return E4{{desc->val[0], desc->val[1]}, // E4 constant + {desc->val[2], desc->val[3]}}; + case 2: {uint32_t j = (i + desc->offset) % n; // rotated KB + return e4_from_kb(desc->d_ptr[j]);} + case 3: {const uint32_t *p = &desc->d_ptr[i * 4]; // E4 vector + return E4{{p[0], p[1]}, {p[2], p[3]}};} + case 4: {const uint32_t *p = desc->d_ptr; // E4 vector (SoA) + return E4{{p[i], p[n + i]}, {p[2 * n + i], p[3 * n + i]}};} + case 5: {uint32_t j = (i + desc->offset) % n; // rotated E4 vector (SoA) + const uint32_t *p = desc->d_ptr; + return E4{{p[j], p[n + j]}, {p[2 * n + j], p[3 * n + j]}};} + case 6: {uint32_t j = (i + desc->offset) % n; // rotated E4 vector (AoS) + const uint32_t *p = &desc->d_ptr[j * 4]; + return E4{{p[0], p[1]}, {p[2], p[3]}};} + default: return e4_zero(); + } +} + +__global__ void kern_symbolic_eval( + const uint32_t *program, uint32_t pgm_len, + const uint32_t *consts, // E4 constants, 4 words each + const SymInputDesc *inputs, + uint32_t n, + uint32_t result_slot, + uint32_t *d_out) +{ + uint32_t tid = (uint32_t)blockIdx.x * blockDim.x + threadIdx.x; + if (tid >= n) return; + + E4 slots[SYM_MAX_SLOTS]; + uint32_t pc = 0; + + while (pc < pgm_len) { + uint32_t op = program[pc++]; + uint32_t dst = program[pc++]; + + switch (op) { + case 0: { // OP_CONST + uint32_t ci = program[pc++]; + const uint32_t *c = consts + ci * 4; + slots[dst] = E4{{c[0], c[1]}, {c[2], c[3]}}; + break; + } + case 1: { // OP_INPUT + uint32_t id = program[pc++]; + slots[dst] = sym_read_input(&inputs[id], tid, n); + break; + } + case 2: { // OP_MUL: dst = Π slots[sₖ]^eₖ + uint32_t nc = program[pc++]; + uint32_t s0 = program[pc++]; + uint32_t e0 = program[pc++]; + E4 acc = e4_pow(slots[s0], e0); + for (uint32_t k = 1; k < nc; k++) { + uint32_t s = program[pc++]; + uint32_t e = program[pc++]; + acc = e4_mul(acc, e4_pow(slots[s], e)); + } + slots[dst] = acc; + break; + } + case 3: { // OP_LINCOMB: dst = Σ cₖ · slots[sₖ] + uint32_t nc = program[pc++]; + uint32_t s0 = program[pc++]; + int32_t c0 = (int32_t)program[pc++]; + E4 acc = e4_scale_signed(slots[s0], c0); + for (uint32_t k = 1; k < nc; k++) { + uint32_t s = program[pc++]; + int32_t c = (int32_t)program[pc++]; + acc = e4_add(acc, e4_scale_signed(slots[s], c)); + } + slots[dst] = acc; + break; + } + case 4: { // OP_POLYEVAL: Horner children=[x, c₀, c₁, ..., cₘ] + // P(x) = c₀ + c₁x + c₂x² + ... + cₘxᵐ + // Horner: acc = cₘ; for k = m-1..0: acc = acc·x + cₖ + uint32_t nc = program[pc++]; + E4 x = slots[program[pc]]; + E4 acc = slots[program[pc + nc - 1]]; + for (int k = (int)nc - 2; k >= 1; k--) + acc = e4_add(e4_mul(acc, x), slots[program[pc + k]]); + pc += nc; + slots[dst] = acc; + break; + } + } // switch + } // while + + // Write result as flat E4 + d_out[tid * 4 + 0] = slots[result_slot].b0.a0; + d_out[tid * 4 + 1] = slots[result_slot].b0.a1; + d_out[tid * 4 + 2] = slots[result_slot].b1.a0; + d_out[tid * 4 + 3] = slots[result_slot].b1.a1; +} + +// ── Symbolic C ABI ────────────────────────────────────────────────────────── + +extern "C" kb_error_t kb_sym_compile(gnark_gpu_context_t, + const uint32_t *bytecode, uint32_t pgm_len, + const uint32_t *constants, uint32_t num_consts, + uint32_t num_slots, + uint32_t result_slot, + kb_sym_program_t *out) { + if ((!bytecode && pgm_len) || num_slots > SYM_MAX_SLOTS) + return KB_ERROR_INVALID; + + auto *p = new(std::nothrow) KBSymProgram{}; + if (!p) return KB_ERROR_OOM; + + p->pgm_len = pgm_len; + p->num_consts = num_consts; + p->num_slots = num_slots; + p->result_slot = result_slot; + + if (pgm_len > 0) { + CUDA_CHECK(cudaMalloc(&p->d_program, pgm_len * sizeof(uint32_t))); + CUDA_CHECK(cudaMemcpy(p->d_program, bytecode, + pgm_len * sizeof(uint32_t), cudaMemcpyHostToDevice)); + } + if (num_consts > 0) { + CUDA_CHECK(cudaMalloc(&p->d_consts, num_consts * 4 * sizeof(uint32_t))); + CUDA_CHECK(cudaMemcpy(p->d_consts, constants, + num_consts * 4 * sizeof(uint32_t), cudaMemcpyHostToDevice)); + } + *out = p; + return KB_SUCCESS; +} + +extern "C" void kb_sym_free(kb_sym_program_t p) { + if (!p) return; + cudaFree(p->d_program); + cudaFree(p->d_consts); + if (p->d_inputs_pool) cudaFree(p->d_inputs_pool); + if (p->d_out_pool) cudaFree(p->d_out_pool); + delete p; +} + +extern "C" kb_error_t kb_sym_eval(gnark_gpu_context_t, + kb_sym_program_t program, + const SymInputDesc *h_inputs, uint32_t num_inputs, + uint32_t n, + uint32_t *h_out) { + if (!program || !h_out || n == 0) return KB_ERROR_INVALID; + + // Reuse the per-program input descriptor buffer; grow on demand. + // Was: cudaMalloc + cudaMemcpy(sync) + cudaFree on every call. + // Now: zero alloc on the steady state; D2H stays sync because the + // caller (gpu/quotient.RunGPU) needs the result before the + // host-side annulator scaling pass. + if (num_inputs > 0) { + if (program->d_inputs_capacity < num_inputs) { + if (program->d_inputs_pool) cudaFree(program->d_inputs_pool); + CUDA_CHECK(cudaMalloc(&program->d_inputs_pool, + num_inputs * sizeof(SymInputDesc))); + program->d_inputs_capacity = num_inputs; + } + CUDA_CHECK(cudaMemcpy(program->d_inputs_pool, h_inputs, + num_inputs * sizeof(SymInputDesc), + cudaMemcpyHostToDevice)); + } + + // Reuse the per-program output buffer; grow on demand. + size_t out_count = (size_t)n * 4; + if (program->d_out_capacity < out_count) { + if (program->d_out_pool) cudaFree(program->d_out_pool); + CUDA_CHECK(cudaMalloc(&program->d_out_pool, + out_count * sizeof(uint32_t))); + program->d_out_capacity = out_count; + } + + kern_symbolic_eval<<>>( + program->d_program, program->pgm_len, + program->d_consts, + program->d_inputs_pool, + n, + program->result_slot, + program->d_out_pool); + + // D2H of the eval result. Caller materializes it as []fext.E4 in Go + // and runs the per-element annulator scaling on host. A future + // optimization moves that scaling onto the GPU and D2Hs the smaller + // base-field result instead — see worklog Step 4 outstanding items. + CUDA_CHECK(cudaMemcpy(h_out, program->d_out_pool, + out_count * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + + return KB_SUCCESS; +} + +extern "C" uint32_t *kb_vec_device_ptr(kb_vec_t v) { + return v ? v->d_data : nullptr; +} + +// ── GPU Prove helpers ──────────────────────────────────────────────────────── + +// Linear combination on column-major encoded matrix: +// UAlpha[j] = Σᵢ αⁱ · d_encoded_col[j × n_rows + i] +// Result is E4 vector of length scw, written to host. +extern "C" kb_error_t kb_vortex_lincomb(kb_vortex_pipeline_t p, + size_t n_rows, + const uint32_t alpha_raw[4], + uint32_t *result) { + if (!p || !alpha_raw || !result) return KB_ERROR_INVALID; + if (n_rows > p->max_n_rows) return KB_ERROR_SIZE; + + size_t scw = p->size_codeword; + E4 alpha = {{alpha_raw[0], alpha_raw[1]}, {alpha_raw[2], alpha_raw[3]}}; + + uint32_t *d_result; + CUDA_CHECK(cudaMalloc(&d_result, scw * 4 * sizeof(uint32_t))); + + kern_lincomb_e4_colmajor<<>>( + p->d_encoded_col, n_rows, scw, alpha, d_result); + + CUDA_CHECK(cudaMemcpy(result, d_result, + scw * 4 * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + cudaFree(d_result); + return KB_SUCCESS; +} + +// Extract a single column from the column-major encoded matrix to host. +// out[i] = d_encoded_col[col_idx × n_rows + i], i ∈ [0, n_rows) +extern "C" kb_error_t kb_vortex_extract_col(kb_vortex_pipeline_t p, + size_t n_rows, int col_idx, + uint32_t *out) { + if (!p || !out) return KB_ERROR_INVALID; + if (n_rows > p->max_n_rows) return KB_ERROR_SIZE; + if (col_idx < 0 || (size_t)col_idx >= p->size_codeword) return KB_ERROR_INVALID; + + CUDA_CHECK(cudaMemcpy(out, + p->d_encoded_col + (size_t)col_idx * n_rows, + n_rows * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + return KB_SUCCESS; +} + +// Extract full encoded matrix from GPU to host in column-major layout. +// out: [scw × n_rows] uint32, column-major: out[col * n_rows + row]. +extern "C" kb_error_t kb_vortex_extract_all(kb_vortex_pipeline_t p, + size_t n_rows, uint32_t *out) { + if (!p || !out) return KB_ERROR_INVALID; + if (n_rows > p->max_n_rows) return KB_ERROR_SIZE; + + size_t scw = p->size_codeword; + CUDA_CHECK(cudaMemcpy(out, p->d_encoded_col, + scw * n_rows * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + return KB_SUCCESS; +} + +// Transpose kernel: column-major [scw × n_rows] → row-major [n_rows × scw]. +// One thread per element. +__global__ void kern_transpose_col_to_row(const uint32_t *__restrict__ col_major, + uint32_t *__restrict__ row_major, + size_t n_rows, size_t scw) { + size_t idx = (size_t)blockIdx.x * blockDim.x + threadIdx.x; + size_t total = n_rows * scw; + if (idx >= total) return; + size_t row = idx / scw; + size_t col = idx % scw; + row_major[idx] = col_major[col * n_rows + row]; +} + +// Extract full encoded matrix from GPU to host in row-major layout. +// out: [n_rows × scw] uint32, row-major: out[row * scw + col]. +// Transposes on GPU before D2H to avoid costly CPU transposition. +extern "C" kb_error_t kb_vortex_extract_all_rowmajor(kb_vortex_pipeline_t p, + size_t n_rows, + uint32_t *out) { + if (!p || !out) return KB_ERROR_INVALID; + if (n_rows > p->max_n_rows) return KB_ERROR_SIZE; + + size_t scw = p->size_codeword; + size_t total = scw * n_rows; + + // Allocate temp buffer on GPU for row-major result + uint32_t *d_rowmajor = nullptr; + CUDA_CHECK(cudaMalloc(&d_rowmajor, total * sizeof(uint32_t))); + + // Transpose on GPU + kern_transpose_col_to_row<<>>( + p->d_encoded_col, d_rowmajor, n_rows, scw); + CUDA_CHECK(cudaGetLastError()); + + // D2H the row-major buffer + CUDA_CHECK(cudaMemcpy(out, d_rowmajor, total * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + + cudaFree(d_rowmajor); + return KB_SUCCESS; +} + +// Get raw device pointer to column-major encoded matrix. +extern "C" uint32_t *kb_vortex_encoded_device_ptr(kb_vortex_pipeline_t p) { + return p ? p->d_encoded_col : nullptr; +} + +// Lincomb from a standalone column-major device buffer (not pipeline-bound). +// result[j] = Σᵢ αⁱ · d_encoded[j * n_rows + i] ∈ E4, j ∈ [0, scw) +extern "C" kb_error_t kb_lincomb_e4_colmajor(gnark_gpu_context_t ctx, + const uint32_t *d_encoded_col, + size_t n_rows, size_t scw, + const uint32_t alpha_raw[4], + uint32_t *result) { + (void)ctx; + if (!d_encoded_col || !alpha_raw || !result) return KB_ERROR_INVALID; + if (n_rows == 0 || scw == 0) return KB_ERROR_SIZE; + + E4 alpha = {{alpha_raw[0], alpha_raw[1]}, {alpha_raw[2], alpha_raw[3]}}; + + uint32_t *d_result; + CUDA_CHECK(cudaMalloc(&d_result, scw * 4 * sizeof(uint32_t))); + + kern_lincomb_e4_colmajor<<>>( + d_encoded_col, n_rows, scw, alpha, d_result); + + CUDA_CHECK(cudaMemcpy(result, d_result, + scw * 4 * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + cudaFree(d_result); + return KB_SUCCESS; +} + +// Extract SIS column hashes from GPU to host. +// out: flat [scw × degree] uint32, same layout as d_sis. +extern "C" kb_error_t kb_vortex_extract_sis(kb_vortex_pipeline_t p, + size_t n_rows, uint32_t *out) { + if (!p || !out) return KB_ERROR_INVALID; + (void)n_rows; // SIS hashes are per-column, not per-row + size_t scw = p->size_codeword; + size_t deg = (size_t)p->degree; + CUDA_CHECK(cudaMemcpy(out, p->d_sis, + scw * deg * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + return KB_SUCCESS; +} + +// Extract leaf hashes (Poseidon2 digests) from GPU to host. +// out: flat [scw × 8] uint32, same layout as d_leaves. +extern "C" kb_error_t kb_vortex_extract_leaves(kb_vortex_pipeline_t p, + uint32_t *out) { + if (!p || !out) return KB_ERROR_INVALID; + size_t scw = p->size_codeword; + CUDA_CHECK(cudaMemcpy(out, p->d_leaves, + scw * 8 * sizeof(uint32_t), + cudaMemcpyDeviceToHost)); + return KB_SUCCESS; +} + +// Return sizeCodeWord for the pipeline. +extern "C" size_t kb_vortex_scw(kb_vortex_pipeline_t p) { + return p ? p->size_codeword : 0; +} + +// Return degree (SIS polynomial degree) for the pipeline. +extern "C" int kb_vortex_degree(kb_vortex_pipeline_t p) { + return p ? p->degree : 0; +} diff --git a/prover/gpu/cuda/src/vortex/kb_field.cuh b/prover/gpu/cuda/src/vortex/kb_field.cuh new file mode 100644 index 00000000000..8fd5cd2c762 --- /dev/null +++ b/prover/gpu/cuda/src/vortex/kb_field.cuh @@ -0,0 +1,201 @@ +// KoalaBear field arithmetic — GPU device functions +// +// Field: P = 2³¹ − 2²⁴ + 1 = 0x7f000001 (31-bit prime) +// Montgomery form: R = 2³², elements stored as uint32 in [0, P) +// +// Extension tower: +// E2 = KB[u] / (u² − 3) → u² = 3 +// E4 = E2[v] / (v² − u) → v² = u +// E4 element = (a₀, a₁, a₂, a₃) representing a₀ + a₁u + a₂v + a₃uv + +#pragma once +#include + +// ─── Field constants ──────────────────────────────────────────────────────── + +static constexpr uint32_t KB_P = 0x7f000001u; // prime +static constexpr uint32_t KB_MU = 0x7effffffu; // −P⁻¹ mod 2³² +static constexpr uint32_t KB_R2 = 402124772u; // R² mod P (for toMont) +static constexpr uint32_t KB_ONE = 33554430u; // 1 in Montgomery = 2³² mod P +static constexpr uint32_t KB_THREE_M = 100663290u; // 3 in Montgomery form +// Note: KB_THREE_M = kb_mul_host(KB_ONE, 3_mont). Verified: 3·R mod P = 100663290. + +// ─── Base field: Montgomery u32 ───────────────────────────────────────────── + +// Montgomery reduction: x·R⁻¹ mod P +__device__ __forceinline__ uint32_t kb_reduce(uint64_t x) { + uint32_t q = (uint32_t)x * KB_MU; + uint32_t r = (uint32_t)((x + (uint64_t)q * KB_P) >> 32); + return r >= KB_P ? r - KB_P : r; +} + +__device__ __forceinline__ uint32_t kb_add(uint32_t a, uint32_t b) { + uint32_t t = a + b; + return t >= KB_P ? t - KB_P : t; +} + +__device__ __forceinline__ uint32_t kb_sub(uint32_t a, uint32_t b) { + return a >= b ? a - b : a + KB_P - b; +} + +__device__ __forceinline__ uint32_t kb_mul(uint32_t a, uint32_t b) { + return kb_reduce((uint64_t)a * b); +} + +__device__ __forceinline__ uint32_t kb_sqr(uint32_t a) { + return kb_reduce((uint64_t)a * a); +} + +__device__ __forceinline__ uint32_t kb_neg(uint32_t a) { + return a == 0 ? 0 : KB_P - a; +} + +__device__ __forceinline__ uint32_t kb_dbl(uint32_t a) { + return kb_add(a, a); +} + +// v[i] = gⁱ·v[i] — used for coset shift +__device__ __forceinline__ uint32_t kb_mul3(uint32_t a) { + return kb_mul(a, KB_THREE_M); +} + +// ─── E2 = KB[u] / (u² − 3) ───────────────────────────────────────────────── +// +// Element (a₀, a₁) represents a₀ + a₁·u where u² = 3. +// +// Multiplication: (a₀+a₁u)(b₀+b₁u) = (a₀b₀ + 3·a₁b₁) + (a₀b₁ + a₁b₀)u +// Karatsuba: k = (a₀+a₁)(b₀+b₁), d₀ = a₀b₀, d₁ = a₁b₁ +// → c₀ = d₀ + 3·d₁, c₁ = k − d₀ − d₁ + +struct E2 { uint32_t a0, a1; }; + +__device__ __forceinline__ E2 e2_add(E2 a, E2 b) { + return {kb_add(a.a0, b.a0), kb_add(a.a1, b.a1)}; +} + +__device__ __forceinline__ E2 e2_sub(E2 a, E2 b) { + return {kb_sub(a.a0, b.a0), kb_sub(a.a1, b.a1)}; +} + +__device__ __forceinline__ E2 e2_mul(E2 a, E2 b) { + uint32_t d0 = kb_mul(a.a0, b.a0); + uint32_t d1 = kb_mul(a.a1, b.a1); + uint32_t k = kb_mul(kb_add(a.a0, a.a1), kb_add(b.a0, b.a1)); + return {kb_add(d0, kb_mul3(d1)), kb_sub(k, kb_add(d0, d1))}; +} + +// Multiply by non-residue u: (a₀+a₁u)·u = 3a₁ + a₀u +__device__ __forceinline__ E2 e2_mul_nr(E2 a) { + return {kb_mul3(a.a1), a.a0}; +} + +__device__ __forceinline__ E2 e2_neg(E2 a) { + return {kb_neg(a.a0), kb_neg(a.a1)}; +} + +__device__ __forceinline__ E2 e2_sqr(E2 a) { + return e2_mul(a, a); // could optimize but clarity > 2 instructions +} + +// Multiply E2 by base field scalar +__device__ __forceinline__ E2 e2_scale(E2 a, uint32_t s) { + return {kb_mul(a.a0, s), kb_mul(a.a1, s)}; +} + +// ─── E4 = E2[v] / (v² − u) ───────────────────────────────────────────────── +// +// Element (b₀, b₁) represents b₀ + b₁·v where v² = u, and bᵢ ∈ E2. +// Flat layout: (a₀, a₁, a₂, a₃) = (b₀.a0, b₀.a1, b₁.a0, b₁.a1) +// +// Multiplication: (b₀+b₁v)(c₀+c₁v) = (b₀c₀ + b₁c₁·u) + (b₀c₁+b₁c₀)v +// Karatsuba: k = (b₀+b₁)(c₀+c₁), d₀ = b₀c₀, d₁ = b₁c₁ +// → r₀ = d₀ + mulNR(d₁), r₁ = k − d₀ − d₁ + +struct E4 { E2 b0, b1; }; + +__device__ __forceinline__ E4 e4_add(E4 a, E4 b) { + return {e2_add(a.b0, b.b0), e2_add(a.b1, b.b1)}; +} + +__device__ __forceinline__ E4 e4_sub(E4 a, E4 b) { + return {e2_sub(a.b0, b.b0), e2_sub(a.b1, b.b1)}; +} + +__device__ __forceinline__ E4 e4_mul(E4 a, E4 b) { + E2 d0 = e2_mul(a.b0, b.b0); + E2 d1 = e2_mul(a.b1, b.b1); + E2 k = e2_mul(e2_add(a.b0, a.b1), e2_add(b.b0, b.b1)); + return {e2_add(d0, e2_mul_nr(d1)), e2_sub(k, e2_add(d0, d1))}; +} + +// Multiply E4 by base field scalar: s · (a₀+a₁u+a₂v+a₃uv) = (sa₀+sa₁u+sa₂v+sa₃uv) +__device__ __forceinline__ E4 e4_scale(E4 a, uint32_t s) { + return {e2_scale(a.b0, s), e2_scale(a.b1, s)}; +} + +// Accumulate: dst += scalar · e4 (used in linear combination) +__device__ __forceinline__ void e4_mulacc(E4& acc, uint32_t s, E4 alpha_pow) { + // acc += s * alpha_pow (scalar from base field, alpha_pow ∈ E4) + acc = e4_add(acc, e4_scale(alpha_pow, s)); +} + +static __device__ __forceinline__ E4 e4_zero() { + return {{0, 0}, {0, 0}}; +} + +// ─── Montgomery conversion ────────────────────────────────────────────────── +// from_mont: stored (a·R mod P) → canonical a +// to_mont: canonical a → stored (a·R mod P) + +__device__ __forceinline__ uint32_t kb_from_mont(uint32_t a) { + return kb_reduce((uint64_t)a); +} + +__device__ __forceinline__ uint32_t kb_to_mont(uint32_t a) { + return kb_reduce((uint64_t)a * KB_R2); +} + +// ─── E4 extended helpers ──────────────────────────────────────────────────── + +__device__ __forceinline__ E4 e4_one() { + return {{KB_ONE, 0}, {0, 0}}; +} + +__device__ __forceinline__ E4 e4_neg(E4 a) { + return {e2_neg(a.b0), e2_neg(a.b1)}; +} + +__device__ __forceinline__ E4 e4_sqr(E4 a) { + return e4_mul(a, a); +} + +// Square-and-multiply: base^exp. Fast paths for exp ∈ {0,1,2}. +__device__ __forceinline__ E4 e4_pow(E4 base, uint32_t exp) { + if (exp == 0) return e4_one(); + if (exp == 1) return base; + if (exp == 2) return e4_sqr(base); + E4 r = e4_one(); + while (exp > 0) { + if (exp & 1) r = e4_mul(r, base); + base = e4_sqr(base); + exp >>= 1; + } + return r; +} + +// Scale E4 by small signed integer coefficient. +// Fast paths: c ∈ {0, ±1, 2}; general case via Montgomery. +__device__ __forceinline__ E4 e4_scale_signed(E4 a, int32_t c) { + if (c == 1) return a; + if (c == -1) return e4_neg(a); + if (c == 0) return e4_zero(); + if (c == 2) return e4_add(a, a); + uint32_t abs_c = kb_to_mont((uint32_t)(c < 0 ? -c : c)); + E4 r = e4_scale(a, abs_c); + return c < 0 ? e4_neg(r) : r; +} + +// Embed KB base field scalar into E4: val ↦ (val, 0, 0, 0) +__device__ __forceinline__ E4 e4_from_kb(uint32_t val) { + return {{val, 0}, {0, 0}}; +} diff --git a/prover/gpu/device.go b/prover/gpu/device.go new file mode 100644 index 00000000000..6560a6bcb4c --- /dev/null +++ b/prover/gpu/device.go @@ -0,0 +1,191 @@ +//go:build cuda + +package gpu + +/* +#cgo LDFLAGS: -L${SRCDIR}/cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/cuda/include + +#include "gnark_gpu.h" +#include + +extern int cudaGetDeviceCount(int *count); +*/ +import "C" +import ( + "runtime" + "unsafe" +) + +// toError converts a C error code to a Go error. +func toError(code C.gnark_gpu_error_t) error { + switch code { + case C.GNARK_GPU_SUCCESS: + return nil + case C.GNARK_GPU_ERROR_CUDA: + return &Error{Code: int(code), Message: "CUDA error"} + case C.GNARK_GPU_ERROR_INVALID_ARG: + return &Error{Code: int(code), Message: "invalid argument"} + case C.GNARK_GPU_ERROR_OUT_OF_MEMORY: + return &Error{Code: int(code), Message: "out of GPU memory"} + case C.GNARK_GPU_ERROR_SIZE_MISMATCH: + return &Error{Code: int(code), Message: "vector size mismatch"} + default: + return &Error{Code: int(code), Message: "unknown error"} + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Device +// ───────────────────────────────────────────────────────────────────────────── + +// Device manages GPU resources and operations. +// Create with New() and release with Close(). +type Device struct { + handle C.gnark_gpu_context_t + multiStreamInit bool + deviceID int +} + +// New creates a Device on the specified GPU. +// The device must be closed when no longer needed. +func New(opts ...Option) (*Device, error) { + cfg := config{deviceID: 0} + for _, o := range opts { + o(&cfg) + } + + var handle C.gnark_gpu_context_t + if err := toError(C.gnark_gpu_init(C.int(cfg.deviceID), &handle)); err != nil { + return nil, err + } + + d := &Device{handle: handle, deviceID: cfg.deviceID} + runtime.SetFinalizer(d, (*Device).Close) + return d, nil +} + +// PhysicalDeviceCount returns the number of CUDA devices visible to the +// process. It returns 0 if CUDA cannot report a device count. +func PhysicalDeviceCount() int { + var n C.int + if C.cudaGetDeviceCount(&n) != 0 { + return 0 + } + if n < 0 { + return 0 + } + return int(n) +} + +// DeviceID returns the GPU index this Device was created against. +func (d *Device) DeviceID() int { return d.deviceID } + +// Bind sets the CUDA "current device" for the calling OS thread to this +// Device's GPU. CUDA's current-device state is per-thread, so on multi-GPU +// hosts every host thread that issues CUDA calls must call Bind once +// (typically after runtime.LockOSThread). Without it, allocations and +// kernel launches silently fall through to device 0. +// +// Idempotent and cheap — wraps a single cudaSetDevice. +func (d *Device) Bind() error { + return toError(C.gnark_gpu_set_device(C.int(d.deviceID))) +} + +// Close releases GPU resources associated with this device. +// It is safe to call Close multiple times. +func (d *Device) Close() error { + if d.handle != nil { + C.gnark_gpu_destroy(d.handle) + d.handle = nil + runtime.SetFinalizer(d, nil) + } + return nil +} + +// Sync waits for all queued GPU operations to complete and returns +// any deferred error. +func (d *Device) Sync() error { + if d.handle == nil { + return ErrDeviceClosed + } + return toError(C.gnark_gpu_sync(d.handle)) +} + +// MemGetInfo returns free and total GPU memory in bytes. +func (d *Device) MemGetInfo() (free, total uint64, err error) { + if d.handle == nil { + return 0, 0, ErrDeviceClosed + } + var f, t C.size_t + if err := toError(C.gnark_gpu_mem_get_info(d.handle, &f, &t)); err != nil { + return 0, 0, err + } + return uint64(f), uint64(t), nil +} + +// Closed reports whether the device has been closed. +func (d *Device) Closed() bool { + return d.handle == nil +} + +// Handle returns the opaque CUDA context handle as an unsafe.Pointer. +// Subpackages (plonk, vortex, symbolic) cast this back to their own +// C.gnark_gpu_context_t via their CGO import. +func (d *Device) Handle() unsafe.Pointer { + return unsafe.Pointer(d.handle) +} + +// CreateStream creates a CUDA stream at the given ID. +// Stream 0 (StreamCompute) is created automatically with the device. +func (d *Device) CreateStream(id StreamID) error { + if d.handle == nil { + return ErrDeviceClosed + } + return toError(C.gnark_gpu_create_stream(d.handle, C.int(id))) +} + +// RecordEvent records an event on the specified stream. +// Another stream can later wait for this event via WaitEvent. +func (d *Device) RecordEvent(stream StreamID, event EventID) { + if d.handle == nil { + panic("gpu: RecordEvent on closed device") + } + if err := toError(C.gnark_gpu_record_event(d.handle, C.int(stream), C.int(event))); err != nil { + panic("gpu: RecordEvent failed: " + err.Error()) + } +} + +// WaitEvent makes the specified stream wait until the given event is recorded. +func (d *Device) WaitEvent(stream StreamID, event EventID) { + if d.handle == nil { + panic("gpu: WaitEvent on closed device") + } + if err := toError(C.gnark_gpu_wait_event(d.handle, C.int(stream), C.int(event))); err != nil { + panic("gpu: WaitEvent failed: " + err.Error()) + } +} + +// SyncStream waits for all operations on the specified stream to complete. +func (d *Device) SyncStream(stream StreamID) error { + if d.handle == nil { + return ErrDeviceClosed + } + return toError(C.gnark_gpu_sync_stream(d.handle, C.int(stream))) +} + +// InitMultiStream creates the transfer and MSM streams. +// Call once before using multi-stream operations. +func (d *Device) InitMultiStream() error { + if d.multiStreamInit { + return nil + } + if err := d.CreateStream(StreamTransfer); err != nil { + return err + } + if err := d.CreateStream(StreamMSM); err != nil { + return err + } + d.multiStreamInit = true + return nil +} diff --git a/prover/gpu/device_stub.go b/prover/gpu/device_stub.go new file mode 100644 index 00000000000..2f14429bb95 --- /dev/null +++ b/prover/gpu/device_stub.go @@ -0,0 +1,28 @@ +//go:build !cuda + +package gpu + +import "unsafe" + +// Device is a stub for non-CUDA builds. All methods panic. +// Guard with gpu.Enabled before calling. +type Device struct{} + +func New(opts ...Option) (*Device, error) { panic("gpu: requires cuda build tag") } +func (d *Device) Close() error { panic("gpu: requires cuda build tag") } +func (d *Device) Sync() error { panic("gpu: requires cuda build tag") } +func (d *Device) Closed() bool { return true } +func (d *Device) Handle() unsafe.Pointer { panic("gpu: requires cuda build tag") } +func (d *Device) DeviceID() int { return 0 } +func (d *Device) Bind() error { panic("gpu: requires cuda build tag") } +func PhysicalDeviceCount() int { return 0 } + +func (d *Device) MemGetInfo() (free, total uint64, err error) { + panic("gpu: requires cuda build tag") +} + +func (d *Device) CreateStream(id StreamID) error { panic("gpu: requires cuda build tag") } +func (d *Device) RecordEvent(stream StreamID, event EventID) { panic("gpu: requires cuda build tag") } +func (d *Device) WaitEvent(stream StreamID, event EventID) { panic("gpu: requires cuda build tag") } +func (d *Device) SyncStream(stream StreamID) error { panic("gpu: requires cuda build tag") } +func (d *Device) InitMultiStream() error { panic("gpu: requires cuda build tag") } diff --git a/prover/gpu/enabled_cuda.go b/prover/gpu/enabled_cuda.go new file mode 100644 index 00000000000..d4714ffb5ca --- /dev/null +++ b/prover/gpu/enabled_cuda.go @@ -0,0 +1,9 @@ +//go:build cuda + +package gpu + +// Enabled is true when the binary is built with the cuda tag. +// Use as a compile-time constant so the compiler eliminates dead branches: +// +// if gpu.Enabled { /* GPU path */ } else { /* CPU path */ } +const Enabled = true diff --git a/prover/gpu/enabled_nocuda.go b/prover/gpu/enabled_nocuda.go new file mode 100644 index 00000000000..d19799507ef --- /dev/null +++ b/prover/gpu/enabled_nocuda.go @@ -0,0 +1,9 @@ +//go:build !cuda + +package gpu + +// Enabled is false when the binary is built without the cuda tag. +// Use as a compile-time constant so the compiler eliminates dead branches: +// +// if gpu.Enabled { /* GPU path */ } else { /* CPU path */ } +const Enabled = false diff --git a/prover/gpu/gpu.go b/prover/gpu/gpu.go new file mode 100644 index 00000000000..5275aad9968 --- /dev/null +++ b/prover/gpu/gpu.go @@ -0,0 +1,102 @@ +// Package gpu provides GPU device management, stream scheduling, and error +// handling for CUDA-accelerated cryptographic operations (PlonK prover, Vortex +// commitment, symbolic expression evaluation). +// +// Build tags: +// +// cuda — links against CUDA runtime; full GPU acceleration +// !cuda — stub types that panic; compile-only on non-GPU machines +// +// Usage: +// +// if gpu.Enabled { +// dev, _ := gpu.New() +// defer dev.Close() +// // GPU path ... +// } else { +// // CPU path ... +// } +// +// Runtime model (cuda build): +// +// Go code C API CUDA stream(s) +// ------- ----- ---------------- +// New() -----> gnark_gpu_init --> context + default stream +// ops() -----> enqueue kernels --> async execution +// Sync() -----> sync call --> wait until stream idle +// Close() -----> destroy --> free all device resources +// +// Error contract: +// - C API returns error codes; ToError maps them to stable Go errors. +// - Compute-path methods panic on programmer misuse (size/device mismatch), +// matching gnark-crypto style. +package gpu + +import ( + "errors" + "fmt" +) + +// ───────────────────────────────────────────────────────────────────────────── +// Errors +// ───────────────────────────────────────────────────────────────────────────── + +// ErrDeviceClosed is returned when operating on a closed device. +var ErrDeviceClosed = errors.New("gpu: device closed") + +// Error represents a gnark-gpu error. +type Error struct { + Code int + Message string +} + +func (e *Error) Error() string { + return fmt.Sprintf("gpu: error %d: %s", e.Code, e.Message) +} + +// ───────────────────────────────────────────────────────────────────────────── +// CUDA Streams & Events +// +// Streams allow overlapping GPU operations (compute, H2D, D2H) for pipeline +// parallelism. Events provide cross-stream synchronization. +// +// ┌─────────────┐ ┌─────────────┐ ┌─────────────┐ +// │ Compute(0) │ │ Transfer(1) │ │ MSM(2) │ +// │ FFT, gates │ │ H2D / D2H │ │ sort+accum │ +// └──────┬──────┘ └──────┬──────┘ └──────┬──────┘ +// │ │ │ +// ├─── event ──────►│ │ +// │ ├─── event ───────►│ +// │ │ │ +// +// ───────────────────────────────────────────────────────────────────────────── + +// StreamID identifies a CUDA stream within a Device context. +type StreamID int + +const ( + StreamCompute StreamID = 0 // default compute stream (always available) + StreamTransfer StreamID = 1 // dedicated H2D/D2H transfer stream + StreamMSM StreamID = 2 // dedicated MSM pipeline stream +) + +// EventID identifies a CUDA event for cross-stream synchronization. +type EventID int + +// ───────────────────────────────────────────────────────────────────────────── +// Device configuration +// ───────────────────────────────────────────────────────────────────────────── + +type config struct { + deviceID int +} + +// Option configures a Device. +type Option func(*config) + +// WithDeviceID selects which GPU to use (default 0). +func WithDeviceID(id int) Option { + return func(c *config) { + c.deviceID = id + } +} diff --git a/prover/gpu/internal/generator/common/generator.go b/prover/gpu/internal/generator/common/generator.go new file mode 100644 index 00000000000..576071eb4ea --- /dev/null +++ b/prover/gpu/internal/generator/common/generator.go @@ -0,0 +1,61 @@ +// Package common provides a simple code generator for the gpu/plonk2 packages. +// +// It reads Go text/template files, executes them with curve-specific data, +// formats the output with go/format, and writes the result to the target directory. +package common + +import ( + "bytes" + "fmt" + "go/format" + "os" + "path/filepath" + "strings" + "text/template" +) + +// Generator executes templates and writes formatted Go source files. +type Generator struct { + generatedBy string +} + +// New creates a Generator that stamps each output file with the given generatedBy label. +func New(generatedBy string) *Generator { + return &Generator{generatedBy: generatedBy} +} + +// Execute renders templateSrc with data, formats the result as Go source, and +// writes it to outputPath. The directory is created if it does not exist. +func (g *Generator) Execute(outputPath string, templateSrc string, data any) error { + tmpl, err := template.New("").Funcs(template.FuncMap{ + "toLower": strings.ToLower, + "toUpper": strings.ToUpper, + "mul": func(a, b int) int { return a * b }, + "add": func(a, b int) int { return a + b }, + }).Parse(templateSrc) + if err != nil { + return fmt.Errorf("parse template for %s: %w", outputPath, err) + } + + var buf bytes.Buffer + if err := tmpl.Execute(&buf, data); err != nil { + return fmt.Errorf("execute template for %s: %w", outputPath, err) + } + + src, err := format.Source(buf.Bytes()) + if err != nil { + // Emit unformatted source for debugging. + // Generated source/debug files should have normal repository file permissions. + //nolint:gosec + _ = os.WriteFile(outputPath+".broken", buf.Bytes(), 0o644) + return fmt.Errorf("format source for %s: %w\n\nunformatted written to %s.broken", outputPath, err, outputPath) + } + + if err := os.MkdirAll(filepath.Dir(outputPath), 0o755); err != nil { + return fmt.Errorf("mkdir %s: %w", filepath.Dir(outputPath), err) + } + + // Generated source files should have normal repository file permissions. + //nolint:gosec + return os.WriteFile(outputPath, src, 0o644) +} diff --git a/prover/gpu/internal/generator/config/bls12377.go b/prover/gpu/internal/generator/config/bls12377.go new file mode 100644 index 00000000000..1119d4c6329 --- /dev/null +++ b/prover/gpu/internal/generator/config/bls12377.go @@ -0,0 +1,20 @@ +package config + +// BLS12377 is the curve configuration for BLS12-377. +var BLS12377 = Curve{ + Name: "bls12377", + Package: "bls12377", + FrLimbs: 4, + FpLimbs: 6, + ScalarBits: 253, + GnarkCryptoFr: "github.com/consensys/gnark-crypto/ecc/bls12-377/fr", + GnarkCryptoFFT: "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft", + GnarkCryptoKZG: "github.com/consensys/gnark-crypto/ecc/bls12-377/kzg", + GnarkCryptoIOP: "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/iop", + GnarkCryptoHTF: "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/hash_to_field", + GnarkCurve: "github.com/consensys/gnark-crypto/ecc/bls12-377", + GnarkCS: "github.com/consensys/gnark/constraint/bls12-377", + GnarkPlonk: "github.com/consensys/gnark/backend/plonk/bls12-377", + CurveIndex: 2, // GNARK_GPU_PLONK2_CURVE_BLS12_377 + EccIDStr: "BLS12_377", +} diff --git a/prover/gpu/internal/generator/config/bn254.go b/prover/gpu/internal/generator/config/bn254.go new file mode 100644 index 00000000000..bb03a6ebe73 --- /dev/null +++ b/prover/gpu/internal/generator/config/bn254.go @@ -0,0 +1,20 @@ +package config + +// BN254 is the curve configuration for BN254. +var BN254 = Curve{ + Name: "bn254", + Package: "bn254", + FrLimbs: 4, + FpLimbs: 4, + ScalarBits: 254, + GnarkCryptoFr: "github.com/consensys/gnark-crypto/ecc/bn254/fr", + GnarkCryptoFFT: "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft", + GnarkCryptoKZG: "github.com/consensys/gnark-crypto/ecc/bn254/kzg", + GnarkCryptoIOP: "github.com/consensys/gnark-crypto/ecc/bn254/fr/iop", + GnarkCryptoHTF: "github.com/consensys/gnark-crypto/ecc/bn254/fr/hash_to_field", + GnarkCurve: "github.com/consensys/gnark-crypto/ecc/bn254", + GnarkCS: "github.com/consensys/gnark/constraint/bn254", + GnarkPlonk: "github.com/consensys/gnark/backend/plonk/bn254", + CurveIndex: 1, // GNARK_GPU_PLONK2_CURVE_BN254 + EccIDStr: "BN254", +} diff --git a/prover/gpu/internal/generator/config/bw6761.go b/prover/gpu/internal/generator/config/bw6761.go new file mode 100644 index 00000000000..06db164f90b --- /dev/null +++ b/prover/gpu/internal/generator/config/bw6761.go @@ -0,0 +1,20 @@ +package config + +// BW6761 is the curve configuration for BW6-761. +var BW6761 = Curve{ + Name: "bw6761", + Package: "bw6761", + FrLimbs: 6, + FpLimbs: 12, + ScalarBits: 377, + GnarkCryptoFr: "github.com/consensys/gnark-crypto/ecc/bw6-761/fr", + GnarkCryptoFFT: "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/fft", + GnarkCryptoKZG: "github.com/consensys/gnark-crypto/ecc/bw6-761/kzg", + GnarkCryptoIOP: "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/iop", + GnarkCryptoHTF: "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/hash_to_field", + GnarkCurve: "github.com/consensys/gnark-crypto/ecc/bw6-761", + GnarkCS: "github.com/consensys/gnark/constraint/bw6-761", + GnarkPlonk: "github.com/consensys/gnark/backend/plonk/bw6-761", + CurveIndex: 3, // GNARK_GPU_PLONK2_CURVE_BW6_761 + EccIDStr: "BW6_761", +} diff --git a/prover/gpu/internal/generator/config/curve.go b/prover/gpu/internal/generator/config/curve.go new file mode 100644 index 00000000000..7e7fed9f5a8 --- /dev/null +++ b/prover/gpu/internal/generator/config/curve.go @@ -0,0 +1,26 @@ +package config + +// Curve holds all configuration needed to generate a typed per-curve GPU package. +type Curve struct { + Name string // "bn254", "bls12377", "bw6761" + Package string // Go package name: "bn254", "bls12377", "bw6761" + FrLimbs int // Fr limb count: 4 or 6 + FpLimbs int // Fp limb count: 4, 6, or 12 + ScalarBits int // scalar bit-width: 254, 253, 377 + + // gnark-crypto import paths + GnarkCryptoFr string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr" + GnarkCryptoFFT string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + GnarkCryptoKZG string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/kzg" + GnarkCryptoIOP string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/iop" + GnarkCryptoHTF string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/hash_to_field" + GnarkCurve string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254" + GnarkCS string // e.g. "github.com/consensys/gnark/constraint/bn254" + GnarkPlonk string // e.g. "github.com/consensys/gnark/backend/plonk/bn254" + + // CurveIndex is the integer passed to curve-indexed C API calls (curve ID). + CurveIndex int + + // EccIDStr is the gnark-crypto ecc.ID string (e.g., "BN254", "BLS12_377", "BW6_761"). + EccIDStr string +} diff --git a/prover/gpu/internal/generator/main.go b/prover/gpu/internal/generator/main.go new file mode 100644 index 00000000000..8eafe865317 --- /dev/null +++ b/prover/gpu/internal/generator/main.go @@ -0,0 +1,42 @@ +// Command generator generates the typed per-curve GPU packages in gpu/plonk2. +// +// Run from this directory: +// +// go run . +package main + +import ( + "log" + "path/filepath" + "runtime" + + "github.com/consensys/linea-monorepo/prover/gpu/internal/generator/common" + "github.com/consensys/linea-monorepo/prover/gpu/internal/generator/config" + "github.com/consensys/linea-monorepo/prover/gpu/internal/generator/plonk" +) + +func main() { + // Resolve output base relative to this file's directory so the generator + // works correctly regardless of the working directory it is invoked from. + _, thisFile, _, _ := runtime.Caller(0) + thisDir := filepath.Dir(thisFile) + plonk2Dir := filepath.Join(thisDir, "..", "..", "plonk2") + + gen := common.New("gpu/internal/generator") + + curves := []config.Curve{ + config.BN254, + config.BLS12377, + config.BW6761, + } + + for _, curve := range curves { + outputDir := filepath.Join(plonk2Dir, curve.Package) + log.Printf("generating %s → %s", curve.Name, outputDir) + if err := plonk.Generate(curve, outputDir, gen); err != nil { + log.Fatalf("generate %s: %v", curve.Name, err) + } + } + + log.Println("done") +} diff --git a/prover/gpu/internal/generator/plonk/generate.go b/prover/gpu/internal/generator/plonk/generate.go new file mode 100644 index 00000000000..fe7db1552cf --- /dev/null +++ b/prover/gpu/internal/generator/plonk/generate.go @@ -0,0 +1,47 @@ +// Package plonk drives code generation for the gpu/plonk2 per-curve packages. +package plonk + +import ( + "path/filepath" + + "github.com/consensys/linea-monorepo/prover/gpu/internal/generator/common" + "github.com/consensys/linea-monorepo/prover/gpu/internal/generator/config" + tmpl "github.com/consensys/linea-monorepo/prover/gpu/internal/generator/plonk/template" +) + +// Generate renders all templates for curve c into outputDir. +func Generate(c config.Curve, outputDir string, gen *common.Generator) error { + entries := []struct { + file string + src string + }{ + {filepath.Join(outputDir, "doc.go"), tmpl.DocTemplate}, + {filepath.Join(outputDir, "cgo.go"), tmpl.CgoTemplate}, + // Phase 2: FrVector + {filepath.Join(outputDir, "fr.go"), tmpl.FrTemplate}, + {filepath.Join(outputDir, "fr_stub.go"), tmpl.FrStubTemplate}, + {filepath.Join(outputDir, "fr_test.go"), tmpl.FrTestTemplate}, + // Phase 3: FFTDomain + {filepath.Join(outputDir, "fft.go"), tmpl.FFTTemplate}, + {filepath.Join(outputDir, "fft_stub.go"), tmpl.FFTStubTemplate}, + {filepath.Join(outputDir, "fft_test.go"), tmpl.FFTTestTemplate}, + // Phase 4: MSM + {filepath.Join(outputDir, "msm.go"), tmpl.MSMTemplate}, + {filepath.Join(outputDir, "msm_stub.go"), tmpl.MSMStubTemplate}, + {filepath.Join(outputDir, "msm_test.go"), tmpl.MSMTestTemplate}, + // Phase 5a: GPU kernels + Prover + {filepath.Join(outputDir, "kernels.go"), tmpl.KernelsTemplate}, + {filepath.Join(outputDir, "kernels_stub.go"), tmpl.KernelsStubTemplate}, + {filepath.Join(outputDir, "pinned_fr.go"), tmpl.PinnedFrTemplate}, + {filepath.Join(outputDir, "prove.go"), tmpl.ProveTemplate}, + {filepath.Join(outputDir, "prove_stub.go"), tmpl.ProveStubTemplate}, + {filepath.Join(outputDir, "plonk_test.go"), tmpl.PlonkTestTemplate}, + } + + for _, e := range entries { + if err := gen.Execute(e.file, e.src, c); err != nil { + return err + } + } + return nil +} diff --git a/prover/gpu/internal/generator/plonk/template/cgo.go.tmpl b/prover/gpu/internal/generator/plonk/template/cgo.go.tmpl new file mode 100644 index 00000000000..030d0ff2c9e --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/cgo.go.tmpl @@ -0,0 +1,44 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#cgo LDFLAGS: -L${SRCDIR}/../../cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../../cuda/include + +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// curve returns the C curve identifier for {{.Name}}, baked in at generation time. +func curveID() C.gnark_gpu_plonk2_curve_id_t { + return C.gnark_gpu_plonk2_curve_id_t({{.CurveIndex}}) +} + +func devCtx(d *gpu.Device) C.gnark_gpu_context_t { + return C.gnark_gpu_context_t(d.Handle()) +} + +func toError(code C.gnark_gpu_error_t) error { + switch code { + case C.GNARK_GPU_SUCCESS: + return nil + case C.GNARK_GPU_ERROR_CUDA: + return &gpu.Error{Code: int(code), Message: "CUDA error"} + case C.GNARK_GPU_ERROR_INVALID_ARG: + return &gpu.Error{Code: int(code), Message: "invalid argument"} + case C.GNARK_GPU_ERROR_OUT_OF_MEMORY: + return &gpu.Error{Code: int(code), Message: "out of GPU memory"} + case C.GNARK_GPU_ERROR_SIZE_MISMATCH: + return &gpu.Error{Code: int(code), Message: "vector size mismatch"} + default: + return &gpu.Error{Code: int(code), Message: "unknown error"} + } +} diff --git a/prover/gpu/internal/generator/plonk/template/doc.go.tmpl b/prover/gpu/internal/generator/plonk/template/doc.go.tmpl new file mode 100644 index 00000000000..74181fa6ded --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/doc.go.tmpl @@ -0,0 +1,7 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +// Package {{.Package}} provides GPU-accelerated PlonK operations for the {{.Name}} curve. +// +// Generated from gpu/internal/generator. Do not edit by hand. +// Re-generate with: cd gpu/internal/generator && go run . +package {{.Package}} diff --git a/prover/gpu/internal/generator/plonk/template/fft.go.tmpl b/prover/gpu/internal/generator/plonk/template/fft.go.tmpl new file mode 100644 index 00000000000..29ad3918f01 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fft.go.tmpl @@ -0,0 +1,211 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "math/big" + "runtime" + "unsafe" + + fr "{{.GnarkCryptoFr}}" + "{{.GnarkCryptoFFT}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain holds GPU-resident twiddle factors for NTT operations over the +// {{.Name}} scalar field. +// +// All NTT operations accept an optional StreamID. When provided, the operation +// is dispatched on that CUDA stream (non-blocking). When omitted, the default +// stream (stream 0) is used. +type GPUFFTDomain struct { + handle C.gnark_gpu_plonk2_ntt_domain_t + dev *gpu.Device + size int +} + +// NewFFTDomain creates a GPU NTT domain of the given size (must be a power of 2). +// +// Twiddle factors are computed using gnark-crypto's fft.Domain, then uploaded +// to GPU in AoS format. This is a one-time cost per domain size. +func NewFFTDomain(dev *gpu.Device, size int) (*GPUFFTDomain, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if size <= 0 || (size&(size-1)) != 0 { + return nil, &gpu.Error{Code: -1, Message: "size must be a positive power of 2"} + } + + domain := fft.NewDomain(uint64(size)) + halfN := size / 2 + + fwdTwiddles := make([]fr.Element, halfN) + invTwiddles := make([]fr.Element, halfN) + if halfN > 0 { + fwdTwiddles[0].SetOne() + invTwiddles[0].SetOne() + for i := 1; i < halfN; i++ { + fwdTwiddles[i].Mul(&fwdTwiddles[i-1], &domain.Generator) + invTwiddles[i].Mul(&invTwiddles[i-1], &domain.GeneratorInv) + } + } + + invN := domain.CardinalityInv + + var fwdPtr, invPtr *C.uint64_t + if halfN > 0 { + fwdPtr = (*C.uint64_t)(unsafe.Pointer(&fwdTwiddles[0])) + invPtr = (*C.uint64_t)(unsafe.Pointer(&invTwiddles[0])) + } + + var handle C.gnark_gpu_plonk2_ntt_domain_t + if err := toError(C.gnark_gpu_plonk2_ntt_domain_create( + devCtx(dev), + curveID(), + C.size_t(size), + fwdPtr, + invPtr, + (*C.uint64_t)(unsafe.Pointer(&invN)), + &handle, + )); err != nil { + return nil, err + } + + dom := &GPUFFTDomain{handle: handle, dev: dev, size: size} + runtime.SetFinalizer(dom, (*GPUFFTDomain).Close) + return dom, nil +} + +// Size returns the domain size. +func (f *GPUFFTDomain) Size() int { return f.size } + +// Close releases GPU resources. Safe to call multiple times. +func (f *GPUFFTDomain) Close() { + if f.handle != nil { + C.gnark_gpu_plonk2_ntt_domain_destroy(f.handle) + f.handle = nil + runtime.SetFinalizer(f, nil) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Forward / Inverse FFT +// ───────────────────────────────────────────────────────────────────────────── + +// FFT performs a forward NTT (DIF): natural-order input → bit-reversed output. +func (f *GPUFFTDomain) FFT(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFT size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_forward_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_forward(f.handle, v.handle)); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } +} + +// FFTInverse performs an inverse NTT (DIT): bit-reversed input → natural-order output. +// The result is scaled by 1/n. +func (f *GPUFFTDomain) FFTInverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFTInverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_inverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_inverse(f.handle, v.handle)); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } +} + +// BitReverse applies the bit-reversal permutation. +func (f *GPUFFTDomain) BitReverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: BitReverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse(f.handle, v.handle)); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Coset FFT +// +// CosetFFT evaluates p(X) on coset g·H = {g·ω^i : i=0..n-1}. +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// ───────────────────────────────────────────────────────────────────────────── + +// CosetFFT evaluates a polynomial in canonical form on coset g·H. +// Input: v holds canonical coefficients in natural order. +// Output: v holds p(g·ω⁰), p(g·ω¹), …, p(g·ωⁿ⁻¹) in natural order. +// +// Implemented as: ScaleByPowers(g) → FFT → BitReverse. +func (f *GPUFFTDomain) CosetFFT(v *FrVector, g fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFT size mismatch") + } + v.ScaleByPowers(g, stream...) + f.FFT(v, stream...) + f.BitReverse(v, stream...) +} + +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// gInv must be the inverse of the coset generator g. +// +// Implemented as: BitReverse → FFTInverse → ScaleByPowers(gInv). +func (f *GPUFFTDomain) CosetFFTInverse(v *FrVector, gInv fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFTInverse size mismatch") + } + f.BitReverse(v, stream...) + f.FFTInverse(v, stream...) + v.ScaleByPowers(gInv, stream...) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Butterfly4Inverse — decomposed iFFT(4n) for quotient computation +// ───────────────────────────────────────────────────────────────────────────── + +// Butterfly4Inverse applies a size-4 inverse DFT butterfly across 4 FrVectors. +// +// omega4Inv: inverse of the primitive 4th root of unity. +// quarter: 1/4 in Montgomery form. +func Butterfly4Inverse(b0, b1, b2, b3 *FrVector, omega4Inv, quarter fr.Element) { + if b0.n != b1.n || b1.n != b2.n || b2.n != b3.n { + panic("gpu: Butterfly4Inverse size mismatch") + } + if b0.dev != b1.dev || b1.dev != b2.dev || b2.dev != b3.dev { + panic("gpu: Butterfly4Inverse device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_butterfly4_inverse( + devCtx(b0.dev), + b0.handle, b1.handle, b2.handle, b3.handle, + (*C.uint64_t)(unsafe.Pointer(&omega4Inv)), + (*C.uint64_t)(unsafe.Pointer(&quarter)), + )); err != nil { + panic("gpu: Butterfly4Inverse failed: " + err.Error()) + } +} + +// ─── suppress unused import ─────────────────────────────────────────────────── +var _ = big.NewInt diff --git a/prover/gpu/internal/generator/plonk/template/fft_stub.go.tmpl b/prover/gpu/internal/generator/plonk/template/fft_stub.go.tmpl new file mode 100644 index 00000000000..72f12f2d965 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fft_stub.go.tmpl @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package {{.Package}} + +import ( + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain is a stub for non-CUDA builds. +type GPUFFTDomain struct{} + +func NewFFTDomain(_ *gpu.Device, _ int) (*GPUFFTDomain, error) { + return nil, gpu.ErrDeviceClosed +} + +func (f *GPUFFTDomain) Size() int { return 0 } +func (f *GPUFFTDomain) Close() {} +func (f *GPUFFTDomain) FFT(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) FFTInverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) BitReverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFT(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFTInverse(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} + +func Butterfly4Inverse(_, _, _, _ *FrVector, _, _ fr.Element) { panic("gpu: cuda required") } diff --git a/prover/gpu/internal/generator/plonk/template/fft_test.go.tmpl b/prover/gpu/internal/generator/plonk/template/fft_test.go.tmpl new file mode 100644 index 00000000000..75e4a66eb2b --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fft_test.go.tmpl @@ -0,0 +1,188 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}}_test + +import ( + "fmt" + "testing" + + "{{.GnarkCryptoFFT}}" + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/{{.Package}}" + "github.com/stretchr/testify/require" +) + +func newDomain(t testing.TB, dev *gpu.Device, size int) *{{.Package}}.GPUFFTDomain { + t.Helper() + dom, err := {{.Package}}.NewFFTDomain(dev, size) + require.NoError(t, err) + t.Cleanup(func() { dom.Close() }) + return dom +} + +// TestFFTRoundtrip verifies FFT(FFTInverse(v)) == v. +func TestFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16, 20} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + gV := newGPUVec(t, dev, orig) + + dom.FFT(gV) + dom.FFTInverse(gV) + dom.BitReverse(gV) // FFTInverse expects bit-reversed input; FFT output is bit-reversed + dev.Sync() + + // Actually test FFTInverse(FFT(v)) == v: + // FFT: natural → bit-reversed + // FFTInverse: bit-reversed → natural (scaled by 1/n) + // So we need FFTInverse after FFT directly. + gV2 := newGPUVec(t, dev, orig) + dom.FFT(gV2) + dom.FFTInverse(gV2) + dev.Sync() + + result := make(fr.Vector, n) + gV2.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "FFTInverse(FFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestCosetFFTRoundtrip verifies CosetFFT(CosetFFTInverse(v)) == v. +func TestCosetFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + var gInv fr.Element + gInv.Inverse(&g) + + gV := newGPUVec(t, dev, orig) + dom.CosetFFT(gV, g) + dom.CosetFFTInverse(gV, gInv) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "CosetFFTInverse(CosetFFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestFFTMatchesCPU verifies GPU FFT output matches gnark-crypto CPU FFT. +func TestFFTMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const logN = 14 + n := 1 << logN + + dom := newDomain(t, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + + orig := randFrVec(n) + cpuCopy := make(fr.Vector, n) + copy(cpuCopy, orig) + + // CPU FFT + cpuDom.FFT(cpuCopy, fft.DIF) + fft.BitReverse(cpuCopy) + + // GPU FFT (DIF: natural → bit-reversed, then BitReverse → natural) + gV := newGPUVec(t, dev, orig) + dom.FFT(gV) // natural → bit-reversed + dom.BitReverse(gV) // bit-reversed → natural + dev.Sync() + + gpuResult := make(fr.Vector, n) + gV.CopyToHost(gpuResult) + + for i := range cpuCopy { + require.True(t, cpuCopy[i].Equal(&gpuResult[i]), + "FFT mismatch at i=%d", i) + } +} + +// BenchmarkFFTForward benchmarks GPU forward NTT. +func BenchmarkFFTForward(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFT(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkFFTInverse benchmarks GPU inverse NTT. +func BenchmarkFFTInverse(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + dom.FFT(gV) // put into bit-reversed form first + dev.Sync() + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFTInverse(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkCosetFFT benchmarks GPU coset FFT. +func BenchmarkCosetFFT(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Re-upload canonical coefficients before each run + gV.CopyFromHost(src) + dom.CosetFFT(gV, g) + dev.Sync() + } + }) + } +} diff --git a/prover/gpu/internal/generator/plonk/template/fr.go.tmpl b/prover/gpu/internal/generator/plonk/template/fr.go.tmpl new file mode 100644 index 00000000000..4af521306c1 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fr.go.tmpl @@ -0,0 +1,270 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "runtime" + "sync" + "unsafe" + + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector holds a vector of {{.Name}} scalar-field (Fr) elements on the GPU +// in Structure-of-Arrays (SoA) layout for coalesced memory access. +// +// All elements are in Montgomery form. GPU memory is SoA by limb; host memory +// uses gnark-crypto AoS Montgomery layout. +// +// All operations accept an optional gpu.StreamID. When omitted, the default +// stream (stream 0) is used. +type FrVector struct { + handle C.gnark_gpu_plonk2_fr_vector_t + dev *gpu.Device + n int +} + +var hostTransferMu sync.Mutex + +// NewFrVector allocates GPU memory for n Fr elements on dev. +// A finalizer is installed; call Free for deterministic VRAM release. +func NewFrVector(dev *gpu.Device, n int) (*FrVector, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if n <= 0 { + return nil, &gpu.Error{Code: -1, Message: "count must be positive"} + } + + var handle C.gnark_gpu_plonk2_fr_vector_t + if err := toError(C.gnark_gpu_plonk2_fr_vector_alloc( + devCtx(dev), curveID(), C.size_t(n), &handle, + )); err != nil { + return nil, err + } + + v := &FrVector{handle: handle, dev: dev, n: n} + runtime.SetFinalizer(v, (*FrVector).Free) + return v, nil +} + +// Free releases GPU memory. Safe to call multiple times. +func (v *FrVector) Free() { + if v.handle != nil { + v.bind() + C.gnark_gpu_plonk2_fr_vector_free(v.handle) + v.handle = nil + runtime.SetFinalizer(v, nil) + } +} + +// Len returns the number of elements. +func (v *FrVector) Len() int { return v.n } + +func (v *FrVector) bind() { + if err := v.dev.Bind(); err != nil { + panic("gpu: bind device failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Host ↔ Device transfers +// ───────────────────────────────────────────────────────────────────────────── + +// CopyFromHost copies host data (AoS) to GPU (SoA). Panics on size mismatch. +func (v *FrVector) CopyFromHost(src fr.Vector, _ ...gpu.StreamID) { + if len(src) != v.n { + panic("gpu: CopyFromHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_device( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&src[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyFromHost failed: " + err.Error()) + } +} + +// CopyToHost copies GPU data (SoA) back to host (AoS). Panics on size mismatch. +func (v *FrVector) CopyToHost(dst fr.Vector, _ ...gpu.StreamID) { + if len(dst) != v.n { + panic("gpu: CopyToHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_host( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&dst[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyToHost failed: " + err.Error()) + } +} + +// CopyFromDevice copies src to v (GPU-to-GPU). Panics on size or device mismatch. +func (v *FrVector) CopyFromDevice(src *FrVector, _ ...gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDevice size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDevice device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d( + devCtx(v.dev), v.handle, src.handle, + )); err != nil { + panic("gpu: CopyFromDevice failed: " + err.Error()) + } +} + +// CopyFromDeviceStream copies src to v (GPU-to-GPU) on a specific stream. +// Panics on size or device mismatch. +func (v *FrVector) CopyFromDeviceStream(src *FrVector, streamID gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDeviceStream size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDeviceStream device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d_stream( + devCtx(v.dev), v.handle, src.handle, C.int(streamID), + )); err != nil { + panic("gpu: CopyFromDeviceStream failed: " + err.Error()) + } +} + +// SetZero sets all elements to zero. +func (v *FrVector) SetZero(_ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_set_zero( + devCtx(v.dev), v.handle, + )); err != nil { + panic("gpu: SetZero failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Element-wise arithmetic (all async on the default stream) +// ───────────────────────────────────────────────────────────────────────────── + +func mustSameDeviceAndSize(v, a, b *FrVector) { + if v.n != a.n || a.n != b.n { + panic("gpu: vector size mismatch") + } + if v.dev != a.dev || a.dev != b.dev { + panic("gpu: vectors from different devices") + } +} + +// Mul computes v[i] = a[i] · b[i] (mod r). +func (v *FrVector) Mul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_mul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Mul failed: " + err.Error()) + } +} + +// Add computes v[i] = a[i] + b[i] (mod r). +func (v *FrVector) Add(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_add( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Add failed: " + err.Error()) + } +} + +// Sub computes v[i] = a[i] - b[i] (mod r). +func (v *FrVector) Sub(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_sub( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Sub failed: " + err.Error()) + } +} + +// AddMul computes v[i] += a[i] · b[i] (mod r). +func (v *FrVector) AddMul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_addmul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: AddMul failed: " + err.Error()) + } +} + +// AddScalarMul computes v[i] += a[i] · scalar (mod r). +func (v *FrVector) AddScalarMul(a *FrVector, scalar fr.Element, _ ...gpu.StreamID) { + if v.n != a.n { + panic("gpu: AddScalarMul size mismatch") + } + if v.dev != a.dev { + panic("gpu: AddScalarMul device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_add_scalar_mul( + devCtx(v.dev), v.handle, a.handle, + (*C.uint64_t)(unsafe.Pointer(&scalar)), + )); err != nil { + panic("gpu: AddScalarMul failed: " + err.Error()) + } +} + +// ScalarMul computes v[i] *= c (mod r) for all i. +func (v *FrVector) ScalarMul(c fr.Element, _ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scalar_mul( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&c)), + )); err != nil { + panic("gpu: ScalarMul failed: " + err.Error()) + } +} + +// ScaleByPowers computes v[i] *= g^i for i in [0, n). +// Used for coset FFT shifting. +func (v *FrVector) ScaleByPowers(g fr.Element, streams ...gpu.StreamID) { + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers_stream( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + C.int(streams[0]), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } +} + +// BatchInvert computes v[i] = 1/v[i] using Montgomery batch inversion. +// temp must be a separate FrVector of the same size used as scratch space. +func (v *FrVector) BatchInvert(temp *FrVector, _ ...gpu.StreamID) { + if v.n != temp.n { + panic("gpu: BatchInvert size mismatch") + } + if v.dev != temp.dev { + panic("gpu: BatchInvert device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_batch_invert( + devCtx(v.dev), v.handle, temp.handle, + )); err != nil { + panic("gpu: BatchInvert failed: " + err.Error()) + } +} diff --git a/prover/gpu/internal/generator/plonk/template/fr_stub.go.tmpl b/prover/gpu/internal/generator/plonk/template/fr_stub.go.tmpl new file mode 100644 index 00000000000..08f6e7be1d7 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fr_stub.go.tmpl @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package {{.Package}} + +import ( + "errors" + + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector is a stub for non-CUDA builds. +type FrVector struct{} + +func NewFrVector(_ *gpu.Device, _ int) (*FrVector, error) { + return nil, errors.New("gpu: cuda required") +} + +func (v *FrVector) Free() {} +func (v *FrVector) Len() int { return 0 } +func (v *FrVector) CopyFromHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyToHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDevice(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDeviceStream(_ *FrVector, _ gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) SetZero(_ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Mul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Add(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Sub(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddMul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddScalarMul(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (v *FrVector) ScalarMul(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) ScaleByPowers(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) BatchInvert(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } diff --git a/prover/gpu/internal/generator/plonk/template/fr_test.go.tmpl b/prover/gpu/internal/generator/plonk/template/fr_test.go.tmpl new file mode 100644 index 00000000000..b99fc8d1781 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fr_test.go.tmpl @@ -0,0 +1,275 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}}_test + +import ( + "fmt" + "testing" + + "github.com/leanovate/gopter" + "github.com/leanovate/gopter/prop" + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/{{.Package}}" + "github.com/stretchr/testify/require" +) + +func requireGPUDev(t testing.TB) *gpu.Device { + t.Helper() + dev, err := gpu.New() + require.NoError(t, err) + t.Cleanup(func() { dev.Close() }) + return dev +} + +func genFrElem() gopter.Gen { + return func(_ *gopter.GenParameters) *gopter.GenResult { + var e fr.Element + e.MustSetRandom() + return gopter.NewGenResult(e, gopter.NoShrinker) + } +} + +func randFrVec(n int) fr.Vector { + v := make(fr.Vector, n) + for i := range v { + v[i].MustSetRandom() + } + return v +} + +func newGPUVec(t testing.TB, dev *gpu.Device, data fr.Vector) *{{.Package}}.FrVector { + t.Helper() + gv, err := {{.Package}}.NewFrVector(dev, len(data)) + require.NoError(t, err) + t.Cleanup(func() { gv.Free() }) + gv.CopyFromHost(data) + dev.Sync() + return gv +} + +// TestFrVectorRoundtrip verifies CopyFromHost → CopyToHost is identity. +func TestFrVectorRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + const n = 1024 + src := randFrVec(n) + gv := newGPUVec(t, dev, src) + dst := make(fr.Vector, n) + gv.CopyToHost(dst) + for i := range src { + require.True(t, src[i].Equal(&dst[i]), "mismatch at %d", i) + } +} + +// TestFrVectorAddCommutative checks GPU Add(a,b) == GPU Add(b,a). +func TestFrVectorAddCommutative(t *testing.T) { + dev := requireGPUDev(t) + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 50 + properties := gopter.NewProperties(parameters) + + properties.Property("Add is commutative", prop.ForAll( + func(a, b fr.Element) bool { + n := 16 + aVec := make(fr.Vector, n) + bVec := make(fr.Vector, n) + for i := range aVec { + aVec[i] = a + bVec[i] = b + } + + gA, err := {{.Package}}.NewFrVector(dev, n) + if err != nil { + return false + } + gB, _ := {{.Package}}.NewFrVector(dev, n) + gAB, _ := {{.Package}}.NewFrVector(dev, n) + gBA, _ := {{.Package}}.NewFrVector(dev, n) + defer gA.Free() + defer gB.Free() + defer gAB.Free() + defer gBA.Free() + + gA.CopyFromHost(aVec) + gB.CopyFromHost(bVec) + gAB.Add(gA, gB) + gBA.Add(gB, gA) + dev.Sync() + + ab := make(fr.Vector, n) + ba := make(fr.Vector, n) + gAB.CopyToHost(ab) + gBA.CopyToHost(ba) + for i := range ab { + if !ab[i].Equal(&ba[i]) { + return false + } + } + return true + }, + genFrElem(), genFrElem(), + )) + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + +// TestFrVectorBatchInvert verifies v[i] * inv(v[i]) == 1. +func TestFrVectorBatchInvert(t *testing.T) { + dev := requireGPUDev(t) + const n = 256 + + orig := make(fr.Vector, n) + for i := range orig { + orig[i].MustSetRandom() + if orig[i].IsZero() { + orig[i].SetOne() + } + } + + gV := newGPUVec(t, dev, orig) + gTemp, err := {{.Package}}.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + inv := make(fr.Vector, n) + gV.CopyToHost(inv) + + var one fr.Element + one.SetOne() + for i := range orig { + var product fr.Element + product.Mul(&orig[i], &inv[i]) + require.True(t, product.Equal(&one), "BatchInvert: v[%d]*inv[%d] != 1", i, i) + } +} + +// TestFrVectorScaleByPowers checks GPU ScaleByPowers matches CPU loop. +func TestFrVectorScaleByPowers(t *testing.T) { + dev := requireGPUDev(t) + const n = 512 + + var omega fr.Element + omega.MustSetRandom() + + ones := make(fr.Vector, n) + for i := range ones { + ones[i].SetOne() + } + + gV := newGPUVec(t, dev, ones) + gV.ScaleByPowers(omega) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + + expected := make(fr.Vector, n) + expected[0].SetOne() + for i := 1; i < n; i++ { + expected[i].Mul(&expected[i-1], &omega) + } + + for i := range result { + require.True(t, result[i].Equal(&expected[i]), "ScaleByPowers mismatch at %d", i) + } +} + +// TestFrVectorBatchInvertMatchesCPU verifies BatchInvert matches scalar CPU inversion. +func TestFrVectorBatchInvertMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const n = 128 + + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + + cpuInv := make(fr.Vector, n) + for i := range src { + cpuInv[i].Inverse(&src[i]) + } + + gV := newGPUVec(t, dev, src) + gTemp, err := {{.Package}}.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + gpuInv := make(fr.Vector, n) + gV.CopyToHost(gpuInv) + + for i := range cpuInv { + require.True(t, cpuInv[i].Equal(&gpuInv[i]), + "BatchInvert mismatch at %d", i) + } +} + +// BenchmarkFrVectorAdd benchmarks GPU element-wise addition. +func BenchmarkFrVectorAdd(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20, 1 << 22} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + gA := newGPUVec(b, dev, src) + gB := newGPUVec(b, dev, src) + gC, _ := {{.Package}}.NewFrVector(dev, n) + defer gC.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gC.Add(gA, gB) + dev.Sync() + } + }) + } +} + +// BenchmarkFrVectorBatchInvert benchmarks GPU batch inversion. +func BenchmarkFrVectorBatchInvert(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + gV := newGPUVec(b, dev, src) + gTemp, _ := {{.Package}}.NewFrVector(dev, n) + defer gTemp.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gV.CopyFromHost(src) + gV.BatchInvert(gTemp) + dev.Sync() + } + }) + } +} + +func fmtPow2(n int) string { + switch { + case n >= 1<<20: + return fmt.Sprintf("%dM", n>>20) + case n >= 1<<10: + return fmt.Sprintf("%dK", n>>10) + default: + return fmt.Sprintf("%d", n) + } +} diff --git a/prover/gpu/internal/generator/plonk/template/kernels.go.tmpl b/prover/gpu/internal/generator/plonk/template/kernels.go.tmpl new file mode 100644 index 00000000000..7173105feba --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/kernels.go.tmpl @@ -0,0 +1,316 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "math/big" + "runtime" + "sync" + "unsafe" + + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// ZPrefixProduct computes Z[i] = product(ratio[0..i-1]) on GPU with CPU chunk scan. +func ZPrefixProduct(dev *gpu.Device, zVec, ratioVec, tempVec *FrVector) { + if zVec.n != ratioVec.n || zVec.n != tempVec.n { + panic("gpu: ZPrefixProduct size mismatch") + } + n := ratioVec.n + maxChunks := (n + 1023) / 1024 + cpHost := make([]uint64, maxChunks*{{.FrLimbs}}) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase1( + devCtx(dev), zVec.handle, ratioVec.handle, + (*C.uint64_t)(unsafe.Pointer(&cpHost[0])), &numChunks, + )); err != nil { + panic("gpu: ZPrefixProduct phase1 failed: " + err.Error()) + } + + nc := int(numChunks) + spHost := make([]uint64, nc*{{.FrLimbs}}) + copy(spHost[:{{.FrLimbs}}], cpHost[:{{.FrLimbs}}]) + for i := 1; i < nc; i++ { + prev := *(*fr.Element)(unsafe.Pointer(&spHost[(i-1)*{{.FrLimbs}}])) + cur := *(*fr.Element)(unsafe.Pointer(&cpHost[i*{{.FrLimbs}}])) + var prod fr.Element + prod.Mul(&prev, &cur) + *(*fr.Element)(unsafe.Pointer(&spHost[i*{{.FrLimbs}}])) = prod + } + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase3( + devCtx(dev), zVec.handle, tempVec.handle, + (*C.uint64_t)(unsafe.Pointer(&spHost[0])), C.size_t(nc), + )); err != nil { + panic("gpu: ZPrefixProduct phase3 failed: " + err.Error()) + } +} + +// PlonkZComputeFactors computes per-element Z ratio factors on GPU. +// On exit L contains numerators, R contains denominators. +func PlonkZComputeFactors( + L, R, O *FrVector, dPerm unsafe.Pointer, + beta, gamma, gMul, gSq fr.Element, + log2n uint, domain *GPUFFTDomain, +) { + n := L.n + if R.n != n || O.n != n || domain.size != n { + panic("gpu: PlonkZComputeFactors size mismatch") + } + params := [4]fr.Element{beta, gamma, gMul, gSq} + if err := toError(C.gnark_gpu_plonk2_z_compute_factors( + devCtx(L.dev), L.handle, R.handle, O.handle, + dPerm, (*C.uint64_t)(unsafe.Pointer(¶ms[0])), + C.uint(log2n), domain.handle, + )); err != nil { + panic("gpu: PlonkZComputeFactors failed: " + err.Error()) + } +} + +// PlonkGateAccum computes the fused gate constraint accumulation. +func PlonkGateAccum(result, Ql, Qr, Qm, Qo, Qk, L, R, O *FrVector, zhKInv fr.Element) { + n := result.n + if Ql.n != n || Qr.n != n || Qm.n != n || Qo.n != n || Qk.n != n || + L.n != n || R.n != n || O.n != n { + panic("gpu: PlonkGateAccum size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_gate_accum( + devCtx(result.dev), + result.handle, Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + L.handle, R.handle, O.handle, + (*C.uint64_t)(unsafe.Pointer(&zhKInv)), + )); err != nil { + panic("gpu: PlonkGateAccum failed: " + err.Error()) + } +} + +// PlonkLinearizeStatic computes the fixed-selector part of the linearized polynomial. +func PlonkLinearizeStatic( + result, Z, S3, Ql, Qr, Qm, Qo, Qk *FrVector, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta fr.Element, +) { + n := result.n + if Z.n != n || S3.n != n || Ql.n != n || Qr.n != n || Qm.n != n || + Qo.n != n || Qk.n != n { + panic("gpu: PlonkLinearizeStatic size mismatch") + } + scalars := [6]fr.Element{combinedZCoeff, s1, lZeta, rZeta, rl, oZeta} + if err := toError(C.gnark_gpu_plonk2_linearize_static( + devCtx(result.dev), + result.handle, Z.handle, S3.handle, + Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + (*C.uint64_t)(unsafe.Pointer(&scalars[0])), + )); err != nil { + panic("gpu: PlonkLinearizeStatic failed: " + err.Error()) + } +} + +// PlonkPermBoundary computes the fused permutation + boundary constraint. +func PlonkPermBoundary( + result, L, R, O, Z, S1, S2, S3, L1DenInv *FrVector, + alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen fr.Element, + domain *GPUFFTDomain, _ ...gpu.StreamID, +) { + n := result.n + if L.n != n || R.n != n || O.n != n || Z.n != n || + S1.n != n || S2.n != n || S3.n != n || L1DenInv.n != n || domain.size != n { + panic("gpu: PlonkPermBoundary size mismatch") + } + params := [7]fr.Element{alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen} + if err := toError(C.gnark_gpu_plonk2_perm_boundary( + devCtx(result.dev), + result.handle, L.handle, R.handle, O.handle, Z.handle, + S1.handle, S2.handle, S3.handle, L1DenInv.handle, + (*C.uint64_t)(unsafe.Pointer(¶ms[0])), domain.handle, + )); err != nil { + panic("gpu: PlonkPermBoundary failed: " + err.Error()) + } +} + +// ComputeL1Den computes out[i] = cosetGen·ω^i - 1 for all i. +func ComputeL1Den(out *FrVector, cosetGen fr.Element, domain *GPUFFTDomain, _ ...gpu.StreamID) { + if domain.size != out.n { + panic("gpu: ComputeL1Den domain size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_compute_l1_den( + domain.handle, out.handle, + (*C.uint64_t)(unsafe.Pointer(&cosetGen)), + )); err != nil { + panic("gpu: ComputeL1Den failed: " + err.Error()) + } +} + +// ReduceBlindedCoset reduces a blinded polynomial for coset evaluation on GPU. +func ReduceBlindedCoset(dst, src *FrVector, tail []fr.Element, cosetPowN fr.Element) { + if dst.n != src.n { + panic("gpu: ReduceBlindedCoset size mismatch") + } + var tailPtr *C.uint64_t + if len(tail) > 0 { + tailPtr = (*C.uint64_t)(unsafe.Pointer(&tail[0])) + } + if err := toError(C.gnark_gpu_plonk2_reduce_blinded_coset( + devCtx(dst.dev), dst.handle, src.handle, + tailPtr, C.size_t(len(tail)), + (*C.uint64_t)(unsafe.Pointer(&cosetPowN)), + )); err != nil { + panic("gpu: ReduceBlindedCoset failed: " + err.Error()) + } +} + +// SubtractBlindingHead subtracts tail[i] from v[i] for the blinding tail. +func SubtractBlindingHead(v *FrVector, tail []fr.Element) { + if len(tail) == 0 { + return + } + if len(tail) > v.n { + panic("gpu: SubtractBlindingHead size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_subtract_head( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&tail[0])), + C.size_t(len(tail)), + )); err != nil { + panic("gpu: SubtractBlindingHead failed: " + err.Error()) + } +} + +// DeviceAllocCopyInt64 uploads an int64 slice to GPU device memory. +func DeviceAllocCopyInt64(dev *gpu.Device, data []int64) (unsafe.Pointer, error) { + var dPtr unsafe.Pointer + if err := toError(C.gnark_gpu_device_alloc_copy_int64( + devCtx(dev), + (*C.int64_t)(unsafe.Pointer(&data[0])), + C.size_t(len(data)), + &dPtr, + )); err != nil { + return nil, err + } + return dPtr, nil +} + +// DeviceFreePtr frees device memory allocated by DeviceAllocCopyInt64. +func DeviceFreePtr(ptr unsafe.Pointer) { + if ptr != nil { + C.gnark_gpu_device_free_ptr(ptr) + } +} + +// PolyEvalGPU evaluates a GPU-resident polynomial at z using chunked Horner on +// device and a small CPU combine over chunk partials. +func PolyEvalGPU(dev *gpu.Device, v *FrVector, z fr.Element) fr.Element { + n := v.n + if n == 0 { + return fr.Element{} + } + + maxChunks := (n + 1023) / 1024 + partialsHost := make([]uint64, maxChunks*{{.FrLimbs}}) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_poly_eval_chunks( + devCtx(dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&z)), + (*C.uint64_t)(unsafe.Pointer(&partialsHost[0])), + &numChunks, + )); err != nil { + panic("gpu: PolyEvalGPU failed: " + err.Error()) + } + + return combinePolyEvalPartials(partialsHost, int(numChunks), z) +} + +// PolyEvalFromDevice downloads a GPU FrVector and evaluates at z using CPU Horner. +func PolyEvalFromDevice(v *FrVector, z fr.Element) fr.Element { + n := v.n + coeffs := make(fr.Vector, n) + v.CopyToHost(coeffs) + return polyEvalParallel(coeffs, z) +} + +func combinePolyEvalPartials(partialsHost []uint64, numChunks int, z fr.Element) fr.Element { + if numChunks == 0 { + return fr.Element{} + } + readPartial := func(chunk int) fr.Element { + var r fr.Element + for limb := range r { + r[limb] = partialsHost[chunk*{{.FrLimbs}}+limb] + } + return r + } + if numChunks == 1 { + return readPartial(0) + } + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(1024)) + result := readPartial(numChunks - 1) + for j := numChunks - 2; j >= 0; j-- { + p := readPartial(j) + result.Mul(&result, &zChunk).Add(&result, &p) + } + return result +} + +// polyEvalParallel evaluates p(z) = Σ c[i]·z^i using multi-core Horner. +func polyEvalParallel(coeffs []fr.Element, z fr.Element) fr.Element { + n := len(coeffs) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + return hornerEval(coeffs, z) + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + partials := make([]fr.Element, numChunks) + var wg sync.WaitGroup + for c := range numChunks { + start := c * chunkSize + if start >= n { + break + } + end := start + chunkSize + if end > n { + end = n + } + wg.Add(1) + go func(idx, s, e int) { + defer wg.Done() + partials[idx] = hornerEval(coeffs[s:e], z) + }(c, start, end) + } + wg.Wait() + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(int64(chunkSize))) + var result, zPow fr.Element + zPow.SetOne() + for c := range numChunks { + if c*chunkSize >= n { + break + } + var t fr.Element + t.Mul(&partials[c], &zPow) + result.Add(&result, &t) + zPow.Mul(&zPow, &zChunk) + } + return result +} + +func hornerEval(coeffs []fr.Element, z fr.Element) fr.Element { + var r fr.Element + for i := len(coeffs) - 1; i >= 0; i-- { + r.Mul(&r, &z).Add(&r, &coeffs[i]) + } + return r +} diff --git a/prover/gpu/internal/generator/plonk/template/kernels_stub.go.tmpl b/prover/gpu/internal/generator/plonk/template/kernels_stub.go.tmpl new file mode 100644 index 00000000000..da216570521 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/kernels_stub.go.tmpl @@ -0,0 +1,36 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package {{.Package}} + +import ( + "errors" + "unsafe" + + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +func ZPrefixProduct(_ *gpu.Device, _, _, _ *FrVector) { panic("gpu: cuda required") } +func PlonkZComputeFactors(_, _, _ *FrVector, _ unsafe.Pointer, _, _, _, _ fr.Element, _ uint, _ *GPUFFTDomain) { + panic("gpu: cuda required") +} +func PlonkGateAccum(_, _, _, _, _, _, _, _, _ *FrVector, _ fr.Element) { panic("gpu: cuda required") } +func PlonkPermBoundary(_, _, _, _, _, _, _, _, _ *FrVector, _, _, _, _, _, _, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ComputeL1Den(_ *FrVector, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ReduceBlindedCoset(_, _ *FrVector, _ []fr.Element, _ fr.Element) { panic("gpu: cuda required") } +func DeviceAllocCopyInt64(_ *gpu.Device, _ []int64) (unsafe.Pointer, error) { + return nil, errors.New("gpu: cuda required") +} +func DeviceFreePtr(_ unsafe.Pointer) {} +func PolyEvalGPU(_ *gpu.Device, _ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} +func PolyEvalFromDevice(_ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} diff --git a/prover/gpu/internal/generator/plonk/template/msm.go.tmpl b/prover/gpu/internal/generator/plonk/template/msm.go.tmpl new file mode 100644 index 00000000000..20a1c416b7c --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/msm.go.tmpl @@ -0,0 +1,407 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "fmt" + "log" + "math/big" + "os" + "runtime" + "strconv" + "unsafe" + + curve "{{.GnarkCurve}}" + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// frRInv is R^{-1} mod r where R = 2^{FrLimbs*64} (the Fr Montgomery constant). +// The GPU MSM uses Montgomery-form scalars without fr_from_mont, so the result +// is R * correct_result. Multiplying by frRInv corrects this. +var frRInv big.Int + +func init() { + var rInv fr.Element + rInv[0] = 1 // Montgomery representation of R^{-1}: stores R^{-1} mod r + rInv.BigInt(&frRInv) +} + +// msmDefaultWindowBits selects the Pippenger window size for n points. +func msmDefaultWindowBits(n int) int { +{{- if eq .Package "bw6761" }} + switch { + case n >= 1<<22: + return 18 + case n > 1<<18: + return 15 + case n > 1<<12: + return 13 + default: + return 11 + } +{{- else }} + switch { + case n > 1<<26: +{{- if eq .Package "bls12377" }} + return 20 +{{- else }} + return 19 +{{- end }} + case n > 1<<22: + return 17 + case n > 1<<18: + return 15 + case n > 1<<12: + return 13 + default: + return 11 + } +{{- end }} +} + +// G1MSM holds a GPU MSM context with uploaded affine base points. +// +// Points are uploaded once at construction. The context supports multiple +// MultiExp calls sharing the same base points. +type G1MSM struct { + handle C.gnark_gpu_plonk2_msm_t + dev *gpu.Device + n int + windowBits int + hostPoints []curve.G1Affine + hostPointsPtr unsafe.Pointer + lastBatchPhaseTimings [][9]float32 +} + +// NewG1MSM creates a G1MSM context by uploading affine points to the GPU. +// window_bits=0 selects a default based on point count. +func NewG1MSM(dev *gpu.Device, points []curve.G1Affine, windowBits int) (*G1MSM, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if err := dev.Bind(); err != nil { + return nil, err + } + n := len(points) + if n == 0 { + return nil, &gpu.Error{Code: -1, Message: "points must not be empty"} + } + if windowBits == 0 { + windowBits = msmDefaultWindowBits(n) + } + if override := os.Getenv("GNARK_GPU_PLONK2_MSM_WINDOW_BITS"); override != "" { + parsed, err := strconv.Atoi(override) + if err != nil { + return nil, fmt.Errorf("gpu: invalid GNARK_GPU_PLONK2_MSM_WINDOW_BITS %q: %w", override, err) + } + windowBits = parsed + } + if windowBits < 2 || windowBits > 24 { + return nil, fmt.Errorf("gpu: window bits must be in [2,24], got %d", windowBits) + } + + hostPoints := points + var hostPointsPtr unsafe.Pointer + if os.Getenv("GNARK_GPU_DISABLE_PINNED_MSM_POINTS") == "" { + nbytes := C.size_t(n) * C.size_t(unsafe.Sizeof(curve.G1Affine{})) + if err := toError(C.gnark_gpu_alloc_pinned(&hostPointsPtr, nbytes)); err == nil { + hostPoints = unsafe.Slice((*curve.G1Affine)(hostPointsPtr), n) + copy(hostPoints, points) + } else { + log.Printf("gpu: pinned MSM points unavailable (%v), using heap", err) + hostPointsPtr = nil + } + } + + var handle C.gnark_gpu_plonk2_msm_t + if err := toError(C.gnark_gpu_plonk2_msm_create( + devCtx(dev), + curveID(), + (*C.uint64_t)(unsafe.Pointer(&hostPoints[0])), + C.size_t(n), + C.int(windowBits), + &handle, + )); err != nil { + if hostPointsPtr != nil { + C.gnark_gpu_free_pinned(hostPointsPtr) + } + return nil, err + } + + m := &G1MSM{ + handle: handle, + dev: dev, + n: n, + windowBits: windowBits, + hostPoints: hostPoints, + hostPointsPtr: hostPointsPtr, + } + runtime.SetFinalizer(m, (*G1MSM).Close) + return m, nil +} + +// Close releases GPU resources. Safe to call multiple times. +func (m *G1MSM) Close() { + if m.handle != nil { + C.gnark_gpu_plonk2_msm_destroy(m.handle) + m.handle = nil + if m.hostPointsPtr != nil { + C.gnark_gpu_free_pinned(m.hostPointsPtr) + m.hostPointsPtr = nil + } + m.hostPoints = nil + runtime.SetFinalizer(m, nil) + } +} + +// Len returns the number of base points. +func (m *G1MSM) Len() int { return m.n } + +// PinWorkBuffers keeps MSM scratch buffers resident across MultiExp calls, +// amortizing cudaMalloc/Free overhead over a wave of MSMs. +func (m *G1MSM) PinWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_pin_work_buffers(m.handle)) +} + +// ReleaseWorkBuffers frees pinned scratch buffers. Subsequent MultiExp calls +// re-allocate lazily. +func (m *G1MSM) ReleaseWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_release_work_buffers(m.handle)) +} + +// OffloadPoints frees the GPU-resident base points. Call ReloadPoints before +// the next MultiExp. +func (m *G1MSM) OffloadPoints() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_offload_points(m.handle)) +} + +// ReloadPoints uploads the retained host base points after OffloadPoints. +func (m *G1MSM) ReloadPoints() error { + if len(m.hostPoints) < m.n { + return fmt.Errorf("gpu: MSM host points unavailable") + } + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_reload_points( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&m.hostPoints[0])), + C.size_t(m.n), + )) +} + +// MultiExp computes Q[i] = Σⱼ scalars[i][j] · P[j] for each scalar set. +// Each scalars[i] must have length ≤ m.Len(). +// Returns Jacobian results. +func (m *G1MSM) MultiExp(scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if err := m.dev.Bind(); err != nil { + return nil, err + } + k := len(scalars) + if k == 0 { + return nil, nil + } + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: MSM scalar set %d is empty", i) + } + if len(s) > m.n { + return nil, fmt.Errorf("gpu: MSM scalar set %d has %d elements, exceeds %d points", i, len(s), m.n) + } + } + + results := make([]curve.G1Jac, k) + m.lastBatchPhaseTimings = make([][9]float32, k) + for i, s := range scalars { + if err := toError(C.gnark_gpu_plonk2_msm_run( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&s[0])), + C.size_t(len(s)), + (*C.uint64_t)(unsafe.Pointer(&results[i])), + )); err != nil { + return nil, fmt.Errorf("gpu: MSM set %d failed: %w", i, err) + } + m.lastBatchPhaseTimings[i] = m.LastPhaseTimings() + // Montgomery correction: GPU skips fr_from_mont on scalars, so result = R * correct. + results[i].ScalarMultiplication(&results[i], &frRInv) + } + return results, nil +} + +// LastPhaseTimings returns per-phase timings (ms) from the most recent MultiExp call. +func (m *G1MSM) LastPhaseTimings() [9]float32 { + var out [9]C.float + C.gnark_gpu_plonk2_msm_get_phase_timings(m.handle, (*C.float)(unsafe.Pointer(&out[0]))) + var result [9]float32 + for i := range result { + result[i] = float32(out[i]) + } + return result +} + +// LastBatchPhaseTimings returns per-set MSM phase timings from the most recent +// MultiExp call. +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { + if len(m.lastBatchPhaseTimings) == 0 { + return nil + } + out := make([][9]float32, len(m.lastBatchPhaseTimings)) + copy(out, m.lastBatchPhaseTimings) + return out +} + +// MultiExpSplit runs the MSM split across 2 devices for ~2x speedup. +// msm0 must hold points[:n/2] and msm1 must hold points[n/2:]. +// This is an advanced API; use MultiExp for single-GPU operation. +func MultiExpSplit(msm0, msm1 *G1MSM, scalars []fr.Element) (curve.G1Jac, error) { + return MultiExpSplitAt(msm0, msm1, len(scalars)/2, scalars) +} + +// MultiExpSplitAt runs one MSM split across 2 devices at a fixed scalar index. +// msm0 must hold points[:split], and msm1 must hold points[split:]. +func MultiExpSplitAt(msm0, msm1 *G1MSM, split int, scalars []fr.Element) (curve.G1Jac, error) { + if msm0 == nil || msm1 == nil || len(scalars) == 0 { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: nil MSM or empty scalars") + } + n := len(scalars) + if split <= 0 || split >= n { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: invalid split %d for %d scalars", split, n) + } + if split > msm0.Len() || n-split > msm1.Len() { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: split exceeds MSM point capacity") + } + + type result struct { + jac curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(scalars[:split]) + if err != nil { + ch0 <- result{err: err} + return + } + ch0 <- result{jac: jacs[0]} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(scalars[split:]) + if err != nil { + ch1 <- result{err: err} + return + } + ch1 <- result{jac: jacs[0]} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return curve.G1Jac{}, r0.err + } + if r1.err != nil { + return curve.G1Jac{}, r1.err + } + r0.jac.AddAssign(&r1.jac) + return r0.jac, nil +} + +// MultiExpSplitBatchAt runs several MSMs split across 2 devices. Each device +// executes its half-batch sequentially on its own stream, and the host combines +// matching partials. +func MultiExpSplitBatchAt(msm0, msm1 *G1MSM, split int, scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if len(scalars) == 0 { + return nil, nil + } + first := make([][]fr.Element, len(scalars)) + second := make([][]fr.Element, len(scalars)) + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: split MSM scalar set %d is empty", i) + } + if split <= 0 || split >= len(s) { + return nil, fmt.Errorf("gpu: split MSM scalar set %d has invalid split %d for %d scalars", i, split, len(s)) + } + if split > msm0.Len() || len(s)-split > msm1.Len() { + return nil, fmt.Errorf("gpu: split MSM scalar set %d exceeds MSM point capacity", i) + } + first[i] = s[:split] + second[i] = s[split:] + } + + type result struct { + jacs []curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(first...) + ch0 <- result{jacs: jacs, err: err} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(second...) + ch1 <- result{jacs: jacs, err: err} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return nil, r0.err + } + if r1.err != nil { + return nil, r1.err + } + if len(r0.jacs) != len(scalars) || len(r1.jacs) != len(scalars) { + return nil, fmt.Errorf("gpu: split MSM result length mismatch") + } + for i := range r0.jacs { + r0.jacs[i].AddAssign(&r1.jacs[i]) + } + return r0.jacs, nil +} diff --git a/prover/gpu/internal/generator/plonk/template/msm_stub.go.tmpl b/prover/gpu/internal/generator/plonk/template/msm_stub.go.tmpl new file mode 100644 index 00000000000..fec5e3ffba3 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/msm_stub.go.tmpl @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package {{.Package}} + +import ( + "errors" + + curve "{{.GnarkCurve}}" + fr "{{.GnarkCryptoFr}}" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// G1MSM is a stub for non-CUDA builds. +type G1MSM struct{} + +func NewG1MSM(_ *gpu.Device, _ []curve.G1Affine, _ int) (*G1MSM, error) { + return nil, errors.New("gpu: cuda required") +} + +func (m *G1MSM) Close() {} +func (m *G1MSM) Len() int { return 0 } +func (m *G1MSM) PinWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) ReleaseWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) MultiExp(_ ...[]fr.Element) ([]curve.G1Jac, error) { + return nil, errors.New("gpu: cuda required") +} +func (m *G1MSM) LastPhaseTimings() [9]float32 { return [9]float32{} } +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { return nil } + +func MultiExpSplit(_, _ *G1MSM, _ []fr.Element) (curve.G1Jac, error) { + return curve.G1Jac{}, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/internal/generator/plonk/template/msm_test.go.tmpl b/prover/gpu/internal/generator/plonk/template/msm_test.go.tmpl new file mode 100644 index 00000000000..f5bc6a113e4 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/msm_test.go.tmpl @@ -0,0 +1,139 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}}_test + +import ( + "fmt" + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "{{.GnarkCurve}}" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/{{.Package}}" + "github.com/stretchr/testify/require" +) + +func makeTestPoints(n int) []curve.G1Affine { + _, _, g1, _ := curve.Generators() + pts := make([]curve.G1Affine, n) + pts[0] = g1 + for i := 1; i < n; i++ { + pts[i].Add(&pts[i-1], &g1) + } + return pts +} + +// TestMSMMatchesCPU verifies GPU MSM matches gnark-crypto CPU MultiExp. +func TestMSMMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + + for _, n := range []int{1, 16, 100, 1000} { + n := n + t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + + // CPU reference + var cpuResult curve.G1Affine + cpuResult.MultiExp(pts, scalars, ecc.MultiExpConfig{}) + + // GPU + msm, err := {{.Package}}.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars) + require.NoError(t, err) + require.Len(t, results, 1) + + var gpuAffine curve.G1Affine + gpuAffine.FromJacobian(&results[0]) + + require.True(t, cpuResult.Equal(&gpuAffine), + "MSM mismatch at n=%d", n) + }) + } +} + +// TestMSMBatchScalarSets tests MultiExp with multiple scalar sets. +func TestMSMBatchScalarSets(t *testing.T) { + dev := requireGPUDev(t) + const n = 100 + + pts := makeTestPoints(n) + scalars1 := randFrVec(n) + scalars2 := randFrVec(n) + + // CPU references + var cpu1, cpu2 curve.G1Affine + cpu1.MultiExp(pts, scalars1, ecc.MultiExpConfig{}) + cpu2.MultiExp(pts, scalars2, ecc.MultiExpConfig{}) + + // GPU batch + msm, err := {{.Package}}.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars1, scalars2) + require.NoError(t, err) + require.Len(t, results, 2) + + var gpu1, gpu2 curve.G1Affine + gpu1.FromJacobian(&results[0]) + gpu2.FromJacobian(&results[1]) + + require.True(t, cpu1.Equal(&gpu1), "MSM set 0 mismatch") + require.True(t, cpu2.Equal(&gpu2), "MSM set 1 mismatch") +} + +// TestMSMWorkBuffers verifies PinWorkBuffers/ReleaseWorkBuffers are idempotent. +func TestMSMWorkBuffers(t *testing.T) { + dev := requireGPUDev(t) + const n = 64 + + pts := makeTestPoints(n) + scalars := randFrVec(n) + + msm, err := {{.Package}}.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + require.NoError(t, msm.PinWorkBuffers()) + r1, err := msm.MultiExp(scalars) + require.NoError(t, err) + + require.NoError(t, msm.ReleaseWorkBuffers()) + r2, err := msm.MultiExp(scalars) + require.NoError(t, err) + + var a1, a2 curve.G1Affine + a1.FromJacobian(&r1[0]) + a2.FromJacobian(&r2[0]) + require.True(t, a1.Equal(&a2), "result changed after work buffer release") +} + +// BenchmarkMSM benchmarks GPU MSM at various sizes. +func BenchmarkMSM(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + msm, err := {{.Package}}.NewG1MSM(dev, pts, 0) + require.NoError(b, err) + defer msm.Close() + require.NoError(b, msm.PinWorkBuffers()) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := msm.MultiExp(scalars) + require.NoError(b, err) + } + }) + } +} diff --git a/prover/gpu/internal/generator/plonk/template/pinned_fr.go.tmpl b/prover/gpu/internal/generator/plonk/template/pinned_fr.go.tmpl new file mode 100644 index 00000000000..b818e0717ee --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/pinned_fr.go.tmpl @@ -0,0 +1,41 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "unsafe" + + fr "{{.GnarkCryptoFr}}" +) + +type pinnedFrBuffer struct { + ptr unsafe.Pointer + data []fr.Element +} + +func newPinnedFrBuffer(n int) (pinnedFrBuffer, error) { + var ptr unsafe.Pointer + nbytes := C.size_t(n) * C.size_t(fr.Bytes) + if err := toError(C.gnark_gpu_alloc_pinned(&ptr, nbytes)); err != nil { + return pinnedFrBuffer{}, err + } + return pinnedFrBuffer{ + ptr: ptr, + data: unsafe.Slice((*fr.Element)(ptr), n), + }, nil +} + +func (b *pinnedFrBuffer) free() { + if b.ptr != nil { + C.gnark_gpu_free_pinned(b.ptr) + b.ptr = nil + b.data = nil + } +} diff --git a/prover/gpu/internal/generator/plonk/template/plonk_test.go.tmpl b/prover/gpu/internal/generator/plonk/template/plonk_test.go.tmpl new file mode 100644 index 00000000000..34ea5e8c2b0 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/plonk_test.go.tmpl @@ -0,0 +1,210 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}}_test + +import ( + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "{{.GnarkCurve}}" + kzg "{{.GnarkCryptoKZG}}" + gnarkplonk "github.com/consensys/gnark/backend/plonk" + curplonk "{{.GnarkPlonk}}" + cs "{{.GnarkCS}}" + "github.com/consensys/gnark/frontend" + "github.com/consensys/gnark/frontend/cs/scs" +{{- if eq .Package "bls12377" }} + emPlonk "github.com/consensys/gnark/std/recursion/plonk" +{{- end }} + "github.com/consensys/gnark/test/unsafekzg" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/{{.Package}}" + "github.com/stretchr/testify/require" +) + +// addCircuit has enough constraints for sizeSystem >= 6 (avoiding gnark's 8-coset edge case for tiny circuits). +// Circuit: a*b + c*d + e*f = out (out is public) +type addCircuit struct { + A, B, C, D, F, G frontend.Variable + Out frontend.Variable `gnark:",public"` +} + +func (c *addCircuit) Define(api frontend.API) error { + ab := api.Mul(c.A, c.B) + cd := api.Mul(c.C, c.D) + fg := api.Mul(c.F, c.G) + sum := api.Add(ab, cd) + sum2 := api.Add(sum, fg) + api.AssertIsEqual(sum2, c.Out) + return nil +} + +type commitCircuit struct { + A, B, Out frontend.Variable +} + +func (c *commitCircuit) Define(api frontend.API) error { + commitment, err := api.(frontend.Committer).Commit(c.A, c.B) + if err != nil { + return err + } + product := api.Mul(c.A, c.B) + api.AssertIsDifferent(commitment, product) + api.AssertIsEqual(api.Add(c.A, c.B), c.Out) + return nil +} + +func setupAddCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &addCircuit{}) +} + +func setupCommitCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &commitCircuit{}) +} + +func setupCircuit(t testing.TB, circuit frontend.Circuit) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + ccs, err := frontend.Compile(ecc.{{.EccIDStr}}.ScalarField(), scs.NewBuilder, circuit) + require.NoError(t, err) + + srs, srsLag, err := unsafekzg.NewSRS(ccs) + require.NoError(t, err) + + _, vkIface, err := gnarkplonk.Setup(ccs, srs, srsLag) + require.NoError(t, err) + vk := vkIface.(*curplonk.VerifyingKey) + + // Extract canonical G1 SRS points from the concrete KZG SRS type. + concreteSRS := srs.(*kzg.SRS) + srsPoints := make([]curve.G1Affine, len(concreteSRS.Pk.G1)) + copy(srsPoints, concreteSRS.Pk.G1) + + return ccs.(*cs.SparseR1CS), vk, srsPoints +} + +// TestGPUProveVerify proves a small circuit with the GPU and verifies with gnark CPU. +func TestGPUProveVerify(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := {{.Package}}.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15+77+8} + fullW, err := frontend.NewWitness(assignment, ecc.{{.EccIDStr}}.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := {{.Package}}.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +// TestGPUProveMultipleProofs tests that multiple proofs can be generated from the same key. +func TestGPUProveMultipleProofs(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := {{.Package}}.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + for i := range 3 { + a := int64(i + 1) + _ = int64(i + 2) + assignment := &addCircuit{A: a, B: a + 1, C: a + 2, D: a + 3, F: a + 4, G: a + 5, Out: a*(a+1) + (a+2)*(a+3) + (a+4)*(a+5)} + fullW, err := frontend.NewWitness(assignment, ecc.{{.EccIDStr}}.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := {{.Package}}.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err, "proof %d failed", i) + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "proof %d verification failed", i) + } +} + +func TestGPUProveVerify_BSB22Commitment(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupCommitCircuit(t) + + gpk := {{.Package}}.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &commitCircuit{A: 3, B: 5, Out: 8} + fullW, err := frontend.NewWitness(assignment, ecc.{{.EccIDStr}}.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := {{.Package}}.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +{{- if eq .Package "bls12377" }} + +func TestGPUProveVerify_BSB22Commitment_NativeRecursionOptions(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupCommitCircuit(t) + + gpk := bls12377.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &commitCircuit{A: 3, B: 5, Out: 8} + fullW, err := frontend.NewWitness(assignment, ecc.BLS12_377.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bls12377.GPUProve( + dev, + gpk, + spr, + fullW, + emPlonk.GetNativeProverOptions(ecc.BW6_761.ScalarField(), ecc.BLS12_377.ScalarField()), + ) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError( + t, + gnarkplonk.Verify( + proof, + vk, + pubW, + emPlonk.GetNativeVerifierOptions(ecc.BW6_761.ScalarField(), ecc.BLS12_377.ScalarField()), + ), + "GPU proof failed verification with native recursion options", + ) +} +{{- end }} + +// BenchmarkGPUProve benchmarks GPU proof generation. +func BenchmarkGPUProve(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + spr, vk, srsPoints := setupAddCircuit(b) + gpk := {{.Package}}.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15+77+8} + fullW, err := frontend.NewWitness(assignment, ecc.{{.EccIDStr}}.ScalarField()) + require.NoError(b, err) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := {{.Package}}.GPUProve(dev, gpk, spr, fullW) + require.NoError(b, err) + } +} diff --git a/prover/gpu/internal/generator/plonk/template/prove.go.tmpl b/prover/gpu/internal/generator/plonk/template/prove.go.tmpl new file mode 100644 index 00000000000..64b34d8075c --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/prove.go.tmpl @@ -0,0 +1,2618 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "context" + "errors" + "fmt" + "hash" + "log" + "math/big" + "math/bits" + "os" + "runtime" + "strconv" + "sync" + "time" + "unsafe" + + curve "{{.GnarkCurve}}" + fr "{{.GnarkCryptoFr}}" + "{{.GnarkCryptoFFT}}" + htf "{{.GnarkCryptoHTF}}" + iop "{{.GnarkCryptoIOP}}" + kzg "{{.GnarkCryptoKZG}}" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + + "github.com/consensys/gnark/backend" + curplonk "{{.GnarkPlonk}}" + "github.com/consensys/gnark/backend/witness" + "github.com/consensys/gnark/constraint" + cs "{{.GnarkCS}}" + "github.com/consensys/gnark/constraint/solver" + fcs "github.com/consensys/gnark/frontend/cs" + + "github.com/consensys/linea-monorepo/prover/gpu" + "golang.org/x/sync/errgroup" +) + +const ( + id_L int = iota + id_R + id_O + id_Z + + orderBlindingL = 1 + orderBlindingR = 1 + orderBlindingO = 1 + orderBlindingZ = 2 + msmExtraPoints = 6 +) + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProvingKey — slim wrapper: VerifyingKey + lazy gpuInstance +// ───────────────────────────────────────────────────────────────────────────── + +type GPUProvingKey struct { + mu sync.Mutex + Vk *curplonk.VerifyingKey + n int + + // SRS data (consumed during instance init) + srsPoints []curve.G1Affine + pinnedN int + + inst *gpuInstance +} + +// NewGPUProvingKey creates a GPUProvingKey from affine SRS points. +func NewGPUProvingKey(srsPoints []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + n := 0 + if vk != nil { + n = int(vk.Size) + } + return &GPUProvingKey{Vk: vk, n: n, srsPoints: srsPoints} +} + +// Size returns the domain size n. +func (gpk *GPUProvingKey) Size() int { return gpk.n } + +// Prepare performs one-time GPU setup. +func (gpk *GPUProvingKey) Prepare(dev *gpu.Device, spr *cs.SparseR1CS) error { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil && gpk.inst.dev == dev { + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + inst, err := newGPUInstance(dev, gpk, spr) + if err != nil { + return err + } + gpk.inst = inst + return nil +} + +// Close releases all GPU resources. +func (gpk *GPUProvingKey) Close() { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuInstance — persistent GPU resources + circuit data +// ───────────────────────────────────────────────────────────────────────────── + +// quotientWorkBufs holds pre-allocated GPU buffers for computeNumeratorGPU and +// computeLinearizedPoly, avoiding per-proof cudaMalloc/Free overhead. +type quotientWorkBufs struct { + L, R, O, Z *FrVector // wire poly working buffers (reused per coset) + S1, S2, S3 *FrVector // perm selector buffers + Result *FrVector // coset numerator accumulator + LCan, RCan, OCan, ZCan *FrVector // canonical wire polys (uploaded once per proof) + QkSrc *FrVector // Qk canonical source (D2D per coset, avoids H2D) + Pi2Src []*FrVector // per-proof BSB22 pi2 sources (D2D per coset) + CosetBlock [3]*FrVector // GPU-resident coset results; Result keeps block 4 + LinResult, LinW *FrVector // linearized poly GPU scratch +} + +type lowMemorySelectorCache struct { + ql, qr, qm, qo *FrVector + s1, s2, s3 *FrVector + qcp []*FrVector +} + +type splitMSMBackend struct { + secondary *gpu.Device + msm0 *G1MSM + msm1 *G1MSM + split int +} + +type gpuInstance struct { + dev *gpu.Device + vk *curplonk.VerifyingKey + n int + log2n uint + lowMemory bool + canonicalReady chan struct{} + canonicalErr error + canonicalOnce sync.Once + + domain0 *fft.Domain + + msm *G1MSM + splitMSM *splitMSMBackend + fftDom *GPUFFTDomain + dPerm unsafe.Pointer + + dQl, dQr, dQm, dQo *FrVector + dS1, dS2, dS3 *FrVector + dQkFixed *FrVector + dQcp []*FrVector + + qlCanonical, qrCanonical, qmCanonical, qoCanonical fr.Vector + qkFixedCanonical fr.Vector + s1Canonical, s2Canonical, s3Canonical fr.Vector + qcpCanonical []fr.Vector + qkLagrange fr.Vector + permutation []int64 + nbPublicVariables int + commitmentInfo []uint64 + + gpuWork *FrVector // shared scratch buffer (persists for prover lifetime) + qWb quotientWorkBufs + + hBufs hostBufs +} + +type gpuInstanceReadyHooks struct { + msm func(*gpuInstance) + commit func(*gpuInstance) + trace func(*gpuInstance) +} + +type hostBufs struct { + lCanonical, rCanonical, oCanonical fr.Vector + zLagrange fr.Vector + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + hFull []fr.Element + openZBuf []fr.Element + pinned []pinnedFrBuffer +} + +func (inst *gpuInstance) initHostBufs() { + n := inst.n + var hb hostBufs + + allocPinnedHotBuffer := func(name string, n int) []fr.Element { + if os.Getenv("GNARK_GPU_DISABLE_PINNED_HOST_BUFS") == "" { + buf, err := newPinnedFrBuffer(n) + if err == nil { + hb.pinned = append(hb.pinned, buf) + return buf.data + } + log.Printf("gpu: pinned host buffer %s unavailable (%v), using heap", name, err) + } + return make([]fr.Element, n) + } + + hb = hostBufs{ + lCanonical: make(fr.Vector, n), + rCanonical: make(fr.Vector, n), + oCanonical: make(fr.Vector, n), + zLagrange: make(fr.Vector, n), + qkCoeffs: make(fr.Vector, n), + openZBuf: make([]fr.Element, n+1+orderBlindingZ), + } + hb.lBlinded = allocPinnedHotBuffer("lBlinded", n+1+orderBlindingL) + hb.rBlinded = allocPinnedHotBuffer("rBlinded", n+1+orderBlindingR) + hb.oBlinded = allocPinnedHotBuffer("oBlinded", n+1+orderBlindingO) + hb.zBlinded = allocPinnedHotBuffer("zBlinded", n+1+orderBlindingZ) + hSize := 4 * n + if needed := 3 * (n + 2); needed > hSize { + hSize = needed + } + hb.hFull = allocPinnedHotBuffer("hFull", hSize) + inst.hBufs = hb +} + +func (hb *hostBufs) free() { + for i := range hb.pinned { + hb.pinned[i].free() + } + *hb = hostBufs{} +} + +func newGPUInstance(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, hooks ...gpuInstanceReadyHooks) (*gpuInstance, error) { + inst := &gpuInstance{dev: dev, vk: gpk.Vk, n: gpk.n, canonicalReady: make(chan struct{})} + var hook gpuInstanceReadyHooks + if len(hooks) > 0 { + hook = hooks[0] + } + commitPublished := false + msmPublished := false + tracePublished := false + publishMSMReady := func() { + if hook.msm != nil && !msmPublished { + msmPublished = true + hook.msm(inst) + } + } + publishCommitReady := func() { + if hook.commit != nil && !commitPublished { + commitPublished = true + hook.commit(inst) + } + } + publishTraceReady := func() { + if hook.trace != nil && !tracePublished { + tracePublished = true + hook.trace(inst) + } + } + var traceErrCh chan error + + fail := func(msg string, err error) (*gpuInstance, error) { + wrapped := fmt.Errorf("%s: %w", msg, err) + if traceErrCh != nil { + <-traceErrCh + traceErrCh = nil + } + inst.publishCanonicalReady(wrapped) + if !msmPublished && !commitPublished && !tracePublished { + inst.close() + } + return nil, wrapped + } + + if err := inst.initCircuitShape(spr); err != nil { + return fail("init circuit shape", err) + } + inst.lowMemory = selectLowMemoryMode(dev, inst.n) + traceErrCh = make(chan error, 1) + go func() { + traceErrCh <- inst.initTraceData(spr) + }() + waitTrace := func() error { + if traceErrCh == nil { + return nil + } + err := <-traceErrCh + traceErrCh = nil + return err + } + + var err error + msmSize := inst.n + msmExtraPoints + pts := gpk.srsPoints + if msmSize > len(pts) { + msmSize = len(pts) + } + if secondaryID, ok, cfgErr := secondaryMSMDeviceID(dev.DeviceID()); cfgErr != nil { + return fail("configure secondary MSM GPU", cfgErr) + } else if ok { + split := inst.n / 2 + if split <= 0 || split >= msmSize { + return fail("configure secondary MSM GPU", fmt.Errorf("invalid split %d for MSM size %d", split, msmSize)) + } + secondary, err := gpu.New(gpu.WithDeviceID(secondaryID)) + if err != nil { + return fail("create secondary GPU device", err) + } + inst.splitMSM = &splitMSMBackend{secondary: secondary, split: split} + inst.splitMSM.msm0, err = NewG1MSM(dev, pts[:split], 0) + if err != nil { + return fail("create primary split MSM", err) + } + inst.splitMSM.msm1, err = NewG1MSM(secondary, pts[split:msmSize], 0) + if err != nil { + return fail("create secondary split MSM", err) + } + } else { + inst.msm, err = NewG1MSM(dev, pts[:msmSize], 0) + if err != nil { + return fail("create MSM", err) + } + } + gpk.srsPoints = nil // ownership transferred; free heap copy + + if !inst.lowMemory { + if perr := inst.pinMSMWorkBuffers(); perr != nil { + return fail("pin MSM work buffers", perr) + } + } + + if inst.lowMemory { + if err := inst.offloadMSMPoints(); err != nil { + return fail("offload MSM points", err) + } + } + + inst.fftDom, err = NewFFTDomain(dev, inst.n) + if err != nil { + return fail("create FFT domain", err) + } + + if inst.lowMemory { + inst.gpuWork, err = NewFrVector(dev, inst.n) + if err != nil { + return fail("alloc low-memory GPU work buffer", err) + } + if err := dev.InitMultiStream(); err != nil { + return fail("init multi-stream", err) + } + publishMSMReady() + inst.initHostBufs() + publishCommitReady() + } + + if err := waitTrace(); err != nil { + return fail("init circuit data", err) + } + + inst.dPerm, err = DeviceAllocCopyInt64(dev, inst.permutation) + if err != nil { + return fail("upload permutation", err) + } + + if inst.lowMemory { + publishTraceReady() + } + + if err := inst.initCanonicalGPU(); err != nil { + return fail("init canonical", err) + } + + if inst.lowMemory { + inst.publishCanonicalReady(nil) + return inst, nil + } + + if err := inst.uploadPolynomials(); err != nil { + return fail("upload polynomials", err) + } + + if err := inst.allocPersistentBufs(); err != nil { + return fail("alloc persistent GPU buffers", err) + } + + inst.initHostBufs() + publishMSMReady() + publishCommitReady() + publishTraceReady() + inst.publishCanonicalReady(nil) + return inst, nil +} + +func (inst *gpuInstance) publishCanonicalReady(err error) { + inst.canonicalOnce.Do(func() { + inst.canonicalErr = err + close(inst.canonicalReady) + }) +} + +func (inst *gpuInstance) waitCanonicalReady() error { + if inst.canonicalReady == nil { + return nil + } + <-inst.canonicalReady + return inst.canonicalErr +} + +func selectLowMemoryMode(dev *gpu.Device, n int) bool { + if os.Getenv("GNARK_GPU_PLONK2_FORCE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode forced for n=%d", n) + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode disabled for n=%d", n) + return false + } + free, total, err := dev.MemGetInfo() + if err != nil { + low := n >= 1<<25 + log.Printf("plonk2: low-memory GPU mode=%t for n=%d; mem query failed: %v", low, n, err) + return low + } + vecBytes := uint64(n) * uint64(fr.Bytes) + estimatedResident := vecBytes * 24 + low := estimatedResident > total/2 + log.Printf( + "plonk2: low-memory GPU mode=%t n=%d vecBytes=%d estimatedResident=%d freeVRAM=%d totalVRAM=%d", + low, n, vecBytes, estimatedResident, free, total, + ) + return low +} + +func secondaryMSMDeviceID(primaryID int) (int, bool, error) { + raw := os.Getenv("GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID") + if raw == "" { + return 0, false, nil + } + id, err := strconv.Atoi(raw) + if err != nil { + return 0, false, fmt.Errorf("invalid GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID %q: %w", raw, err) + } + if id == primaryID { + return 0, false, fmt.Errorf("secondary device matches primary device %d", primaryID) + } + if id < 0 { + return 0, false, fmt.Errorf("secondary device id must be non-negative, got %d", id) + } + return id, true, nil +} + +func (inst *gpuInstance) pinMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.PinWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.PinWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.PinWorkBuffers() +} + +func (inst *gpuInstance) releaseMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReleaseWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReleaseWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReleaseWorkBuffers() +} + +func (inst *gpuInstance) offloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.OffloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.OffloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.OffloadPoints() +} + +func (inst *gpuInstance) reloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReloadPoints() +} + +// allocPersistentBufs allocates GPU work buffers that persist across proofs. +// Avoids per-proof cudaMalloc/Free overhead (~3 ms per 64 MB alloc × 20 bufs). +func (inst *gpuInstance) allocPersistentBufs() error { + n := inst.n + alloc := func() (*FrVector, error) { + return NewFrVector(inst.dev, n) + } + wb := &inst.qWb + // Flat list mirrors the free loop in close() — keep in sync. + named := []*(*FrVector){ + &inst.gpuWork, + &wb.L, &wb.R, &wb.O, &wb.Z, + &wb.S1, &wb.S2, &wb.S3, &wb.Result, + &wb.LCan, &wb.RCan, &wb.OCan, &wb.ZCan, + &wb.QkSrc, &wb.LinResult, &wb.LinW, + } + for _, p := range named { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + *p = v + } + for k := range wb.CosetBlock { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.CosetBlock[k] = v + } + if len(inst.commitmentInfo) > 0 { + wb.Pi2Src = make([]*FrVector, len(inst.commitmentInfo)) + for i := range wb.Pi2Src { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.Pi2Src[i] = v + } + } + // Create multi-stream upfront so the quotient pipeline can use it immediately. + return inst.dev.InitMultiStream() +} + +func (inst *gpuInstance) initCircuitShape(spr *cs.SparseR1CS) error { + nbConstraints := spr.GetNbConstraints() + sizeSystem := uint64(nbConstraints + len(spr.Public)) + inst.domain0 = fft.NewDomain(sizeSystem, fft.WithoutPrecompute()) + n := int(inst.domain0.Cardinality) + if n != inst.n { + return fmt.Errorf("domain size mismatch: spr=%d SRS=%d", n, inst.n) + } + inst.log2n = uint(bits.TrailingZeros(uint(n))) + inst.nbPublicVariables = len(spr.Public) + inst.commitmentInfo = inst.vk.CommitmentConstraintIndexes + return nil +} + +func (inst *gpuInstance) initTraceData(spr *cs.SparseR1CS) error { + trace := curplonk.NewTrace(spr, inst.domain0) + inst.qlCanonical = fr.Vector(trace.Ql.Coefficients()) + inst.qrCanonical = fr.Vector(trace.Qr.Coefficients()) + inst.qmCanonical = fr.Vector(trace.Qm.Coefficients()) + inst.qoCanonical = fr.Vector(trace.Qo.Coefficients()) + inst.s1Canonical = fr.Vector(trace.S1.Coefficients()) + inst.s2Canonical = fr.Vector(trace.S2.Coefficients()) + inst.s3Canonical = fr.Vector(trace.S3.Coefficients()) + + inst.qkLagrange = make(fr.Vector, inst.n) + copy(inst.qkLagrange, trace.Qk.Coefficients()) + inst.qkFixedCanonical = fr.Vector(trace.Qk.Coefficients()) + + inst.qcpCanonical = make([]fr.Vector, len(trace.Qcp)) + for i, p := range trace.Qcp { + inst.qcpCanonical[i] = fr.Vector(p.Coefficients()) + } + inst.permutation = trace.S + return nil +} + +func (inst *gpuInstance) initCanonicalGPU() error { + n := inst.n + gpuWork, err := NewFrVector(inst.dev, n) + if err != nil { + return fmt.Errorf("alloc work vector: %w", err) + } + defer gpuWork.Free() + + iFFTSelector := func(v fr.Vector) { + gpuWork.CopyFromHost(v) + inst.fftDom.BitReverse(gpuWork) + inst.fftDom.FFTInverse(gpuWork) + gpuWork.CopyToHost(v) + } + + for _, v := range []fr.Vector{ + inst.qlCanonical, inst.qrCanonical, inst.qmCanonical, inst.qoCanonical, + inst.qkFixedCanonical, inst.s1Canonical, inst.s2Canonical, inst.s3Canonical, + } { + iFFTSelector(v) + } + for _, v := range inst.qcpCanonical { + iFFTSelector(v) + } + + return inst.dev.Sync() +} + +func (inst *gpuInstance) uploadPolynomials() error { + upload := func(data fr.Vector) (*FrVector, error) { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, err + } + v.CopyFromHost(data) + return v, nil + } + var err error + if inst.dQl, err = upload(inst.qlCanonical); err != nil { + return fmt.Errorf("upload ql: %w", err) + } + if inst.dQr, err = upload(inst.qrCanonical); err != nil { + return fmt.Errorf("upload qr: %w", err) + } + if inst.dQm, err = upload(inst.qmCanonical); err != nil { + return fmt.Errorf("upload qm: %w", err) + } + if inst.dQo, err = upload(inst.qoCanonical); err != nil { + return fmt.Errorf("upload qo: %w", err) + } + if inst.dS1, err = upload(inst.s1Canonical); err != nil { + return fmt.Errorf("upload s1: %w", err) + } + if inst.dS2, err = upload(inst.s2Canonical); err != nil { + return fmt.Errorf("upload s2: %w", err) + } + if inst.dS3, err = upload(inst.s3Canonical); err != nil { + return fmt.Errorf("upload s3: %w", err) + } + if inst.dQkFixed, err = upload(inst.qkFixedCanonical); err != nil { + return fmt.Errorf("upload qkFixed: %w", err) + } + inst.dQcp = make([]*FrVector, len(inst.qcpCanonical)) + for i, v := range inst.qcpCanonical { + if inst.dQcp[i], err = upload(v); err != nil { + return fmt.Errorf("upload qcp[%d]: %w", i, err) + } + } + return nil +} + +func (inst *gpuInstance) close() { + if inst.msm != nil { + inst.msm.Close() + inst.msm = nil + } + if inst.splitMSM != nil { + if inst.splitMSM.msm0 != nil { + inst.splitMSM.msm0.Close() + } + if inst.splitMSM.msm1 != nil { + inst.splitMSM.msm1.Close() + } + if inst.splitMSM.secondary != nil { + _ = inst.splitMSM.secondary.Close() + } + inst.splitMSM = nil + } + if inst.fftDom != nil { + inst.fftDom.Close() + inst.fftDom = nil + } + if inst.dPerm != nil { + DeviceFreePtr(inst.dPerm) + inst.dPerm = nil + } + for _, v := range []*FrVector{inst.dQl, inst.dQr, inst.dQm, inst.dQo, + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed} { + if v != nil { + v.Free() + } + } + inst.dQl, inst.dQr, inst.dQm, inst.dQo = nil, nil, nil, nil + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed = nil, nil, nil, nil + for _, v := range inst.dQcp { + if v != nil { + v.Free() + } + } + inst.dQcp = nil + // Free persistent work buffers (mirrors the alloc list in allocPersistentBufs). + wb := &inst.qWb + for _, v := range []*FrVector{ + inst.gpuWork, + wb.L, wb.R, wb.O, wb.Z, wb.S1, wb.S2, wb.S3, wb.Result, + wb.LCan, wb.RCan, wb.OCan, wb.ZCan, wb.QkSrc, wb.LinResult, wb.LinW, + } { + if v != nil { + v.Free() + } + } + for k := range wb.CosetBlock { + if wb.CosetBlock[k] != nil { + wb.CosetBlock[k].Free() + } + } + for _, v := range wb.Pi2Src { + if v != nil { + v.Free() + } + } + inst.gpuWork = nil + inst.qWb = quotientWorkBufs{} + inst.hBufs.free() +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuProver — per-proof mutable state +// ───────────────────────────────────────────────────────────────────────────── + +type gpuProver struct { + inst *gpuInstance + instMu sync.Mutex + waitInst func() (*gpuInstance, error) + waitMSMInst func() (*gpuInstance, error) + waitCommitInst func() (*gpuInstance, error) + + proof curplonk.Proof + fs *fiatshamir.Transcript + + commitmentInfo constraint.PlonkCommitments + commitmentVal []fr.Element + pi2Canonical [][]fr.Element + pi2DeviceReady []bool + solverOpts []solver.Option + kzgFoldingHash hash.Hash + htfFunc hash.Hash + + evalL, evalR, evalO fr.Vector + wWitness fr.Vector + bpL, bpR, bpO, bpZ *iop.Polynomial + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + h1, h2, h3 []fr.Element + gamma, beta, alpha, zeta fr.Element + + logTime func(string) +} + +// ─── Prove phases ───────────────────────────────────────────────────────────── + +func (p *gpuProver) ensureInst() (*gpuInstance, error) { + p.instMu.Lock() + if p.inst != nil { + inst := p.inst + p.instMu.Unlock() + return inst, nil + } + waitInst := p.waitInst + p.instMu.Unlock() + if waitInst == nil { + return nil, errors.New("gpu instance is not initialized") + } + inst, err := waitInst() + if err != nil { + return nil, err + } + p.instMu.Lock() + if p.inst == nil { + p.inst = inst + } + inst = p.inst + p.instMu.Unlock() + return inst, nil +} + +func (p *gpuProver) initBlindingPolynomials() { + p.bpL = getRandomPolynomial(orderBlindingL) + p.bpR = getRandomPolynomial(orderBlindingR) + p.bpO = getRandomPolynomial(orderBlindingO) + p.bpZ = getRandomPolynomial(orderBlindingZ) +} + +func (p *gpuProver) solve(spr *cs.SparseR1CS, fullWitness witness.Witness) error { + solverOpts := append([]solver.Option(nil), p.solverOpts...) + if len(p.commitmentInfo) > 0 { + bsb22ID := solver.GetHintID(fcs.Bsb22CommitmentComputePlaceholder) + solverOpts = append(solverOpts, solver.OverrideHint(bsb22ID, func(_ *big.Int, ins, outs []*big.Int) error { + waitMSMInst := p.waitMSMInst + if waitMSMInst == nil { + waitMSMInst = p.waitCommitInst + } + if waitMSMInst == nil { + waitMSMInst = p.ensureInst + } + inst, err := waitMSMInst() + if err != nil { + return err + } + n := inst.n + commDepth := int(ins[0].Int64()) + ins = ins[1:] + ci := p.commitmentInfo[commDepth] + committedValues := make([]fr.Element, inst.domain0.Cardinality) + offset := inst.nbPublicVariables + for i := range ins { + committedValues[offset+ci.Committed[i]].SetBigInt(ins[i]) + } + committedValues[offset+ci.CommitmentIndex].SetRandom() + committedValues[offset+spr.GetNbConstraints()-1].SetRandom() + + inst.gpuWork.CopyFromHost(fr.Vector(committedValues[:n])) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if commDepth < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[commDepth] != nil { + inst.qWb.Pi2Src[commDepth].CopyFromDevice(inst.gpuWork) + p.pi2DeviceReady[commDepth] = true + } + canonicalBuf := make(fr.Vector, n) + inst.gpuWork.CopyToHost(canonicalBuf) + p.pi2Canonical[commDepth] = canonicalBuf + + commitment, err := inst.commit(canonicalBuf) + if err != nil { + return err + } + p.proof.Bsb22Commitments[commDepth] = commitment + + p.htfFunc.Write(p.proof.Bsb22Commitments[commDepth].Marshal()) + hashBts := p.htfFunc.Sum(nil) + p.htfFunc.Reset() + nbBuf := fr.Bytes + if p.htfFunc.Size() < fr.Bytes { + nbBuf = p.htfFunc.Size() + } + p.commitmentVal[commDepth].SetBytes(hashBts[:nbBuf]) + p.commitmentVal[commDepth].BigInt(outs[0]) + return nil + })) + } + + solution_, err := spr.Solve(fullWitness, solverOpts...) + if err != nil { + return fmt.Errorf("solve: %w", err) + } + solution := solution_.(*cs.SparseR1CSSolution) + p.evalL = fr.Vector(solution.L) + p.evalR = fr.Vector(solution.R) + p.evalO = fr.Vector(solution.O) + + var ok bool + p.wWitness, ok = fullWitness.Vector().(fr.Vector) + if !ok { + return errors.New("invalid witness type") + } + return nil +} + +func (p *gpuProver) completeQk() { + inst, err := p.ensureInst() + if err != nil { + panic(err) + } + p.qkCoeffs = inst.hBufs.qkCoeffs + copy(p.qkCoeffs, inst.qkLagrange) + copy(p.qkCoeffs, p.wWitness[:inst.nbPublicVariables]) + for i := range p.commitmentInfo { + p.qkCoeffs[inst.nbPublicVariables+p.commitmentInfo[i].CommitmentIndex] = p.commitmentVal[i] + } +} + +// commitToLRO overlaps the iFFT of L,R,O with Qk patching (via waitQk) and +// blinding-polynomial generation (via waitBlinding), both of which complete +// concurrently in sibling goroutines. +func (p *gpuProver) commitToLRO(inst *gpuInstance, waitQk, waitBlinding func() error) error { + hb := &inst.hBufs + + gpuToCanonical := func(lagrange, dst fr.Vector, dstDevice *FrVector) { + inst.gpuWork.CopyFromHost(lagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if dstDevice != nil { + dstDevice.CopyFromDevice(inst.gpuWork) + } + inst.gpuWork.CopyToHost(dst) + } + + if inst.lowMemory { + gpuToCanonical(p.evalL, hb.lCanonical, nil) + gpuToCanonical(p.evalR, hb.rCanonical, nil) + gpuToCanonical(p.evalO, hb.oCanonical, nil) + } else { + gpuToCanonical(p.evalL, hb.lCanonical, inst.qWb.LCan) + gpuToCanonical(p.evalR, hb.rCanonical, inst.qWb.RCan) + gpuToCanonical(p.evalO, hb.oCanonical, inst.qWb.OCan) + } + + if err := waitQk(); err != nil { + return err + } + inst.gpuWork.CopyFromHost(p.qkCoeffs) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if inst.lowMemory { + inst.gpuWork.CopyToHost(p.qkCoeffs) + } else { + inst.qWb.QkSrc.CopyFromDevice(inst.gpuWork) + p.qkCoeffs = nil + } + + if err := waitBlinding(); err != nil { + return err + } + + var blindWG sync.WaitGroup + blindWG.Add(3) + go func() { defer blindWG.Done(); p.lBlinded = blindInto(hb.lBlinded, hb.lCanonical, p.bpL) }() + go func() { defer blindWG.Done(); p.rBlinded = blindInto(hb.rBlinded, hb.rCanonical, p.bpR) }() + go func() { defer blindWG.Done(); p.oBlinded = blindInto(hb.oBlinded, hb.oCanonical, p.bpO) }() + blindWG.Wait() + if !inst.lowMemory { + SubtractBlindingHead(inst.qWb.LCan, p.bpL.Coefficients()) + SubtractBlindingHead(inst.qWb.RCan, p.bpR.Coefficients()) + SubtractBlindingHead(inst.qWb.OCan, p.bpO.Coefficients()) + } + + p.logTime("iFFT L,R,O,Qk + blind") + + lroCommits, err := inst.commitN(p.lBlinded, p.rBlinded, p.oBlinded) + if err != nil { + return err + } + p.proof.LRO[0] = lroCommits[0] + p.proof.LRO[1] = lroCommits[1] + p.proof.LRO[2] = lroCommits[2] + + p.logTime("MSM commit L,R,O") + return nil +} + +func (p *gpuProver) deriveGammaBeta() error { + inst := p.inst + if err := bindPublicData(p.fs, "gamma", inst.vk, p.wWitness[:inst.nbPublicVariables]); err != nil { + return err + } + var err error + p.gamma, err = deriveRandomness(p.fs, "gamma", &p.proof.LRO[0], &p.proof.LRO[1], &p.proof.LRO[2]) + if err != nil { + return err + } + p.beta, err = deriveRandomness(p.fs, "beta") + if err != nil { + return err + } + p.wWitness = nil + p.logTime("derive gamma,beta") + return nil +} + +func (p *gpuProver) buildZAndCommit() error { + inst := p.inst + + zLagrange, err := buildZGPU(inst, inst.gpuWork, p.evalL, p.evalR, p.evalO, p.beta, p.gamma) + if err != nil { + return fmt.Errorf("build Z: %w", err) + } + p.evalL, p.evalR, p.evalO = nil, nil, nil + p.logTime("build Z") + + hb := &inst.hBufs + inst.gpuWork.CopyFromHost(zLagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + inst.gpuWork.CopyToHost(hb.zLagrange) + p.zBlinded = blindInto(hb.zBlinded, hb.zLagrange, p.bpZ) + if !inst.lowMemory { + inst.qWb.ZCan.CopyFromDevice(inst.gpuWork) + SubtractBlindingHead(inst.qWb.ZCan, p.bpZ.Coefficients()) + } + + zCommit, err := inst.commit(p.zBlinded) + if err != nil { + return err + } + p.proof.Z = zCommit + p.logTime("iFFT+commit Z") + + alphaDeps := make([]*curve.G1Affine, len(p.proof.Bsb22Commitments)+1) + for i := range p.proof.Bsb22Commitments { + alphaDeps[i] = &p.proof.Bsb22Commitments[i] + } + alphaDeps[len(alphaDeps)-1] = &p.proof.Z + var aerr error + p.alpha, aerr = deriveRandomness(p.fs, "alpha", alphaDeps...) + if aerr != nil { + return aerr + } + p.logTime("derive alpha") + return nil +} + +func (p *gpuProver) computeQuotientAndCommit() error { + inst := p.inst + if err := inst.waitCanonicalReady(); err != nil { + return fmt.Errorf("initialize canonical selector data: %w", err) + } + + pointsOffloaded := false + if inst.shouldOffloadMSMForQuotient() { + if err := inst.offloadMSMPoints(); err != nil { + return fmt.Errorf("offload MSM points: %w", err) + } + pointsOffloaded = true + if err := inst.releaseMSMWorkBuffers(); err != nil { + return fmt.Errorf("release MSM work buffers: %w", err) + } + } + defer func() { + if pointsOffloaded { + _ = inst.reloadMSMPoints() + if !inst.lowMemory { + _ = inst.pinMSMWorkBuffers() + } + } + }() + + var qErr error + p.h1, p.h2, p.h3, qErr = computeNumeratorGPU( + inst, inst.gpuWork, + p.lBlinded, p.rBlinded, p.oBlinded, p.zBlinded, + p.qkCoeffs, p.pi2Canonical, p.pi2DeviceReady, + p.alpha, p.beta, p.gamma, + ) + if qErr != nil { + return fmt.Errorf("compute quotient: %w", qErr) + } + + p.logTime("quotient GPU") + + if pointsOffloaded { + if err := inst.reloadMSMPoints(); err != nil { + return fmt.Errorf("reload MSM points: %w", err) + } + if !inst.lowMemory { + if err := inst.pinMSMWorkBuffers(); err != nil { + return fmt.Errorf("re-pin MSM work buffers: %w", err) + } + } + pointsOffloaded = false + } + hCommits, err := inst.commitN(p.h1, p.h2, p.h3) + if err != nil { + return err + } + p.proof.H[0] = hCommits[0] + p.proof.H[1] = hCommits[1] + p.proof.H[2] = hCommits[2] + p.logTime("MSM commit h1,h2,h3") + + var zetaErr error + p.zeta, zetaErr = deriveRandomness(p.fs, "zeta", &p.proof.H[0], &p.proof.H[1], &p.proof.H[2]) + if zetaErr != nil { + return zetaErr + } + return nil +} + +func (inst *gpuInstance) shouldOffloadMSMForQuotient() bool { + if inst.lowMemory { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_FORCE_MSM_OFFLOAD") != "" { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_MSM_OFFLOAD") != "" { + return false + } + free, _, err := inst.dev.MemGetInfo() + if err != nil { + return true + } + reserve := uint64(inst.n) * uint64(fr.Bytes) * 8 + const minReserve = 2 << 30 + if reserve < minReserve { + reserve = minReserve + } + return free < reserve +} + +func (p *gpuProver) openAndFinalize() error { + inst := p.inst + + var zetaShifted fr.Element + zetaShifted.Mul(&p.zeta, &inst.domain0.Generator) + + openZPoly := inst.hBufs.openZBuf[:len(p.zBlinded)] + copy(openZPoly, p.zBlinded) + bzuzetaCh := make(chan fr.Element, 1) + go func() { + parallelHornerQuotient(openZPoly, zetaShifted) + bzuzetaCh <- openZPoly[0] + }() + + // Evaluate host-only blinded polys on CPU while GPU-resident selector polys + // are evaluated on device. + var blzeta, brzeta, bozeta, s1Zeta, s2Zeta fr.Element + var evalWG sync.WaitGroup + evalWG.Add(3) + go func() { defer evalWG.Done(); blzeta = polyEvalParallel(p.lBlinded, p.zeta) }() + go func() { defer evalWG.Done(); brzeta = polyEvalParallel(p.rBlinded, p.zeta) }() + go func() { defer evalWG.Done(); bozeta = polyEvalParallel(p.oBlinded, p.zeta) }() + + if inst.lowMemory { + s1Zeta = polyEvalParallel(inst.s1Canonical, p.zeta) + s2Zeta = polyEvalParallel(inst.s2Canonical, p.zeta) + } else { + s1Zeta = PolyEvalGPU(inst.dev, inst.dS1, p.zeta) + s2Zeta = PolyEvalGPU(inst.dev, inst.dS2, p.zeta) + } + + qcpzeta := make([]fr.Element, len(p.commitmentInfo)) + for i := range p.commitmentInfo { + if inst.lowMemory { + qcpzeta[i] = polyEvalParallel(inst.qcpCanonical[i], p.zeta) + } else { + qcpzeta[i] = PolyEvalGPU(inst.dev, inst.dQcp[i], p.zeta) + } + } + evalWG.Wait() + + bzuzeta := <-bzuzetaCh + p.proof.ZShiftedOpening.ClaimedValue.Set(&bzuzeta) + + var linPol []fr.Element + if inst.lowMemory { + linPol = innerComputeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.h1, p.h2, p.h3, + ) + } else { + linPol = computeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.pi2DeviceReady, p.h1, p.h2, p.h3, + ) + } + p.h1, p.h2, p.h3, p.pi2Canonical, p.pi2DeviceReady = nil, nil, nil, nil, nil + + zOpenCommit, err := inst.commit(openZPoly[1:]) + if err != nil { + return err + } + p.proof.ZShiftedOpening.H = zOpenCommit + p.logTime("eval+linearize+open Z") + + linPolZetaCh := make(chan fr.Element, 1) + go func() { + linPolZetaCh <- polyEvalParallel(linPol, p.zeta) + }() + + linPolDigest, err := inst.commit(linPol) + if err != nil { + return err + } + p.logTime("MSM commit linPol") + + nPolysToOpen := 6 + len(inst.qcpCanonical) + claimedValues := make([]fr.Element, nPolysToOpen) + claimedValues[0] = <-linPolZetaCh + claimedValues[1] = blzeta + claimedValues[2] = brzeta + claimedValues[3] = bozeta + claimedValues[4] = s1Zeta + claimedValues[5] = s2Zeta + for i := range inst.qcpCanonical { + claimedValues[6+i] = qcpzeta[i] + } + + polysToOpen := make([][]fr.Element, nPolysToOpen) + polysToOpen[0] = linPol + polysToOpen[1] = p.lBlinded + polysToOpen[2] = p.rBlinded + polysToOpen[3] = p.oBlinded + polysToOpen[4] = inst.s1Canonical + polysToOpen[5] = inst.s2Canonical + for i := range inst.qcpCanonical { + polysToOpen[6+i] = inst.qcpCanonical[i] + } + + digestsToOpen := make([]curve.G1Affine, nPolysToOpen) + digestsToOpen[0] = linPolDigest + digestsToOpen[1] = p.proof.LRO[0] + digestsToOpen[2] = p.proof.LRO[1] + digestsToOpen[3] = p.proof.LRO[2] + digestsToOpen[4] = inst.vk.S[0] + digestsToOpen[5] = inst.vk.S[1] + copy(digestsToOpen[6:], inst.vk.Qcp) + + p.proof.BatchedProof, err = gpuBatchOpen( + inst.commit, + polysToOpen, digestsToOpen, claimedValues, + p.zeta, + p.kzgFoldingHash, + p.proof.ZShiftedOpening.ClaimedValue.Marshal(), + ) + if err != nil { + return fmt.Errorf("batch opening: %w", err) + } + p.logTime("batch opening") + return nil +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProve — top-level prove API +// ───────────────────────────────────────────────────────────────────────────── + +func GPUProve(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, fullWitness witness.Witness, opts ...backend.ProverOption) (*curplonk.Proof, error) { + proverCfg, err := backend.NewProverConfig(opts...) + if err != nil { + return nil, fmt.Errorf("create prover config: %w", err) + } + if proverCfg.HashToFieldFn == nil { + proverCfg.HashToFieldFn = newHTF([]byte("BSB22-Plonk")) + } + + gpk.mu.Lock() + defer gpk.mu.Unlock() + + if gpk.Vk == nil { + return nil, errors.New("gpu: proving key missing verifying key") + } + + proveStart := time.Now() + logTime := func(label string) { + log.Printf(" [GPUProve n=%d] %s: %v", gpk.n, label, time.Since(proveStart)) + } + + var commitmentInfo constraint.PlonkCommitments + if spr.CommitmentInfo != nil { + commitmentInfo = spr.CommitmentInfo.(constraint.PlonkCommitments) + } + + nbCommitments := len(commitmentInfo) + newProof := &curplonk.Proof{ + Bsb22Commitments: make([]curve.G1Affine, nbCommitments), + } + + msmInstReady := make(chan struct{}) + commitInstReady := make(chan struct{}) + traceInstReady := make(chan struct{}) + var ( + msmInstPublishOnce sync.Once + commitInstPublishOnce sync.Once + traceInstPublishOnce sync.Once + msmInst *gpuInstance + commitInst *gpuInstance + traceInst *gpuInstance + msmInstErr error + commitInstErr error + traceInstErr error + ) + publishMSMInst := func(inst *gpuInstance, err error) { + msmInstPublishOnce.Do(func() { + if err != nil { + msmInstErr = err + } else { + msmInst = inst + } + close(msmInstReady) + }) + } + waitMSMInst := func() (*gpuInstance, error) { + <-msmInstReady + if msmInstErr != nil { + return nil, msmInstErr + } + if msmInst == nil { + return nil, errors.New("gpu instance initialization did not publish an MSM-ready instance") + } + return msmInst, nil + } + publishCommitInst := func(inst *gpuInstance, err error) { + commitInstPublishOnce.Do(func() { + if err != nil { + commitInstErr = err + } else { + commitInst = inst + } + close(commitInstReady) + }) + } + waitCommitInst := func() (*gpuInstance, error) { + <-commitInstReady + if commitInstErr != nil { + return nil, commitInstErr + } + if commitInst == nil { + return nil, errors.New("gpu instance initialization did not publish a commitment-ready instance") + } + return commitInst, nil + } + publishTraceInst := func(inst *gpuInstance, err error) { + traceInstPublishOnce.Do(func() { + if err != nil { + traceInstErr = err + } else { + traceInst = inst + gpk.inst = inst + } + close(traceInstReady) + }) + } + waitInst := func() (*gpuInstance, error) { + <-traceInstReady + if traceInstErr != nil { + return nil, traceInstErr + } + if traceInst == nil { + return nil, errors.New("gpu instance initialization did not publish a trace-ready instance") + } + return traceInst, nil + } + + p := &gpuProver{ + proof: *newProof, + fs: fiatshamir.NewTranscript(proverCfg.ChallengeHash, "gamma", "beta", "alpha", "zeta"), + commitmentInfo: commitmentInfo, + commitmentVal: make([]fr.Element, nbCommitments), + pi2Canonical: make([][]fr.Element, nbCommitments), + pi2DeviceReady: make([]bool, nbCommitments), + solverOpts: proverCfg.SolverOpts, + kzgFoldingHash: proverCfg.KZGFoldingHash, + htfFunc: proverCfg.HashToFieldFn, + logTime: logTime, + waitInst: waitInst, + waitMSMInst: waitMSMInst, + waitCommitInst: waitCommitInst, + } + + // Overlap CPU solve with blinding-polynomial init and Qk patching, then + // feed results into a sequential GPU pipeline. Hides the solve latency + // (~400 ms at n=2^18) behind unrelated work; recovers ~20-30% end-to-end. + chSolved := make(chan struct{}) + chBlinding := make(chan struct{}) + chQk := make(chan struct{}) + + g, gctx := errgroup.WithContext(context.Background()) + + waitCh := func(ch <-chan struct{}) error { + select { + case <-gctx.Done(): + return gctx.Err() + case <-ch: + return nil + } + } + safeGo := func(label string, fn func() error) { + g.Go(func() error { return proveStep(label, fn) }) + } + + safeGo("initGPUInstance", func() error { + if gpk.inst != nil && gpk.inst.dev == dev { + publishMSMInst(gpk.inst, nil) + publishCommitInst(gpk.inst, nil) + publishTraceInst(gpk.inst, nil) + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + msmPublished := false + commitPublished := false + tracePublished := false + inst, err := newGPUInstance(dev, gpk, spr, gpuInstanceReadyHooks{ + msm: func(inst *gpuInstance) { + msmPublished = true + publishMSMInst(inst, nil) + }, + commit: func(inst *gpuInstance) { + commitPublished = true + publishCommitInst(inst, nil) + }, + trace: func(inst *gpuInstance) { + tracePublished = true + publishTraceInst(inst, nil) + logTime("trace-ready GPU instance") + }, + }) + if err != nil { + err = fmt.Errorf("init GPU instance: %w", err) + if !msmPublished { + publishMSMInst(nil, err) + } + if !commitPublished { + publishCommitInst(nil, err) + } + if !tracePublished { + publishTraceInst(nil, err) + } + return err + } + if !msmPublished { + publishMSMInst(inst, nil) + } + if !commitPublished { + publishCommitInst(inst, nil) + } + if !tracePublished { + publishTraceInst(inst, nil) + } + logTime("init GPU instance") + return nil + }) + + safeGo("initBlinding", func() error { + p.initBlindingPolynomials() + close(chBlinding) + return nil + }) + + safeGo("solve", func() error { + if err := p.solve(spr, fullWitness); err != nil { + return err + } + logTime("solve") + close(chSolved) + return nil + }) + + safeGo("completeQk", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + p.completeQk() + close(chQk) + return nil + }) + + safeGo("pipeline", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + commitInst, err := waitCommitInst() + if err != nil { + return err + } + if err := p.commitToLRO( + commitInst, + func() error { return waitCh(chQk) }, + func() error { return waitCh(chBlinding) }, + ); err != nil { + return err + } + if _, err := p.ensureInst(); err != nil { + return err + } + if err := p.deriveGammaBeta(); err != nil { + return err + } + if err := p.buildZAndCommit(); err != nil { + return err + } + if err := p.computeQuotientAndCommit(); err != nil { + return err + } + return p.openAndFinalize() // inst.gpuWork persists (owned by gpuInstance) + }) + + if err := g.Wait(); err != nil { + return nil, err + } + + logTime("total") + result := p.proof + return &result, nil +} + +// proveStep converts a panic in fn to a labeled error so goroutines +// surface panics as normal errors through the errgroup. +func proveStep(label string, fn func() error) (err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("%s panic: %v", label, r) + } + }() + return fn() +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helper functions (ported from gpu/plonk/prove.go) +// ───────────────────────────────────────────────────────────────────────────── + +func buildZGPU( + inst *gpuInstance, gpuWork *FrVector, + evalL, evalR, evalO fr.Vector, beta, gamma fr.Element, +) (fr.Vector, error) { + dev := inst.dev + domain0 := inst.domain0 + + gpuR := inst.qWb.R + gpuO := inst.qWb.O + if inst.lowMemory { + var err error + gpuR, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z R buffer: %w", err) + } + defer gpuR.Free() + gpuO, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z O buffer: %w", err) + } + defer gpuO.Free() + } + + gpuWork.CopyFromHost(evalL) + gpuR.CopyFromHost(evalR) + gpuO.CopyFromHost(evalO) + + gMul := domain0.FrMultiplicativeGen + var gSq fr.Element + gSq.Mul(&gMul, &gMul) + + PlonkZComputeFactors(gpuWork, gpuR, gpuO, inst.dPerm, + beta, gamma, gMul, gSq, inst.log2n, inst.fftDom) + gpuR.BatchInvert(gpuO) + gpuWork.Mul(gpuWork, gpuR) + ZPrefixProduct(dev, gpuR, gpuWork, gpuO) + gpuR.CopyToHost(inst.hBufs.zLagrange) + return inst.hBufs.zLagrange, nil +} + +func computeNumeratorGPU( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + if inst.lowMemory { + return computeNumeratorGPULowMemory( + inst, gpuWork, + lBlinded, rBlinded, oBlinded, zBlinded, + qkCanonical, pi2Canonical, + alpha, beta, gamma, + ) + } + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + // Pre-allocated buffers from gpuInstance (avoids per-proof cudaMalloc/Free). + wb := &inst.qWb + gpuL, gpuR, gpuO, gpuZ := wb.L, wb.R, wb.O, wb.Z + gpuS1, gpuS2, gpuS3 := wb.S1, wb.S2, wb.S3 + gpuResult := wb.Result + gpuLCan, gpuRCan, gpuOCan, gpuZCan := wb.LCan, wb.RCan, wb.OCan, wb.ZCan + gpuCosetBlocks := wb.CosetBlock + + // Event IDs used for cross-stream synchronisation in the 4-coset loop. + const ( + evS123Done gpu.EventID = 0 // StreamTransfer → StreamCompute: S1/S2/S3 D2D done + evPermDone gpu.EventID = 1 // StreamCompute → StreamTransfer: safe to overwrite gate buffers + evCosetDone gpu.EventID = 3 // StreamCompute → StreamTransfer: full coset k done + ) + + // L/R/O/Z canonical heads were produced on-device by the iFFT phases and + // adjusted for blinding there. Keep them resident for the quotient loop. + for j := range pi2Canonical { + if j >= len(pi2DeviceReady) || pi2DeviceReady[j] { + continue + } + if j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil && len(pi2Canonical[j]) == n { + wb.Pi2Src[j].CopyFromHost(fr.Vector(pi2Canonical[j])) + pi2DeviceReady[j] = true + } + } + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + // Stream 1 must finish before overwriting gpuS1/S2/S3 with the next coset's + // selectors. PermBoundary (end of previous coset) still holds reads on S1/S2/S3. + if k > 0 { + dev.WaitEvent(gpu.StreamTransfer, evCosetDone) + } + + // Stream 1: D2D perm selectors concurrent with L/R/O/Z reduce+FFT on stream 0. + gpuS1.CopyFromDeviceStream(inst.dS1, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dS2, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dS3, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + // Stream 0: reduce blinded canonicals and FFT while D2D runs concurrently. + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + // L₁ denominator inverse: gpuWork[i] = 1/(cosetGen·ω^i - 1) + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) // result is temp; inverses stored in gpuWork + + // l1Scalar = (cosetGen^n - 1) / n = zhZeta / n at this coset + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + // Gate selectors: overlap transfer-stream D2D copies with compute-stream FFTs. + dev.RecordEvent(gpu.StreamCompute, evPermDone) + + dev.WaitEvent(gpu.StreamTransfer, evPermDone) + gpuS1.CopyFromDeviceStream(inst.dQr, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dQm, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dQo, gpu.StreamTransfer) + gpuWork.CopyFromDeviceStream(wb.QkSrc, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + gpuZ.CopyFromDevice(inst.dQl) + fftDom.CosetFFT(gpuZ, cosetGen) + + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + gpuZ.CopyFromDevice(inst.dQcp[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil { + gpuWork.CopyFromDevice(wb.Pi2Src[j]) + } else { + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + // Store the first three coset results on GPU. Keep the fourth in gpuResult. + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + dev.RecordEvent(gpu.StreamCompute, evCosetDone) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func newLowMemorySelectorCache(inst *gpuInstance, allocated *[]*FrVector) lowMemorySelectorCache { + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY_SELECTOR_CACHE") != "" { + return lowMemorySelectorCache{} + } + + upload := func(name string, data fr.Vector) *FrVector { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + log.Printf("plonk2: low-memory selector cache stopped at %s: %v", name, err) + return nil + } + *allocated = append(*allocated, v) + v.CopyFromHost(data) + return v + } + + cache := lowMemorySelectorCache{ + ql: upload("ql", inst.qlCanonical), + qr: upload("qr", inst.qrCanonical), + qm: upload("qm", inst.qmCanonical), + qo: upload("qo", inst.qoCanonical), + s1: upload("s1", inst.s1Canonical), + s2: upload("s2", inst.s2Canonical), + s3: upload("s3", inst.s3Canonical), + } + if len(inst.qcpCanonical) > 0 { + cache.qcp = make([]*FrVector, len(inst.qcpCanonical)) + for i := range inst.qcpCanonical { + cache.qcp[i] = upload(fmt.Sprintf("qcp[%d]", i), inst.qcpCanonical[i]) + } + } + + qcpCached := 0 + for i := range cache.qcp { + if cache.qcp[i] != nil { + qcpCached++ + } + } + log.Printf( + "plonk2: low-memory selector cache ql=%t qr=%t qm=%t qo=%t s1=%t s2=%t s3=%t qcp=%d/%d", + cache.ql != nil, cache.qr != nil, cache.qm != nil, cache.qo != nil, + cache.s1 != nil, cache.s2 != nil, cache.s3 != nil, + qcpCached, len(inst.qcpCanonical), + ) + return cache +} + +func computeNumeratorGPULowMemory( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + if len(qkCanonical) < n { + return nil, nil, nil, fmt.Errorf("low-memory quotient: qk canonical length %d < %d", len(qkCanonical), n) + } + + var allocated []*FrVector + alloc := func(name string) (*FrVector, error) { + v, err := NewFrVector(inst.dev, n) + if err != nil { + return nil, fmt.Errorf("alloc %s: %w", name, err) + } + allocated = append(allocated, v) + return v, nil + } + defer func() { + for _, v := range allocated { + v.Free() + } + }() + + gpuL, err := alloc("L") + if err != nil { + return nil, nil, nil, err + } + gpuR, err := alloc("R") + if err != nil { + return nil, nil, nil, err + } + gpuO, err := alloc("O") + if err != nil { + return nil, nil, nil, err + } + gpuZ, err := alloc("Z") + if err != nil { + return nil, nil, nil, err + } + gpuS1, err := alloc("S1") + if err != nil { + return nil, nil, nil, err + } + gpuS2, err := alloc("S2") + if err != nil { + return nil, nil, nil, err + } + gpuS3, err := alloc("S3") + if err != nil { + return nil, nil, nil, err + } + gpuResult, err := alloc("Result") + if err != nil { + return nil, nil, nil, err + } + gpuLCan, err := alloc("LCan") + if err != nil { + return nil, nil, nil, err + } + gpuRCan, err := alloc("RCan") + if err != nil { + return nil, nil, nil, err + } + gpuOCan, err := alloc("OCan") + if err != nil { + return nil, nil, nil, err + } + gpuZCan, err := alloc("ZCan") + if err != nil { + return nil, nil, nil, err + } + gpuQkSrc, err := alloc("QkSrc") + if err != nil { + return nil, nil, nil, err + } + var gpuCosetBlocks [3]*FrVector + for k := range gpuCosetBlocks { + gpuCosetBlocks[k], err = alloc(fmt.Sprintf("CosetBlock%d", k)) + if err != nil { + return nil, nil, nil, err + } + } + selectorCache := newLowMemorySelectorCache(inst, &allocated) + copySelector := func(dst, device *FrVector, host fr.Vector) { + if device != nil { + dst.CopyFromDevice(device) + return + } + dst.CopyFromHost(host) + } + + gpuLCan.CopyFromHost(fr.Vector(lBlinded[:n])) + gpuRCan.CopyFromHost(fr.Vector(rBlinded[:n])) + gpuOCan.CopyFromHost(fr.Vector(oBlinded[:n])) + gpuZCan.CopyFromHost(fr.Vector(zBlinded[:n])) + gpuQkSrc.CopyFromHost(fr.Vector(qkCanonical[:n])) + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + copySelector(gpuS1, selectorCache.s1, inst.s1Canonical) + copySelector(gpuS2, selectorCache.s2, inst.s2Canonical) + copySelector(gpuS3, selectorCache.s3, inst.s3Canonical) + + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) + + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + copySelector(gpuS1, selectorCache.qr, inst.qrCanonical) + copySelector(gpuS2, selectorCache.qm, inst.qmCanonical) + copySelector(gpuS3, selectorCache.qo, inst.qoCanonical) + gpuWork.CopyFromDevice(gpuQkSrc) + copySelector(gpuZ, selectorCache.ql, inst.qlCanonical) + + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + var qcpDevice *FrVector + if j < len(selectorCache.qcp) { + qcpDevice = selectorCache.qcp[j] + } + copySelector(gpuZ, qcpDevice, inst.qcpCanonical[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("low-memory quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func gpuCommit(msm *G1MSM, coeffs []fr.Element) (curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + var aff curve.G1Affine + aff.FromJacobian(&jacs[0]) + return aff, nil +} + +func gpuCommitN(msm *G1MSM, coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffSets...) + if err != nil { + return nil, err + } + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) commit(coeffs []fr.Element) (curve.G1Affine, error) { + commits, err := inst.commitN(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + return commits[0], nil +} + +func (inst *gpuInstance) commitN(coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + if inst.lowMemory { + if err := inst.reloadMSMPoints(); err != nil { + return nil, fmt.Errorf("reload MSM points: %w", err) + } + defer func() { + _ = inst.releaseMSMWorkBuffers() + _ = inst.offloadMSMPoints() + }() + } + var jacs []curve.G1Jac + var err error + if inst.splitMSM != nil { + jacs, err = MultiExpSplitBatchAt(inst.splitMSM.msm0, inst.splitMSM.msm1, inst.splitMSM.split, coeffSets...) + } else { + jacs, err = inst.msm.MultiExp(coeffSets...) + } + if err != nil { + return nil, err + } + inst.logMSMPhaseTimings(coeffSets...) + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) logMSMPhaseTimings(coeffSets ...[]fr.Element) { + if os.Getenv("GNARK_GPU_PLONK2_LOG_MSM_PHASES") == "" { + return + } + counts := make([]int, len(coeffSets)) + for i := range coeffSets { + counts[i] = len(coeffSets[i]) + } + if inst.splitMSM != nil { + primaryCounts := make([]int, len(coeffSets)) + secondaryCounts := make([]int, len(coeffSets)) + for i, count := range counts { + primaryCounts[i] = inst.splitMSM.split + if count < primaryCounts[i] { + primaryCounts[i] = count + } + secondaryCounts[i] = count - primaryCounts[i] + } + logMSMPhaseTimings(inst.n, "primary", inst.splitMSM.msm0.LastBatchPhaseTimings(), primaryCounts) + logMSMPhaseTimings(inst.n, "secondary", inst.splitMSM.msm1.LastBatchPhaseTimings(), secondaryCounts) + return + } + logMSMPhaseTimings(inst.n, "single", inst.msm.LastBatchPhaseTimings(), counts) +} + +func logMSMPhaseTimings(n int, device string, timings [][9]float32, scalarCounts []int) { + names := [...]string{ + "h2d", "build_pairs", "sort", "boundaries", "accum_seq", + "accum_par", "reduce_partial", "reduce_finalize", "d2h", + } + for i, phase := range timings { + total := float32(0) + for _, ms := range phase { + total += ms + } + scalars := 0 + if i < len(scalarCounts) { + scalars = scalarCounts[i] + } + log.Printf( + " [GPUProve n=%d] MSM phases device=%s set=%d scalars=%d total=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms", + n, device, i, scalars, total, + names[0], phase[0], names[1], phase[1], names[2], phase[2], + names[3], phase[3], names[4], phase[4], names[5], phase[5], + names[6], phase[6], names[7], phase[7], names[8], phase[8], + ) + } +} + +func gpuBatchOpen( + commit func([]fr.Element) (curve.G1Affine, error), + polys [][]fr.Element, + digests []curve.G1Affine, + claimedValues []fr.Element, + point fr.Element, + kzgFoldingHash hash.Hash, + dataTranscript []byte, +) (kzg.BatchOpeningProof, error) { + var res kzg.BatchOpeningProof + res.ClaimedValues = claimedValues + + fsGamma := fiatshamir.NewTranscript(kzgFoldingHash, "gamma") + if err := fsGamma.Bind("gamma", point.Marshal()); err != nil { + return res, err + } + for i := range digests { + if err := fsGamma.Bind("gamma", digests[i].Marshal()); err != nil { + return res, err + } + } + for i := range claimedValues { + if err := fsGamma.Bind("gamma", claimedValues[i].Marshal()); err != nil { + return res, err + } + } + if len(dataTranscript) > 0 { + if err := fsGamma.Bind("gamma", dataTranscript); err != nil { + return res, err + } + } + gammaByte, err := fsGamma.ComputeChallenge("gamma") + if err != nil { + return res, err + } + var gammaChallenge fr.Element + gammaChallenge.SetBytes(gammaByte) + + nbPolys := len(polys) + largestPoly := 0 + for _, p := range polys { + if len(p) > largestPoly { + largestPoly = len(p) + } + } + + gammas := make([]fr.Element, nbPolys) + gammas[0].SetOne() + for i := 1; i < nbPolys; i++ { + gammas[i].Mul(&gammas[i-1], &gammaChallenge) + } + + folded := make(fr.Vector, largestPoly) + nCPU := runtime.NumCPU() + chunkSize := (largestPoly + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < largestPoly; c += chunkSize { + start := c + end := start + chunkSize + if end > largestPoly { + end = largestPoly + } + wg.Add(1) + go func() { + defer wg.Done() + temp := make(fr.Vector, end-start) + for i := range nbPolys { + effEnd := end + if effEnd > len(polys[i]) { + effEnd = len(polys[i]) + } + if start >= effEnd { + continue + } + n := effEnd - start + t := fr.Vector(temp[:n]) + t.ScalarMul(fr.Vector(polys[i][start:effEnd]), &gammas[i]) + f := fr.Vector(folded[start:effEnd]) + f.Add(f, t) + } + }() + } + wg.Wait() + + var foldedEval fr.Element + for i := nbPolys - 1; i >= 0; i-- { + foldedEval.Mul(&foldedEval, &gammaChallenge).Add(&foldedEval, &claimedValues[i]) + } + folded[0].Sub(&folded[0], &foldedEval) + parallelHornerQuotient(folded, point) + h := folded[1:] + + res.H, err = commit(h) + if err != nil { + return res, err + } + return res, nil +} + +func computeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + h1, h2, h3 []fr.Element, +) []fr.Element { + n := inst.n + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + // Pre-allocated GPU buffers from gpuInstance (guaranteed non-nil after newGPUInstance). + gpuResult := inst.qWb.LinResult + gpuW := inst.qWb.LinW + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + PlonkLinearizeStatic( + gpuResult, inst.qWb.ZCan, inst.dS3, + inst.dQl, inst.dQr, inst.dQm, inst.dQo, inst.dQkFixed, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta, + ) + + for j := range qcpZeta { + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[j] != nil { + gpuW.CopyFromDevice(inst.qWb.Pi2Src[j]) + } else { + gpuW.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + gpuResult.AddScalarMul(gpuW, qcpZeta[j]) + } + + var negCoeff fr.Element + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Mul(&negCoeff, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h3[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h2[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Neg(&zhZeta) + gpuW.CopyFromHost(fr.Vector(h1[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + gpuResult.CopyToHost(fr.Vector(blindedZCanonical[:n])) + + for i := n; i < len(blindedZCanonical); i++ { + var t fr.Element + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + return blindedZCanonical +} + +func innerComputeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, + h1, h2, h3 []fr.Element, +) []fr.Element { + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + s3can := []fr.Element(inst.s3Canonical) + cql := []fr.Element(inst.qlCanonical) + cqr := []fr.Element(inst.qrCanonical) + cqm := []fr.Element(inst.qmCanonical) + cqo := []fr.Element(inst.qoCanonical) + cqk := []fr.Element(inst.qkFixedCanonical) + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + + total := len(blindedZCanonical) + nCPU := runtime.NumCPU() + chunkSize := (total + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < total; c += chunkSize { + start := c + end := start + chunkSize + if end > total { + end = total + } + wg.Add(1) + go func() { + defer wg.Done() + var t, t0, t1 fr.Element + for i := start; i < end; i++ { + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(s3can) { + t0.Mul(&s3can[i], &s1) + t.Add(&t, &t0) + } + if i < len(cqm) { + t1.Mul(&cqm[i], &rl) + t.Add(&t, &t1) + t0.Mul(&cql[i], &lZeta) + t.Add(&t, &t0) + t0.Mul(&cqr[i], &rZeta) + t.Add(&t, &t0) + t0.Mul(&cqo[i], &oZeta) + t.Add(&t, &t0) + t.Add(&t, &cqk[i]) + } + for j := range qcpZeta { + if i < len(pi2Canonical[j]) { + t0.Mul(&pi2Canonical[j][i], &qcpZeta[j]) + t.Add(&t, &t0) + } + } + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + }() + } + wg.Wait() + return blindedZCanonical +} + +// ─── Polynomial helpers ─────────────────────────────────────────────────────── + +func blindInto(dst []fr.Element, canonical []fr.Element, bp *iop.Polynomial) []fr.Element { + cbp := bp.Coefficients() + result := dst[:len(canonical)+len(cbp)] + copy(result, canonical) + copy(result[len(canonical):], cbp) + for i := 0; i < len(cbp); i++ { + result[i].Sub(&result[i], &cbp[i]) + } + return result +} + +func getRandomPolynomial(degree int) *iop.Polynomial { + coeffs := make([]fr.Element, degree+1) + for i := range coeffs { + coeffs[i].SetRandom() + } + return iop.NewPolynomial(&coeffs, iop.Form{Basis: iop.Canonical, Layout: iop.Regular}) +} + +func parallelHornerQuotient(poly []fr.Element, z fr.Element) { + n := len(poly) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + for i := n - 2; i >= 0; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + return + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + var wg sync.WaitGroup + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + wg.Add(1) + go func(lo, hi int) { + defer wg.Done() + for i := hi - 2; i >= lo; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + }(lo, hi) + } + wg.Wait() + zk := expElement(z, chunkSize) + carries := make([]fr.Element, numChunks) + for c := numChunks - 2; c >= 0; c-- { + nextLo := (c + 1) * chunkSize + nextLen := chunkSize + if nextLo+nextLen > n { + nextLen = n - nextLo + } + zkc := zk + if nextLen != chunkSize { + zkc = expElement(z, nextLen) + } + var tmp fr.Element + tmp.Mul(&carries[c+1], &zkc) + carries[c].Add(&poly[nextLo], &tmp) + } + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + if carries[c].IsZero() { + continue + } + wg.Add(1) + go func(lo, hi, c int) { + defer wg.Done() + var zPow fr.Element + zPow.Set(&z) + for i := hi - 1; i >= lo; i-- { + var corr fr.Element + corr.Mul(&zPow, &carries[c]) + poly[i].Add(&poly[i], &corr) + zPow.Mul(&zPow, &z) + } + }(lo, hi, c) + } + wg.Wait() +} + +func expElement(z fr.Element, exp int) fr.Element { + var base, acc fr.Element + base.Set(&z) + acc.SetOne() + for exp > 0 { + if exp&1 != 0 { + acc.Mul(&acc, &base) + } + base.Square(&base) + exp >>= 1 + } + return acc +} + +// ─── Fiat-Shamir helpers ────────────────────────────────────────────────────── + +func bindPublicData(fs *fiatshamir.Transcript, challenge string, vk *curplonk.VerifyingKey, publicInputs []fr.Element) error { + for _, f := range []func() []byte{ + func() []byte { return vk.S[0].Marshal() }, + func() []byte { return vk.S[1].Marshal() }, + func() []byte { return vk.S[2].Marshal() }, + func() []byte { return vk.Ql.Marshal() }, + func() []byte { return vk.Qr.Marshal() }, + func() []byte { return vk.Qm.Marshal() }, + func() []byte { return vk.Qo.Marshal() }, + func() []byte { return vk.Qk.Marshal() }, + } { + if err := fs.Bind(challenge, f()); err != nil { + return err + } + } + for i := range vk.Qcp { + if err := fs.Bind(challenge, vk.Qcp[i].Marshal()); err != nil { + return err + } + } + for i := range publicInputs { + if err := fs.Bind(challenge, publicInputs[i].Marshal()); err != nil { + return err + } + } + return nil +} + +func deriveRandomness(fs *fiatshamir.Transcript, challenge string, points ...*curve.G1Affine) (fr.Element, error) { + var buf [curve.SizeOfG1AffineUncompressed]byte + var r fr.Element + for _, p := range points { + buf = p.RawBytes() + if err := fs.Bind(challenge, buf[:]); err != nil { + return r, err + } + } + b, err := fs.ComputeChallenge(challenge) + if err != nil { + return r, err + } + r.SetBytes(b) + return r, nil +} + +func newHTF(domain []byte) hash.Hash { + return htf.New(domain) +} + +// ─── suppress unused imports ────────────────────────────────────────────────── +var _ = bits.TrailingZeros +var _ = unsafe.Pointer(nil) diff --git a/prover/gpu/internal/generator/plonk/template/prove_stub.go.tmpl b/prover/gpu/internal/generator/plonk/template/prove_stub.go.tmpl new file mode 100644 index 00000000000..09375344092 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/prove_stub.go.tmpl @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package {{.Package}} + +import ( + "errors" + + "github.com/consensys/gnark/backend" + curplonk "{{.GnarkPlonk}}" + "github.com/consensys/gnark/backend/witness" + cs "{{.GnarkCS}}" + "github.com/consensys/linea-monorepo/prover/gpu" + curve "{{.GnarkCurve}}" +) + +type GPUProvingKey struct { + Vk *curplonk.VerifyingKey +} + +func NewGPUProvingKey(_ []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + return &GPUProvingKey{Vk: vk} +} + +func (gpk *GPUProvingKey) Size() int { return 0 } +func (gpk *GPUProvingKey) Prepare(_ *gpu.Device, _ *cs.SparseR1CS) error { + return errors.New("gpu: cuda required") +} +func (gpk *GPUProvingKey) Close() {} + +func GPUProve(_ *gpu.Device, _ *GPUProvingKey, _ *cs.SparseR1CS, _ witness.Witness, _ ...backend.ProverOption) (*curplonk.Proof, error) { + return nil, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/internal/generator/plonk/template/templates.go b/prover/gpu/internal/generator/plonk/template/templates.go new file mode 100644 index 00000000000..824d6126ccd --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/templates.go @@ -0,0 +1,54 @@ +package template + +import _ "embed" + +//go:embed doc.go.tmpl +var DocTemplate string + +//go:embed cgo.go.tmpl +var CgoTemplate string + +//go:embed fr.go.tmpl +var FrTemplate string + +//go:embed fr_stub.go.tmpl +var FrStubTemplate string + +//go:embed fr_test.go.tmpl +var FrTestTemplate string + +//go:embed fft.go.tmpl +var FFTTemplate string + +//go:embed fft_stub.go.tmpl +var FFTStubTemplate string + +//go:embed fft_test.go.tmpl +var FFTTestTemplate string + +//go:embed msm.go.tmpl +var MSMTemplate string + +//go:embed msm_stub.go.tmpl +var MSMStubTemplate string + +//go:embed msm_test.go.tmpl +var MSMTestTemplate string + +//go:embed kernels.go.tmpl +var KernelsTemplate string + +//go:embed kernels_stub.go.tmpl +var KernelsStubTemplate string + +//go:embed pinned_fr.go.tmpl +var PinnedFrTemplate string + +//go:embed prove.go.tmpl +var ProveTemplate string + +//go:embed prove_stub.go.tmpl +var ProveStubTemplate string + +//go:embed plonk_test.go.tmpl +var PlonkTestTemplate string diff --git a/prover/gpu/plonk2/bls12377/cgo.go b/prover/gpu/plonk2/bls12377/cgo.go new file mode 100644 index 00000000000..14e69913ed1 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/cgo.go @@ -0,0 +1,44 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#cgo LDFLAGS: -L${SRCDIR}/../../cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../../cuda/include + +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// curve returns the C curve identifier for bls12377, baked in at generation time. +func curveID() C.gnark_gpu_plonk2_curve_id_t { + return C.gnark_gpu_plonk2_curve_id_t(2) +} + +func devCtx(d *gpu.Device) C.gnark_gpu_context_t { + return C.gnark_gpu_context_t(d.Handle()) +} + +func toError(code C.gnark_gpu_error_t) error { + switch code { + case C.GNARK_GPU_SUCCESS: + return nil + case C.GNARK_GPU_ERROR_CUDA: + return &gpu.Error{Code: int(code), Message: "CUDA error"} + case C.GNARK_GPU_ERROR_INVALID_ARG: + return &gpu.Error{Code: int(code), Message: "invalid argument"} + case C.GNARK_GPU_ERROR_OUT_OF_MEMORY: + return &gpu.Error{Code: int(code), Message: "out of GPU memory"} + case C.GNARK_GPU_ERROR_SIZE_MISMATCH: + return &gpu.Error{Code: int(code), Message: "vector size mismatch"} + default: + return &gpu.Error{Code: int(code), Message: "unknown error"} + } +} diff --git a/prover/gpu/plonk2/bls12377/doc.go b/prover/gpu/plonk2/bls12377/doc.go new file mode 100644 index 00000000000..4b178158da2 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/doc.go @@ -0,0 +1,7 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +// Package bls12377 provides GPU-accelerated PlonK operations for the bls12377 curve. +// +// Generated from gpu/internal/generator. Do not edit by hand. +// Re-generate with: cd gpu/internal/generator && go run . +package bls12377 diff --git a/prover/gpu/plonk2/bls12377/fft.go b/prover/gpu/plonk2/bls12377/fft.go new file mode 100644 index 00000000000..f2f838eb664 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fft.go @@ -0,0 +1,211 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "math/big" + "runtime" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain holds GPU-resident twiddle factors for NTT operations over the +// bls12377 scalar field. +// +// All NTT operations accept an optional StreamID. When provided, the operation +// is dispatched on that CUDA stream (non-blocking). When omitted, the default +// stream (stream 0) is used. +type GPUFFTDomain struct { + handle C.gnark_gpu_plonk2_ntt_domain_t + dev *gpu.Device + size int +} + +// NewFFTDomain creates a GPU NTT domain of the given size (must be a power of 2). +// +// Twiddle factors are computed using gnark-crypto's fft.Domain, then uploaded +// to GPU in AoS format. This is a one-time cost per domain size. +func NewFFTDomain(dev *gpu.Device, size int) (*GPUFFTDomain, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if size <= 0 || (size&(size-1)) != 0 { + return nil, &gpu.Error{Code: -1, Message: "size must be a positive power of 2"} + } + + domain := fft.NewDomain(uint64(size)) + halfN := size / 2 + + fwdTwiddles := make([]fr.Element, halfN) + invTwiddles := make([]fr.Element, halfN) + if halfN > 0 { + fwdTwiddles[0].SetOne() + invTwiddles[0].SetOne() + for i := 1; i < halfN; i++ { + fwdTwiddles[i].Mul(&fwdTwiddles[i-1], &domain.Generator) + invTwiddles[i].Mul(&invTwiddles[i-1], &domain.GeneratorInv) + } + } + + invN := domain.CardinalityInv + + var fwdPtr, invPtr *C.uint64_t + if halfN > 0 { + fwdPtr = (*C.uint64_t)(unsafe.Pointer(&fwdTwiddles[0])) + invPtr = (*C.uint64_t)(unsafe.Pointer(&invTwiddles[0])) + } + + var handle C.gnark_gpu_plonk2_ntt_domain_t + if err := toError(C.gnark_gpu_plonk2_ntt_domain_create( + devCtx(dev), + curveID(), + C.size_t(size), + fwdPtr, + invPtr, + (*C.uint64_t)(unsafe.Pointer(&invN)), + &handle, + )); err != nil { + return nil, err + } + + dom := &GPUFFTDomain{handle: handle, dev: dev, size: size} + runtime.SetFinalizer(dom, (*GPUFFTDomain).Close) + return dom, nil +} + +// Size returns the domain size. +func (f *GPUFFTDomain) Size() int { return f.size } + +// Close releases GPU resources. Safe to call multiple times. +func (f *GPUFFTDomain) Close() { + if f.handle != nil { + C.gnark_gpu_plonk2_ntt_domain_destroy(f.handle) + f.handle = nil + runtime.SetFinalizer(f, nil) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Forward / Inverse FFT +// ───────────────────────────────────────────────────────────────────────────── + +// FFT performs a forward NTT (DIF): natural-order input → bit-reversed output. +func (f *GPUFFTDomain) FFT(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFT size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_forward_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_forward(f.handle, v.handle)); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } +} + +// FFTInverse performs an inverse NTT (DIT): bit-reversed input → natural-order output. +// The result is scaled by 1/n. +func (f *GPUFFTDomain) FFTInverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFTInverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_inverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_inverse(f.handle, v.handle)); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } +} + +// BitReverse applies the bit-reversal permutation. +func (f *GPUFFTDomain) BitReverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: BitReverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse(f.handle, v.handle)); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Coset FFT +// +// CosetFFT evaluates p(X) on coset g·H = {g·ω^i : i=0..n-1}. +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// ───────────────────────────────────────────────────────────────────────────── + +// CosetFFT evaluates a polynomial in canonical form on coset g·H. +// Input: v holds canonical coefficients in natural order. +// Output: v holds p(g·ω⁰), p(g·ω¹), …, p(g·ωⁿ⁻¹) in natural order. +// +// Implemented as: ScaleByPowers(g) → FFT → BitReverse. +func (f *GPUFFTDomain) CosetFFT(v *FrVector, g fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFT size mismatch") + } + v.ScaleByPowers(g, stream...) + f.FFT(v, stream...) + f.BitReverse(v, stream...) +} + +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// gInv must be the inverse of the coset generator g. +// +// Implemented as: BitReverse → FFTInverse → ScaleByPowers(gInv). +func (f *GPUFFTDomain) CosetFFTInverse(v *FrVector, gInv fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFTInverse size mismatch") + } + f.BitReverse(v, stream...) + f.FFTInverse(v, stream...) + v.ScaleByPowers(gInv, stream...) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Butterfly4Inverse — decomposed iFFT(4n) for quotient computation +// ───────────────────────────────────────────────────────────────────────────── + +// Butterfly4Inverse applies a size-4 inverse DFT butterfly across 4 FrVectors. +// +// omega4Inv: inverse of the primitive 4th root of unity. +// quarter: 1/4 in Montgomery form. +func Butterfly4Inverse(b0, b1, b2, b3 *FrVector, omega4Inv, quarter fr.Element) { + if b0.n != b1.n || b1.n != b2.n || b2.n != b3.n { + panic("gpu: Butterfly4Inverse size mismatch") + } + if b0.dev != b1.dev || b1.dev != b2.dev || b2.dev != b3.dev { + panic("gpu: Butterfly4Inverse device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_butterfly4_inverse( + devCtx(b0.dev), + b0.handle, b1.handle, b2.handle, b3.handle, + (*C.uint64_t)(unsafe.Pointer(&omega4Inv)), + (*C.uint64_t)(unsafe.Pointer(&quarter)), + )); err != nil { + panic("gpu: Butterfly4Inverse failed: " + err.Error()) + } +} + +// ─── suppress unused import ─────────────────────────────────────────────────── +var _ = big.NewInt diff --git a/prover/gpu/plonk2/bls12377/fft_stub.go b/prover/gpu/plonk2/bls12377/fft_stub.go new file mode 100644 index 00000000000..f51b9f9636a --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fft_stub.go @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bls12377 + +import ( + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain is a stub for non-CUDA builds. +type GPUFFTDomain struct{} + +func NewFFTDomain(_ *gpu.Device, _ int) (*GPUFFTDomain, error) { + return nil, gpu.ErrDeviceClosed +} + +func (f *GPUFFTDomain) Size() int { return 0 } +func (f *GPUFFTDomain) Close() {} +func (f *GPUFFTDomain) FFT(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) FFTInverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) BitReverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFT(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFTInverse(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} + +func Butterfly4Inverse(_, _, _, _ *FrVector, _, _ fr.Element) { panic("gpu: cuda required") } diff --git a/prover/gpu/plonk2/bls12377/fft_test.go b/prover/gpu/plonk2/bls12377/fft_test.go new file mode 100644 index 00000000000..44b9abdb943 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fft_test.go @@ -0,0 +1,188 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377_test + +import ( + "fmt" + "testing" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bls12377" + "github.com/stretchr/testify/require" +) + +func newDomain(t testing.TB, dev *gpu.Device, size int) *bls12377.GPUFFTDomain { + t.Helper() + dom, err := bls12377.NewFFTDomain(dev, size) + require.NoError(t, err) + t.Cleanup(func() { dom.Close() }) + return dom +} + +// TestFFTRoundtrip verifies FFT(FFTInverse(v)) == v. +func TestFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16, 20} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + gV := newGPUVec(t, dev, orig) + + dom.FFT(gV) + dom.FFTInverse(gV) + dom.BitReverse(gV) // FFTInverse expects bit-reversed input; FFT output is bit-reversed + dev.Sync() + + // Actually test FFTInverse(FFT(v)) == v: + // FFT: natural → bit-reversed + // FFTInverse: bit-reversed → natural (scaled by 1/n) + // So we need FFTInverse after FFT directly. + gV2 := newGPUVec(t, dev, orig) + dom.FFT(gV2) + dom.FFTInverse(gV2) + dev.Sync() + + result := make(fr.Vector, n) + gV2.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "FFTInverse(FFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestCosetFFTRoundtrip verifies CosetFFT(CosetFFTInverse(v)) == v. +func TestCosetFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + var gInv fr.Element + gInv.Inverse(&g) + + gV := newGPUVec(t, dev, orig) + dom.CosetFFT(gV, g) + dom.CosetFFTInverse(gV, gInv) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "CosetFFTInverse(CosetFFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestFFTMatchesCPU verifies GPU FFT output matches gnark-crypto CPU FFT. +func TestFFTMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const logN = 14 + n := 1 << logN + + dom := newDomain(t, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + + orig := randFrVec(n) + cpuCopy := make(fr.Vector, n) + copy(cpuCopy, orig) + + // CPU FFT + cpuDom.FFT(cpuCopy, fft.DIF) + fft.BitReverse(cpuCopy) + + // GPU FFT (DIF: natural → bit-reversed, then BitReverse → natural) + gV := newGPUVec(t, dev, orig) + dom.FFT(gV) // natural → bit-reversed + dom.BitReverse(gV) // bit-reversed → natural + dev.Sync() + + gpuResult := make(fr.Vector, n) + gV.CopyToHost(gpuResult) + + for i := range cpuCopy { + require.True(t, cpuCopy[i].Equal(&gpuResult[i]), + "FFT mismatch at i=%d", i) + } +} + +// BenchmarkFFTForward benchmarks GPU forward NTT. +func BenchmarkFFTForward(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFT(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkFFTInverse benchmarks GPU inverse NTT. +func BenchmarkFFTInverse(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + dom.FFT(gV) // put into bit-reversed form first + dev.Sync() + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFTInverse(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkCosetFFT benchmarks GPU coset FFT. +func BenchmarkCosetFFT(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Re-upload canonical coefficients before each run + gV.CopyFromHost(src) + dom.CosetFFT(gV, g) + dev.Sync() + } + }) + } +} diff --git a/prover/gpu/plonk2/bls12377/fr.go b/prover/gpu/plonk2/bls12377/fr.go new file mode 100644 index 00000000000..9aaed4370b2 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fr.go @@ -0,0 +1,270 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "runtime" + "sync" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector holds a vector of bls12377 scalar-field (Fr) elements on the GPU +// in Structure-of-Arrays (SoA) layout for coalesced memory access. +// +// All elements are in Montgomery form. GPU memory is SoA by limb; host memory +// uses gnark-crypto AoS Montgomery layout. +// +// All operations accept an optional gpu.StreamID. When omitted, the default +// stream (stream 0) is used. +type FrVector struct { + handle C.gnark_gpu_plonk2_fr_vector_t + dev *gpu.Device + n int +} + +var hostTransferMu sync.Mutex + +// NewFrVector allocates GPU memory for n Fr elements on dev. +// A finalizer is installed; call Free for deterministic VRAM release. +func NewFrVector(dev *gpu.Device, n int) (*FrVector, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if n <= 0 { + return nil, &gpu.Error{Code: -1, Message: "count must be positive"} + } + + var handle C.gnark_gpu_plonk2_fr_vector_t + if err := toError(C.gnark_gpu_plonk2_fr_vector_alloc( + devCtx(dev), curveID(), C.size_t(n), &handle, + )); err != nil { + return nil, err + } + + v := &FrVector{handle: handle, dev: dev, n: n} + runtime.SetFinalizer(v, (*FrVector).Free) + return v, nil +} + +// Free releases GPU memory. Safe to call multiple times. +func (v *FrVector) Free() { + if v.handle != nil { + v.bind() + C.gnark_gpu_plonk2_fr_vector_free(v.handle) + v.handle = nil + runtime.SetFinalizer(v, nil) + } +} + +// Len returns the number of elements. +func (v *FrVector) Len() int { return v.n } + +func (v *FrVector) bind() { + if err := v.dev.Bind(); err != nil { + panic("gpu: bind device failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Host ↔ Device transfers +// ───────────────────────────────────────────────────────────────────────────── + +// CopyFromHost copies host data (AoS) to GPU (SoA). Panics on size mismatch. +func (v *FrVector) CopyFromHost(src fr.Vector, _ ...gpu.StreamID) { + if len(src) != v.n { + panic("gpu: CopyFromHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_device( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&src[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyFromHost failed: " + err.Error()) + } +} + +// CopyToHost copies GPU data (SoA) back to host (AoS). Panics on size mismatch. +func (v *FrVector) CopyToHost(dst fr.Vector, _ ...gpu.StreamID) { + if len(dst) != v.n { + panic("gpu: CopyToHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_host( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&dst[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyToHost failed: " + err.Error()) + } +} + +// CopyFromDevice copies src to v (GPU-to-GPU). Panics on size or device mismatch. +func (v *FrVector) CopyFromDevice(src *FrVector, _ ...gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDevice size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDevice device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d( + devCtx(v.dev), v.handle, src.handle, + )); err != nil { + panic("gpu: CopyFromDevice failed: " + err.Error()) + } +} + +// CopyFromDeviceStream copies src to v (GPU-to-GPU) on a specific stream. +// Panics on size or device mismatch. +func (v *FrVector) CopyFromDeviceStream(src *FrVector, streamID gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDeviceStream size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDeviceStream device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d_stream( + devCtx(v.dev), v.handle, src.handle, C.int(streamID), + )); err != nil { + panic("gpu: CopyFromDeviceStream failed: " + err.Error()) + } +} + +// SetZero sets all elements to zero. +func (v *FrVector) SetZero(_ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_set_zero( + devCtx(v.dev), v.handle, + )); err != nil { + panic("gpu: SetZero failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Element-wise arithmetic (all async on the default stream) +// ───────────────────────────────────────────────────────────────────────────── + +func mustSameDeviceAndSize(v, a, b *FrVector) { + if v.n != a.n || a.n != b.n { + panic("gpu: vector size mismatch") + } + if v.dev != a.dev || a.dev != b.dev { + panic("gpu: vectors from different devices") + } +} + +// Mul computes v[i] = a[i] · b[i] (mod r). +func (v *FrVector) Mul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_mul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Mul failed: " + err.Error()) + } +} + +// Add computes v[i] = a[i] + b[i] (mod r). +func (v *FrVector) Add(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_add( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Add failed: " + err.Error()) + } +} + +// Sub computes v[i] = a[i] - b[i] (mod r). +func (v *FrVector) Sub(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_sub( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Sub failed: " + err.Error()) + } +} + +// AddMul computes v[i] += a[i] · b[i] (mod r). +func (v *FrVector) AddMul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_addmul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: AddMul failed: " + err.Error()) + } +} + +// AddScalarMul computes v[i] += a[i] · scalar (mod r). +func (v *FrVector) AddScalarMul(a *FrVector, scalar fr.Element, _ ...gpu.StreamID) { + if v.n != a.n { + panic("gpu: AddScalarMul size mismatch") + } + if v.dev != a.dev { + panic("gpu: AddScalarMul device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_add_scalar_mul( + devCtx(v.dev), v.handle, a.handle, + (*C.uint64_t)(unsafe.Pointer(&scalar)), + )); err != nil { + panic("gpu: AddScalarMul failed: " + err.Error()) + } +} + +// ScalarMul computes v[i] *= c (mod r) for all i. +func (v *FrVector) ScalarMul(c fr.Element, _ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scalar_mul( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&c)), + )); err != nil { + panic("gpu: ScalarMul failed: " + err.Error()) + } +} + +// ScaleByPowers computes v[i] *= g^i for i in [0, n). +// Used for coset FFT shifting. +func (v *FrVector) ScaleByPowers(g fr.Element, streams ...gpu.StreamID) { + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers_stream( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + C.int(streams[0]), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } +} + +// BatchInvert computes v[i] = 1/v[i] using Montgomery batch inversion. +// temp must be a separate FrVector of the same size used as scratch space. +func (v *FrVector) BatchInvert(temp *FrVector, _ ...gpu.StreamID) { + if v.n != temp.n { + panic("gpu: BatchInvert size mismatch") + } + if v.dev != temp.dev { + panic("gpu: BatchInvert device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_batch_invert( + devCtx(v.dev), v.handle, temp.handle, + )); err != nil { + panic("gpu: BatchInvert failed: " + err.Error()) + } +} diff --git a/prover/gpu/plonk2/bls12377/fr_stub.go b/prover/gpu/plonk2/bls12377/fr_stub.go new file mode 100644 index 00000000000..a46c7943d4a --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fr_stub.go @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bls12377 + +import ( + "errors" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector is a stub for non-CUDA builds. +type FrVector struct{} + +func NewFrVector(_ *gpu.Device, _ int) (*FrVector, error) { + return nil, errors.New("gpu: cuda required") +} + +func (v *FrVector) Free() {} +func (v *FrVector) Len() int { return 0 } +func (v *FrVector) CopyFromHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyToHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDevice(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDeviceStream(_ *FrVector, _ gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) SetZero(_ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Mul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Add(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Sub(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddMul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddScalarMul(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (v *FrVector) ScalarMul(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) ScaleByPowers(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) BatchInvert(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } diff --git a/prover/gpu/plonk2/bls12377/fr_test.go b/prover/gpu/plonk2/bls12377/fr_test.go new file mode 100644 index 00000000000..251a37d77e6 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fr_test.go @@ -0,0 +1,275 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377_test + +import ( + "fmt" + "testing" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bls12377" + "github.com/leanovate/gopter" + "github.com/leanovate/gopter/prop" + "github.com/stretchr/testify/require" +) + +func requireGPUDev(t testing.TB) *gpu.Device { + t.Helper() + dev, err := gpu.New() + require.NoError(t, err) + t.Cleanup(func() { dev.Close() }) + return dev +} + +func genFrElem() gopter.Gen { + return func(_ *gopter.GenParameters) *gopter.GenResult { + var e fr.Element + e.MustSetRandom() + return gopter.NewGenResult(e, gopter.NoShrinker) + } +} + +func randFrVec(n int) fr.Vector { + v := make(fr.Vector, n) + for i := range v { + v[i].MustSetRandom() + } + return v +} + +func newGPUVec(t testing.TB, dev *gpu.Device, data fr.Vector) *bls12377.FrVector { + t.Helper() + gv, err := bls12377.NewFrVector(dev, len(data)) + require.NoError(t, err) + t.Cleanup(func() { gv.Free() }) + gv.CopyFromHost(data) + dev.Sync() + return gv +} + +// TestFrVectorRoundtrip verifies CopyFromHost → CopyToHost is identity. +func TestFrVectorRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + const n = 1024 + src := randFrVec(n) + gv := newGPUVec(t, dev, src) + dst := make(fr.Vector, n) + gv.CopyToHost(dst) + for i := range src { + require.True(t, src[i].Equal(&dst[i]), "mismatch at %d", i) + } +} + +// TestFrVectorAddCommutative checks GPU Add(a,b) == GPU Add(b,a). +func TestFrVectorAddCommutative(t *testing.T) { + dev := requireGPUDev(t) + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 50 + properties := gopter.NewProperties(parameters) + + properties.Property("Add is commutative", prop.ForAll( + func(a, b fr.Element) bool { + n := 16 + aVec := make(fr.Vector, n) + bVec := make(fr.Vector, n) + for i := range aVec { + aVec[i] = a + bVec[i] = b + } + + gA, err := bls12377.NewFrVector(dev, n) + if err != nil { + return false + } + gB, _ := bls12377.NewFrVector(dev, n) + gAB, _ := bls12377.NewFrVector(dev, n) + gBA, _ := bls12377.NewFrVector(dev, n) + defer gA.Free() + defer gB.Free() + defer gAB.Free() + defer gBA.Free() + + gA.CopyFromHost(aVec) + gB.CopyFromHost(bVec) + gAB.Add(gA, gB) + gBA.Add(gB, gA) + dev.Sync() + + ab := make(fr.Vector, n) + ba := make(fr.Vector, n) + gAB.CopyToHost(ab) + gBA.CopyToHost(ba) + for i := range ab { + if !ab[i].Equal(&ba[i]) { + return false + } + } + return true + }, + genFrElem(), genFrElem(), + )) + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + +// TestFrVectorBatchInvert verifies v[i] * inv(v[i]) == 1. +func TestFrVectorBatchInvert(t *testing.T) { + dev := requireGPUDev(t) + const n = 256 + + orig := make(fr.Vector, n) + for i := range orig { + orig[i].MustSetRandom() + if orig[i].IsZero() { + orig[i].SetOne() + } + } + + gV := newGPUVec(t, dev, orig) + gTemp, err := bls12377.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + inv := make(fr.Vector, n) + gV.CopyToHost(inv) + + var one fr.Element + one.SetOne() + for i := range orig { + var product fr.Element + product.Mul(&orig[i], &inv[i]) + require.True(t, product.Equal(&one), "BatchInvert: v[%d]*inv[%d] != 1", i, i) + } +} + +// TestFrVectorScaleByPowers checks GPU ScaleByPowers matches CPU loop. +func TestFrVectorScaleByPowers(t *testing.T) { + dev := requireGPUDev(t) + const n = 512 + + var omega fr.Element + omega.MustSetRandom() + + ones := make(fr.Vector, n) + for i := range ones { + ones[i].SetOne() + } + + gV := newGPUVec(t, dev, ones) + gV.ScaleByPowers(omega) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + + expected := make(fr.Vector, n) + expected[0].SetOne() + for i := 1; i < n; i++ { + expected[i].Mul(&expected[i-1], &omega) + } + + for i := range result { + require.True(t, result[i].Equal(&expected[i]), "ScaleByPowers mismatch at %d", i) + } +} + +// TestFrVectorBatchInvertMatchesCPU verifies BatchInvert matches scalar CPU inversion. +func TestFrVectorBatchInvertMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const n = 128 + + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + + cpuInv := make(fr.Vector, n) + for i := range src { + cpuInv[i].Inverse(&src[i]) + } + + gV := newGPUVec(t, dev, src) + gTemp, err := bls12377.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + gpuInv := make(fr.Vector, n) + gV.CopyToHost(gpuInv) + + for i := range cpuInv { + require.True(t, cpuInv[i].Equal(&gpuInv[i]), + "BatchInvert mismatch at %d", i) + } +} + +// BenchmarkFrVectorAdd benchmarks GPU element-wise addition. +func BenchmarkFrVectorAdd(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20, 1 << 22} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + gA := newGPUVec(b, dev, src) + gB := newGPUVec(b, dev, src) + gC, _ := bls12377.NewFrVector(dev, n) + defer gC.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gC.Add(gA, gB) + dev.Sync() + } + }) + } +} + +// BenchmarkFrVectorBatchInvert benchmarks GPU batch inversion. +func BenchmarkFrVectorBatchInvert(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + gV := newGPUVec(b, dev, src) + gTemp, _ := bls12377.NewFrVector(dev, n) + defer gTemp.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gV.CopyFromHost(src) + gV.BatchInvert(gTemp) + dev.Sync() + } + }) + } +} + +func fmtPow2(n int) string { + switch { + case n >= 1<<20: + return fmt.Sprintf("%dM", n>>20) + case n >= 1<<10: + return fmt.Sprintf("%dK", n>>10) + default: + return fmt.Sprintf("%d", n) + } +} diff --git a/prover/gpu/plonk2/bls12377/kernels.go b/prover/gpu/plonk2/bls12377/kernels.go new file mode 100644 index 00000000000..0b6cf7af685 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/kernels.go @@ -0,0 +1,316 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "math/big" + "runtime" + "sync" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// ZPrefixProduct computes Z[i] = product(ratio[0..i-1]) on GPU with CPU chunk scan. +func ZPrefixProduct(dev *gpu.Device, zVec, ratioVec, tempVec *FrVector) { + if zVec.n != ratioVec.n || zVec.n != tempVec.n { + panic("gpu: ZPrefixProduct size mismatch") + } + n := ratioVec.n + maxChunks := (n + 1023) / 1024 + cpHost := make([]uint64, maxChunks*4) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase1( + devCtx(dev), zVec.handle, ratioVec.handle, + (*C.uint64_t)(unsafe.Pointer(&cpHost[0])), &numChunks, + )); err != nil { + panic("gpu: ZPrefixProduct phase1 failed: " + err.Error()) + } + + nc := int(numChunks) + spHost := make([]uint64, nc*4) + copy(spHost[:4], cpHost[:4]) + for i := 1; i < nc; i++ { + prev := *(*fr.Element)(unsafe.Pointer(&spHost[(i-1)*4])) + cur := *(*fr.Element)(unsafe.Pointer(&cpHost[i*4])) + var prod fr.Element + prod.Mul(&prev, &cur) + *(*fr.Element)(unsafe.Pointer(&spHost[i*4])) = prod + } + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase3( + devCtx(dev), zVec.handle, tempVec.handle, + (*C.uint64_t)(unsafe.Pointer(&spHost[0])), C.size_t(nc), + )); err != nil { + panic("gpu: ZPrefixProduct phase3 failed: " + err.Error()) + } +} + +// PlonkZComputeFactors computes per-element Z ratio factors on GPU. +// On exit L contains numerators, R contains denominators. +func PlonkZComputeFactors( + L, R, O *FrVector, dPerm unsafe.Pointer, + beta, gamma, gMul, gSq fr.Element, + log2n uint, domain *GPUFFTDomain, +) { + n := L.n + if R.n != n || O.n != n || domain.size != n { + panic("gpu: PlonkZComputeFactors size mismatch") + } + params := [4]fr.Element{beta, gamma, gMul, gSq} + if err := toError(C.gnark_gpu_plonk2_z_compute_factors( + devCtx(L.dev), L.handle, R.handle, O.handle, + dPerm, (*C.uint64_t)(unsafe.Pointer(¶ms[0])), + C.uint(log2n), domain.handle, + )); err != nil { + panic("gpu: PlonkZComputeFactors failed: " + err.Error()) + } +} + +// PlonkGateAccum computes the fused gate constraint accumulation. +func PlonkGateAccum(result, Ql, Qr, Qm, Qo, Qk, L, R, O *FrVector, zhKInv fr.Element) { + n := result.n + if Ql.n != n || Qr.n != n || Qm.n != n || Qo.n != n || Qk.n != n || + L.n != n || R.n != n || O.n != n { + panic("gpu: PlonkGateAccum size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_gate_accum( + devCtx(result.dev), + result.handle, Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + L.handle, R.handle, O.handle, + (*C.uint64_t)(unsafe.Pointer(&zhKInv)), + )); err != nil { + panic("gpu: PlonkGateAccum failed: " + err.Error()) + } +} + +// PlonkLinearizeStatic computes the fixed-selector part of the linearized polynomial. +func PlonkLinearizeStatic( + result, Z, S3, Ql, Qr, Qm, Qo, Qk *FrVector, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta fr.Element, +) { + n := result.n + if Z.n != n || S3.n != n || Ql.n != n || Qr.n != n || Qm.n != n || + Qo.n != n || Qk.n != n { + panic("gpu: PlonkLinearizeStatic size mismatch") + } + scalars := [6]fr.Element{combinedZCoeff, s1, lZeta, rZeta, rl, oZeta} + if err := toError(C.gnark_gpu_plonk2_linearize_static( + devCtx(result.dev), + result.handle, Z.handle, S3.handle, + Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + (*C.uint64_t)(unsafe.Pointer(&scalars[0])), + )); err != nil { + panic("gpu: PlonkLinearizeStatic failed: " + err.Error()) + } +} + +// PlonkPermBoundary computes the fused permutation + boundary constraint. +func PlonkPermBoundary( + result, L, R, O, Z, S1, S2, S3, L1DenInv *FrVector, + alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen fr.Element, + domain *GPUFFTDomain, _ ...gpu.StreamID, +) { + n := result.n + if L.n != n || R.n != n || O.n != n || Z.n != n || + S1.n != n || S2.n != n || S3.n != n || L1DenInv.n != n || domain.size != n { + panic("gpu: PlonkPermBoundary size mismatch") + } + params := [7]fr.Element{alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen} + if err := toError(C.gnark_gpu_plonk2_perm_boundary( + devCtx(result.dev), + result.handle, L.handle, R.handle, O.handle, Z.handle, + S1.handle, S2.handle, S3.handle, L1DenInv.handle, + (*C.uint64_t)(unsafe.Pointer(¶ms[0])), domain.handle, + )); err != nil { + panic("gpu: PlonkPermBoundary failed: " + err.Error()) + } +} + +// ComputeL1Den computes out[i] = cosetGen·ω^i - 1 for all i. +func ComputeL1Den(out *FrVector, cosetGen fr.Element, domain *GPUFFTDomain, _ ...gpu.StreamID) { + if domain.size != out.n { + panic("gpu: ComputeL1Den domain size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_compute_l1_den( + domain.handle, out.handle, + (*C.uint64_t)(unsafe.Pointer(&cosetGen)), + )); err != nil { + panic("gpu: ComputeL1Den failed: " + err.Error()) + } +} + +// ReduceBlindedCoset reduces a blinded polynomial for coset evaluation on GPU. +func ReduceBlindedCoset(dst, src *FrVector, tail []fr.Element, cosetPowN fr.Element) { + if dst.n != src.n { + panic("gpu: ReduceBlindedCoset size mismatch") + } + var tailPtr *C.uint64_t + if len(tail) > 0 { + tailPtr = (*C.uint64_t)(unsafe.Pointer(&tail[0])) + } + if err := toError(C.gnark_gpu_plonk2_reduce_blinded_coset( + devCtx(dst.dev), dst.handle, src.handle, + tailPtr, C.size_t(len(tail)), + (*C.uint64_t)(unsafe.Pointer(&cosetPowN)), + )); err != nil { + panic("gpu: ReduceBlindedCoset failed: " + err.Error()) + } +} + +// SubtractBlindingHead subtracts tail[i] from v[i] for the blinding tail. +func SubtractBlindingHead(v *FrVector, tail []fr.Element) { + if len(tail) == 0 { + return + } + if len(tail) > v.n { + panic("gpu: SubtractBlindingHead size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_subtract_head( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&tail[0])), + C.size_t(len(tail)), + )); err != nil { + panic("gpu: SubtractBlindingHead failed: " + err.Error()) + } +} + +// DeviceAllocCopyInt64 uploads an int64 slice to GPU device memory. +func DeviceAllocCopyInt64(dev *gpu.Device, data []int64) (unsafe.Pointer, error) { + var dPtr unsafe.Pointer + if err := toError(C.gnark_gpu_device_alloc_copy_int64( + devCtx(dev), + (*C.int64_t)(unsafe.Pointer(&data[0])), + C.size_t(len(data)), + &dPtr, + )); err != nil { + return nil, err + } + return dPtr, nil +} + +// DeviceFreePtr frees device memory allocated by DeviceAllocCopyInt64. +func DeviceFreePtr(ptr unsafe.Pointer) { + if ptr != nil { + C.gnark_gpu_device_free_ptr(ptr) + } +} + +// PolyEvalGPU evaluates a GPU-resident polynomial at z using chunked Horner on +// device and a small CPU combine over chunk partials. +func PolyEvalGPU(dev *gpu.Device, v *FrVector, z fr.Element) fr.Element { + n := v.n + if n == 0 { + return fr.Element{} + } + + maxChunks := (n + 1023) / 1024 + partialsHost := make([]uint64, maxChunks*4) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_poly_eval_chunks( + devCtx(dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&z)), + (*C.uint64_t)(unsafe.Pointer(&partialsHost[0])), + &numChunks, + )); err != nil { + panic("gpu: PolyEvalGPU failed: " + err.Error()) + } + + return combinePolyEvalPartials(partialsHost, int(numChunks), z) +} + +// PolyEvalFromDevice downloads a GPU FrVector and evaluates at z using CPU Horner. +func PolyEvalFromDevice(v *FrVector, z fr.Element) fr.Element { + n := v.n + coeffs := make(fr.Vector, n) + v.CopyToHost(coeffs) + return polyEvalParallel(coeffs, z) +} + +func combinePolyEvalPartials(partialsHost []uint64, numChunks int, z fr.Element) fr.Element { + if numChunks == 0 { + return fr.Element{} + } + readPartial := func(chunk int) fr.Element { + var r fr.Element + for limb := range r { + r[limb] = partialsHost[chunk*4+limb] + } + return r + } + if numChunks == 1 { + return readPartial(0) + } + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(1024)) + result := readPartial(numChunks - 1) + for j := numChunks - 2; j >= 0; j-- { + p := readPartial(j) + result.Mul(&result, &zChunk).Add(&result, &p) + } + return result +} + +// polyEvalParallel evaluates p(z) = Σ c[i]·z^i using multi-core Horner. +func polyEvalParallel(coeffs []fr.Element, z fr.Element) fr.Element { + n := len(coeffs) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + return hornerEval(coeffs, z) + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + partials := make([]fr.Element, numChunks) + var wg sync.WaitGroup + for c := range numChunks { + start := c * chunkSize + if start >= n { + break + } + end := start + chunkSize + if end > n { + end = n + } + wg.Add(1) + go func(idx, s, e int) { + defer wg.Done() + partials[idx] = hornerEval(coeffs[s:e], z) + }(c, start, end) + } + wg.Wait() + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(int64(chunkSize))) + var result, zPow fr.Element + zPow.SetOne() + for c := range numChunks { + if c*chunkSize >= n { + break + } + var t fr.Element + t.Mul(&partials[c], &zPow) + result.Add(&result, &t) + zPow.Mul(&zPow, &zChunk) + } + return result +} + +func hornerEval(coeffs []fr.Element, z fr.Element) fr.Element { + var r fr.Element + for i := len(coeffs) - 1; i >= 0; i-- { + r.Mul(&r, &z).Add(&r, &coeffs[i]) + } + return r +} diff --git a/prover/gpu/plonk2/bls12377/kernels_stub.go b/prover/gpu/plonk2/bls12377/kernels_stub.go new file mode 100644 index 00000000000..3d185b57306 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/kernels_stub.go @@ -0,0 +1,36 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bls12377 + +import ( + "errors" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +func ZPrefixProduct(_ *gpu.Device, _, _, _ *FrVector) { panic("gpu: cuda required") } +func PlonkZComputeFactors(_, _, _ *FrVector, _ unsafe.Pointer, _, _, _, _ fr.Element, _ uint, _ *GPUFFTDomain) { + panic("gpu: cuda required") +} +func PlonkGateAccum(_, _, _, _, _, _, _, _, _ *FrVector, _ fr.Element) { panic("gpu: cuda required") } +func PlonkPermBoundary(_, _, _, _, _, _, _, _, _ *FrVector, _, _, _, _, _, _, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ComputeL1Den(_ *FrVector, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ReduceBlindedCoset(_, _ *FrVector, _ []fr.Element, _ fr.Element) { panic("gpu: cuda required") } +func DeviceAllocCopyInt64(_ *gpu.Device, _ []int64) (unsafe.Pointer, error) { + return nil, errors.New("gpu: cuda required") +} +func DeviceFreePtr(_ unsafe.Pointer) {} +func PolyEvalGPU(_ *gpu.Device, _ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} +func PolyEvalFromDevice(_ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bls12377/msm.go b/prover/gpu/plonk2/bls12377/msm.go new file mode 100644 index 00000000000..8ce2fbb042b --- /dev/null +++ b/prover/gpu/plonk2/bls12377/msm.go @@ -0,0 +1,390 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "fmt" + "log" + "math/big" + "os" + "runtime" + "strconv" + "unsafe" + + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// frRInv is R^{-1} mod r where R = 2^{FrLimbs*64} (the Fr Montgomery constant). +// The GPU MSM uses Montgomery-form scalars without fr_from_mont, so the result +// is R * correct_result. Multiplying by frRInv corrects this. +var frRInv big.Int + +func init() { + var rInv fr.Element + rInv[0] = 1 // Montgomery representation of R^{-1}: stores R^{-1} mod r + rInv.BigInt(&frRInv) +} + +// msmDefaultWindowBits selects the Pippenger window size for n points. +func msmDefaultWindowBits(n int) int { + switch { + case n > 1<<26: + return 20 + case n > 1<<22: + return 17 + case n > 1<<18: + return 15 + case n > 1<<12: + return 13 + default: + return 11 + } +} + +// G1MSM holds a GPU MSM context with uploaded affine base points. +// +// Points are uploaded once at construction. The context supports multiple +// MultiExp calls sharing the same base points. +type G1MSM struct { + handle C.gnark_gpu_plonk2_msm_t + dev *gpu.Device + n int + windowBits int + hostPoints []curve.G1Affine + hostPointsPtr unsafe.Pointer + lastBatchPhaseTimings [][9]float32 +} + +// NewG1MSM creates a G1MSM context by uploading affine points to the GPU. +// window_bits=0 selects a default based on point count. +func NewG1MSM(dev *gpu.Device, points []curve.G1Affine, windowBits int) (*G1MSM, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if err := dev.Bind(); err != nil { + return nil, err + } + n := len(points) + if n == 0 { + return nil, &gpu.Error{Code: -1, Message: "points must not be empty"} + } + if windowBits == 0 { + windowBits = msmDefaultWindowBits(n) + } + if override := os.Getenv("GNARK_GPU_PLONK2_MSM_WINDOW_BITS"); override != "" { + parsed, err := strconv.Atoi(override) + if err != nil { + return nil, fmt.Errorf("gpu: invalid GNARK_GPU_PLONK2_MSM_WINDOW_BITS %q: %w", override, err) + } + windowBits = parsed + } + if windowBits < 2 || windowBits > 24 { + return nil, fmt.Errorf("gpu: window bits must be in [2,24], got %d", windowBits) + } + + hostPoints := points + var hostPointsPtr unsafe.Pointer + if os.Getenv("GNARK_GPU_DISABLE_PINNED_MSM_POINTS") == "" { + nbytes := C.size_t(n) * C.size_t(unsafe.Sizeof(curve.G1Affine{})) + if err := toError(C.gnark_gpu_alloc_pinned(&hostPointsPtr, nbytes)); err == nil { + hostPoints = unsafe.Slice((*curve.G1Affine)(hostPointsPtr), n) + copy(hostPoints, points) + } else { + log.Printf("gpu: pinned MSM points unavailable (%v), using heap", err) + hostPointsPtr = nil + } + } + + var handle C.gnark_gpu_plonk2_msm_t + if err := toError(C.gnark_gpu_plonk2_msm_create( + devCtx(dev), + curveID(), + (*C.uint64_t)(unsafe.Pointer(&hostPoints[0])), + C.size_t(n), + C.int(windowBits), + &handle, + )); err != nil { + if hostPointsPtr != nil { + C.gnark_gpu_free_pinned(hostPointsPtr) + } + return nil, err + } + + m := &G1MSM{ + handle: handle, + dev: dev, + n: n, + windowBits: windowBits, + hostPoints: hostPoints, + hostPointsPtr: hostPointsPtr, + } + runtime.SetFinalizer(m, (*G1MSM).Close) + return m, nil +} + +// Close releases GPU resources. Safe to call multiple times. +func (m *G1MSM) Close() { + if m.handle != nil { + C.gnark_gpu_plonk2_msm_destroy(m.handle) + m.handle = nil + if m.hostPointsPtr != nil { + C.gnark_gpu_free_pinned(m.hostPointsPtr) + m.hostPointsPtr = nil + } + m.hostPoints = nil + runtime.SetFinalizer(m, nil) + } +} + +// Len returns the number of base points. +func (m *G1MSM) Len() int { return m.n } + +// PinWorkBuffers keeps MSM scratch buffers resident across MultiExp calls, +// amortizing cudaMalloc/Free overhead over a wave of MSMs. +func (m *G1MSM) PinWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_pin_work_buffers(m.handle)) +} + +// ReleaseWorkBuffers frees pinned scratch buffers. Subsequent MultiExp calls +// re-allocate lazily. +func (m *G1MSM) ReleaseWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_release_work_buffers(m.handle)) +} + +// OffloadPoints frees the GPU-resident base points. Call ReloadPoints before +// the next MultiExp. +func (m *G1MSM) OffloadPoints() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_offload_points(m.handle)) +} + +// ReloadPoints uploads the retained host base points after OffloadPoints. +func (m *G1MSM) ReloadPoints() error { + if len(m.hostPoints) < m.n { + return fmt.Errorf("gpu: MSM host points unavailable") + } + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_reload_points( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&m.hostPoints[0])), + C.size_t(m.n), + )) +} + +// MultiExp computes Q[i] = Σⱼ scalars[i][j] · P[j] for each scalar set. +// Each scalars[i] must have length ≤ m.Len(). +// Returns Jacobian results. +func (m *G1MSM) MultiExp(scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if err := m.dev.Bind(); err != nil { + return nil, err + } + k := len(scalars) + if k == 0 { + return nil, nil + } + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: MSM scalar set %d is empty", i) + } + if len(s) > m.n { + return nil, fmt.Errorf("gpu: MSM scalar set %d has %d elements, exceeds %d points", i, len(s), m.n) + } + } + + results := make([]curve.G1Jac, k) + m.lastBatchPhaseTimings = make([][9]float32, k) + for i, s := range scalars { + if err := toError(C.gnark_gpu_plonk2_msm_run( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&s[0])), + C.size_t(len(s)), + (*C.uint64_t)(unsafe.Pointer(&results[i])), + )); err != nil { + return nil, fmt.Errorf("gpu: MSM set %d failed: %w", i, err) + } + m.lastBatchPhaseTimings[i] = m.LastPhaseTimings() + // Montgomery correction: GPU skips fr_from_mont on scalars, so result = R * correct. + results[i].ScalarMultiplication(&results[i], &frRInv) + } + return results, nil +} + +// LastPhaseTimings returns per-phase timings (ms) from the most recent MultiExp call. +func (m *G1MSM) LastPhaseTimings() [9]float32 { + var out [9]C.float + C.gnark_gpu_plonk2_msm_get_phase_timings(m.handle, (*C.float)(unsafe.Pointer(&out[0]))) + var result [9]float32 + for i := range result { + result[i] = float32(out[i]) + } + return result +} + +// LastBatchPhaseTimings returns per-set MSM phase timings from the most recent +// MultiExp call. +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { + if len(m.lastBatchPhaseTimings) == 0 { + return nil + } + out := make([][9]float32, len(m.lastBatchPhaseTimings)) + copy(out, m.lastBatchPhaseTimings) + return out +} + +// MultiExpSplit runs the MSM split across 2 devices for ~2x speedup. +// msm0 must hold points[:n/2] and msm1 must hold points[n/2:]. +// This is an advanced API; use MultiExp for single-GPU operation. +func MultiExpSplit(msm0, msm1 *G1MSM, scalars []fr.Element) (curve.G1Jac, error) { + return MultiExpSplitAt(msm0, msm1, len(scalars)/2, scalars) +} + +// MultiExpSplitAt runs one MSM split across 2 devices at a fixed scalar index. +// msm0 must hold points[:split], and msm1 must hold points[split:]. +func MultiExpSplitAt(msm0, msm1 *G1MSM, split int, scalars []fr.Element) (curve.G1Jac, error) { + if msm0 == nil || msm1 == nil || len(scalars) == 0 { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: nil MSM or empty scalars") + } + n := len(scalars) + if split <= 0 || split >= n { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: invalid split %d for %d scalars", split, n) + } + if split > msm0.Len() || n-split > msm1.Len() { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: split exceeds MSM point capacity") + } + + type result struct { + jac curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(scalars[:split]) + if err != nil { + ch0 <- result{err: err} + return + } + ch0 <- result{jac: jacs[0]} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(scalars[split:]) + if err != nil { + ch1 <- result{err: err} + return + } + ch1 <- result{jac: jacs[0]} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return curve.G1Jac{}, r0.err + } + if r1.err != nil { + return curve.G1Jac{}, r1.err + } + r0.jac.AddAssign(&r1.jac) + return r0.jac, nil +} + +// MultiExpSplitBatchAt runs several MSMs split across 2 devices. Each device +// executes its half-batch sequentially on its own stream, and the host combines +// matching partials. +func MultiExpSplitBatchAt(msm0, msm1 *G1MSM, split int, scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if len(scalars) == 0 { + return nil, nil + } + first := make([][]fr.Element, len(scalars)) + second := make([][]fr.Element, len(scalars)) + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: split MSM scalar set %d is empty", i) + } + if split <= 0 || split >= len(s) { + return nil, fmt.Errorf("gpu: split MSM scalar set %d has invalid split %d for %d scalars", i, split, len(s)) + } + if split > msm0.Len() || len(s)-split > msm1.Len() { + return nil, fmt.Errorf("gpu: split MSM scalar set %d exceeds MSM point capacity", i) + } + first[i] = s[:split] + second[i] = s[split:] + } + + type result struct { + jacs []curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(first...) + ch0 <- result{jacs: jacs, err: err} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(second...) + ch1 <- result{jacs: jacs, err: err} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return nil, r0.err + } + if r1.err != nil { + return nil, r1.err + } + if len(r0.jacs) != len(scalars) || len(r1.jacs) != len(scalars) { + return nil, fmt.Errorf("gpu: split MSM result length mismatch") + } + for i := range r0.jacs { + r0.jacs[i].AddAssign(&r1.jacs[i]) + } + return r0.jacs, nil +} diff --git a/prover/gpu/plonk2/bls12377/msm_stub.go b/prover/gpu/plonk2/bls12377/msm_stub.go new file mode 100644 index 00000000000..a008f07858f --- /dev/null +++ b/prover/gpu/plonk2/bls12377/msm_stub.go @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bls12377 + +import ( + "errors" + + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// G1MSM is a stub for non-CUDA builds. +type G1MSM struct{} + +func NewG1MSM(_ *gpu.Device, _ []curve.G1Affine, _ int) (*G1MSM, error) { + return nil, errors.New("gpu: cuda required") +} + +func (m *G1MSM) Close() {} +func (m *G1MSM) Len() int { return 0 } +func (m *G1MSM) PinWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) ReleaseWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) MultiExp(_ ...[]fr.Element) ([]curve.G1Jac, error) { + return nil, errors.New("gpu: cuda required") +} +func (m *G1MSM) LastPhaseTimings() [9]float32 { return [9]float32{} } +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { return nil } + +func MultiExpSplit(_, _ *G1MSM, _ []fr.Element) (curve.G1Jac, error) { + return curve.G1Jac{}, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bls12377/msm_test.go b/prover/gpu/plonk2/bls12377/msm_test.go new file mode 100644 index 00000000000..e49f07e14c7 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/msm_test.go @@ -0,0 +1,139 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377_test + +import ( + "fmt" + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bls12377" + "github.com/stretchr/testify/require" +) + +func makeTestPoints(n int) []curve.G1Affine { + _, _, g1, _ := curve.Generators() + pts := make([]curve.G1Affine, n) + pts[0] = g1 + for i := 1; i < n; i++ { + pts[i].Add(&pts[i-1], &g1) + } + return pts +} + +// TestMSMMatchesCPU verifies GPU MSM matches gnark-crypto CPU MultiExp. +func TestMSMMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + + for _, n := range []int{1, 16, 100, 1000} { + n := n + t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + + // CPU reference + var cpuResult curve.G1Affine + cpuResult.MultiExp(pts, scalars, ecc.MultiExpConfig{}) + + // GPU + msm, err := bls12377.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars) + require.NoError(t, err) + require.Len(t, results, 1) + + var gpuAffine curve.G1Affine + gpuAffine.FromJacobian(&results[0]) + + require.True(t, cpuResult.Equal(&gpuAffine), + "MSM mismatch at n=%d", n) + }) + } +} + +// TestMSMBatchScalarSets tests MultiExp with multiple scalar sets. +func TestMSMBatchScalarSets(t *testing.T) { + dev := requireGPUDev(t) + const n = 100 + + pts := makeTestPoints(n) + scalars1 := randFrVec(n) + scalars2 := randFrVec(n) + + // CPU references + var cpu1, cpu2 curve.G1Affine + cpu1.MultiExp(pts, scalars1, ecc.MultiExpConfig{}) + cpu2.MultiExp(pts, scalars2, ecc.MultiExpConfig{}) + + // GPU batch + msm, err := bls12377.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars1, scalars2) + require.NoError(t, err) + require.Len(t, results, 2) + + var gpu1, gpu2 curve.G1Affine + gpu1.FromJacobian(&results[0]) + gpu2.FromJacobian(&results[1]) + + require.True(t, cpu1.Equal(&gpu1), "MSM set 0 mismatch") + require.True(t, cpu2.Equal(&gpu2), "MSM set 1 mismatch") +} + +// TestMSMWorkBuffers verifies PinWorkBuffers/ReleaseWorkBuffers are idempotent. +func TestMSMWorkBuffers(t *testing.T) { + dev := requireGPUDev(t) + const n = 64 + + pts := makeTestPoints(n) + scalars := randFrVec(n) + + msm, err := bls12377.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + require.NoError(t, msm.PinWorkBuffers()) + r1, err := msm.MultiExp(scalars) + require.NoError(t, err) + + require.NoError(t, msm.ReleaseWorkBuffers()) + r2, err := msm.MultiExp(scalars) + require.NoError(t, err) + + var a1, a2 curve.G1Affine + a1.FromJacobian(&r1[0]) + a2.FromJacobian(&r2[0]) + require.True(t, a1.Equal(&a2), "result changed after work buffer release") +} + +// BenchmarkMSM benchmarks GPU MSM at various sizes. +func BenchmarkMSM(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + msm, err := bls12377.NewG1MSM(dev, pts, 0) + require.NoError(b, err) + defer msm.Close() + require.NoError(b, msm.PinWorkBuffers()) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := msm.MultiExp(scalars) + require.NoError(b, err) + } + }) + } +} diff --git a/prover/gpu/plonk2/bls12377/pinned_fr.go b/prover/gpu/plonk2/bls12377/pinned_fr.go new file mode 100644 index 00000000000..4703bd5931a --- /dev/null +++ b/prover/gpu/plonk2/bls12377/pinned_fr.go @@ -0,0 +1,41 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" +) + +type pinnedFrBuffer struct { + ptr unsafe.Pointer + data []fr.Element +} + +func newPinnedFrBuffer(n int) (pinnedFrBuffer, error) { + var ptr unsafe.Pointer + nbytes := C.size_t(n) * C.size_t(fr.Bytes) + if err := toError(C.gnark_gpu_alloc_pinned(&ptr, nbytes)); err != nil { + return pinnedFrBuffer{}, err + } + return pinnedFrBuffer{ + ptr: ptr, + data: unsafe.Slice((*fr.Element)(ptr), n), + }, nil +} + +func (b *pinnedFrBuffer) free() { + if b.ptr != nil { + C.gnark_gpu_free_pinned(b.ptr) + b.ptr = nil + b.data = nil + } +} diff --git a/prover/gpu/plonk2/bls12377/plonk_test.go b/prover/gpu/plonk2/bls12377/plonk_test.go new file mode 100644 index 00000000000..85bccde80e2 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/plonk_test.go @@ -0,0 +1,205 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377_test + +import ( + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + kzg "github.com/consensys/gnark-crypto/ecc/bls12-377/kzg" + gnarkplonk "github.com/consensys/gnark/backend/plonk" + curplonk "github.com/consensys/gnark/backend/plonk/bls12-377" + cs "github.com/consensys/gnark/constraint/bls12-377" + "github.com/consensys/gnark/frontend" + "github.com/consensys/gnark/frontend/cs/scs" + emPlonk "github.com/consensys/gnark/std/recursion/plonk" + "github.com/consensys/gnark/test/unsafekzg" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bls12377" + "github.com/stretchr/testify/require" +) + +// addCircuit has enough constraints for sizeSystem >= 6 (avoiding gnark's 8-coset edge case for tiny circuits). +// Circuit: a*b + c*d + e*f = out (out is public) +type addCircuit struct { + A, B, C, D, F, G frontend.Variable + Out frontend.Variable `gnark:",public"` +} + +func (c *addCircuit) Define(api frontend.API) error { + ab := api.Mul(c.A, c.B) + cd := api.Mul(c.C, c.D) + fg := api.Mul(c.F, c.G) + sum := api.Add(ab, cd) + sum2 := api.Add(sum, fg) + api.AssertIsEqual(sum2, c.Out) + return nil +} + +type commitCircuit struct { + A, B, Out frontend.Variable +} + +func (c *commitCircuit) Define(api frontend.API) error { + commitment, err := api.(frontend.Committer).Commit(c.A, c.B) + if err != nil { + return err + } + product := api.Mul(c.A, c.B) + api.AssertIsDifferent(commitment, product) + api.AssertIsEqual(api.Add(c.A, c.B), c.Out) + return nil +} + +func setupAddCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &addCircuit{}) +} + +func setupCommitCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &commitCircuit{}) +} + +func setupCircuit(t testing.TB, circuit frontend.Circuit) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + ccs, err := frontend.Compile(ecc.BLS12_377.ScalarField(), scs.NewBuilder, circuit) + require.NoError(t, err) + + srs, srsLag, err := unsafekzg.NewSRS(ccs) + require.NoError(t, err) + + _, vkIface, err := gnarkplonk.Setup(ccs, srs, srsLag) + require.NoError(t, err) + vk := vkIface.(*curplonk.VerifyingKey) + + // Extract canonical G1 SRS points from the concrete KZG SRS type. + concreteSRS := srs.(*kzg.SRS) + srsPoints := make([]curve.G1Affine, len(concreteSRS.Pk.G1)) + copy(srsPoints, concreteSRS.Pk.G1) + + return ccs.(*cs.SparseR1CS), vk, srsPoints +} + +// TestGPUProveVerify proves a small circuit with the GPU and verifies with gnark CPU. +func TestGPUProveVerify(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := bls12377.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15 + 77 + 8} + fullW, err := frontend.NewWitness(assignment, ecc.BLS12_377.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bls12377.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +// TestGPUProveMultipleProofs tests that multiple proofs can be generated from the same key. +func TestGPUProveMultipleProofs(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := bls12377.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + for i := range 3 { + a := int64(i + 1) + _ = int64(i + 2) + assignment := &addCircuit{A: a, B: a + 1, C: a + 2, D: a + 3, F: a + 4, G: a + 5, Out: a*(a+1) + (a+2)*(a+3) + (a+4)*(a+5)} + fullW, err := frontend.NewWitness(assignment, ecc.BLS12_377.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bls12377.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err, "proof %d failed", i) + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "proof %d verification failed", i) + } +} + +func TestGPUProveVerify_BSB22Commitment(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupCommitCircuit(t) + + gpk := bls12377.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &commitCircuit{A: 3, B: 5, Out: 8} + fullW, err := frontend.NewWitness(assignment, ecc.BLS12_377.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bls12377.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +func TestGPUProveVerify_BSB22Commitment_NativeRecursionOptions(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupCommitCircuit(t) + + gpk := bls12377.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &commitCircuit{A: 3, B: 5, Out: 8} + fullW, err := frontend.NewWitness(assignment, ecc.BLS12_377.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bls12377.GPUProve( + dev, + gpk, + spr, + fullW, + emPlonk.GetNativeProverOptions(ecc.BW6_761.ScalarField(), ecc.BLS12_377.ScalarField()), + ) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError( + t, + gnarkplonk.Verify( + proof, + vk, + pubW, + emPlonk.GetNativeVerifierOptions(ecc.BW6_761.ScalarField(), ecc.BLS12_377.ScalarField()), + ), + "GPU proof failed verification with native recursion options", + ) +} + +// BenchmarkGPUProve benchmarks GPU proof generation. +func BenchmarkGPUProve(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + spr, vk, srsPoints := setupAddCircuit(b) + gpk := bls12377.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15 + 77 + 8} + fullW, err := frontend.NewWitness(assignment, ecc.BLS12_377.ScalarField()) + require.NoError(b, err) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := bls12377.GPUProve(dev, gpk, spr, fullW) + require.NoError(b, err) + } +} diff --git a/prover/gpu/plonk2/bls12377/prove.go b/prover/gpu/plonk2/bls12377/prove.go new file mode 100644 index 00000000000..30326190c6f --- /dev/null +++ b/prover/gpu/plonk2/bls12377/prove.go @@ -0,0 +1,2618 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "context" + "errors" + "fmt" + "hash" + "log" + "math/big" + "math/bits" + "os" + "runtime" + "strconv" + "sync" + "time" + "unsafe" + + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/fft" + htf "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/hash_to_field" + iop "github.com/consensys/gnark-crypto/ecc/bls12-377/fr/iop" + kzg "github.com/consensys/gnark-crypto/ecc/bls12-377/kzg" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + + "github.com/consensys/gnark/backend" + curplonk "github.com/consensys/gnark/backend/plonk/bls12-377" + "github.com/consensys/gnark/backend/witness" + "github.com/consensys/gnark/constraint" + cs "github.com/consensys/gnark/constraint/bls12-377" + "github.com/consensys/gnark/constraint/solver" + fcs "github.com/consensys/gnark/frontend/cs" + + "github.com/consensys/linea-monorepo/prover/gpu" + "golang.org/x/sync/errgroup" +) + +const ( + id_L int = iota + id_R + id_O + id_Z + + orderBlindingL = 1 + orderBlindingR = 1 + orderBlindingO = 1 + orderBlindingZ = 2 + msmExtraPoints = 6 +) + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProvingKey — slim wrapper: VerifyingKey + lazy gpuInstance +// ───────────────────────────────────────────────────────────────────────────── + +type GPUProvingKey struct { + mu sync.Mutex + Vk *curplonk.VerifyingKey + n int + + // SRS data (consumed during instance init) + srsPoints []curve.G1Affine + pinnedN int + + inst *gpuInstance +} + +// NewGPUProvingKey creates a GPUProvingKey from affine SRS points. +func NewGPUProvingKey(srsPoints []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + n := 0 + if vk != nil { + n = int(vk.Size) + } + return &GPUProvingKey{Vk: vk, n: n, srsPoints: srsPoints} +} + +// Size returns the domain size n. +func (gpk *GPUProvingKey) Size() int { return gpk.n } + +// Prepare performs one-time GPU setup. +func (gpk *GPUProvingKey) Prepare(dev *gpu.Device, spr *cs.SparseR1CS) error { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil && gpk.inst.dev == dev { + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + inst, err := newGPUInstance(dev, gpk, spr) + if err != nil { + return err + } + gpk.inst = inst + return nil +} + +// Close releases all GPU resources. +func (gpk *GPUProvingKey) Close() { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuInstance — persistent GPU resources + circuit data +// ───────────────────────────────────────────────────────────────────────────── + +// quotientWorkBufs holds pre-allocated GPU buffers for computeNumeratorGPU and +// computeLinearizedPoly, avoiding per-proof cudaMalloc/Free overhead. +type quotientWorkBufs struct { + L, R, O, Z *FrVector // wire poly working buffers (reused per coset) + S1, S2, S3 *FrVector // perm selector buffers + Result *FrVector // coset numerator accumulator + LCan, RCan, OCan, ZCan *FrVector // canonical wire polys (uploaded once per proof) + QkSrc *FrVector // Qk canonical source (D2D per coset, avoids H2D) + Pi2Src []*FrVector // per-proof BSB22 pi2 sources (D2D per coset) + CosetBlock [3]*FrVector // GPU-resident coset results; Result keeps block 4 + LinResult, LinW *FrVector // linearized poly GPU scratch +} + +type lowMemorySelectorCache struct { + ql, qr, qm, qo *FrVector + s1, s2, s3 *FrVector + qcp []*FrVector +} + +type splitMSMBackend struct { + secondary *gpu.Device + msm0 *G1MSM + msm1 *G1MSM + split int +} + +type gpuInstance struct { + dev *gpu.Device + vk *curplonk.VerifyingKey + n int + log2n uint + lowMemory bool + canonicalReady chan struct{} + canonicalErr error + canonicalOnce sync.Once + + domain0 *fft.Domain + + msm *G1MSM + splitMSM *splitMSMBackend + fftDom *GPUFFTDomain + dPerm unsafe.Pointer + + dQl, dQr, dQm, dQo *FrVector + dS1, dS2, dS3 *FrVector + dQkFixed *FrVector + dQcp []*FrVector + + qlCanonical, qrCanonical, qmCanonical, qoCanonical fr.Vector + qkFixedCanonical fr.Vector + s1Canonical, s2Canonical, s3Canonical fr.Vector + qcpCanonical []fr.Vector + qkLagrange fr.Vector + permutation []int64 + nbPublicVariables int + commitmentInfo []uint64 + + gpuWork *FrVector // shared scratch buffer (persists for prover lifetime) + qWb quotientWorkBufs + + hBufs hostBufs +} + +type gpuInstanceReadyHooks struct { + msm func(*gpuInstance) + commit func(*gpuInstance) + trace func(*gpuInstance) +} + +type hostBufs struct { + lCanonical, rCanonical, oCanonical fr.Vector + zLagrange fr.Vector + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + hFull []fr.Element + openZBuf []fr.Element + pinned []pinnedFrBuffer +} + +func (inst *gpuInstance) initHostBufs() { + n := inst.n + var hb hostBufs + + allocPinnedHotBuffer := func(name string, n int) []fr.Element { + if os.Getenv("GNARK_GPU_DISABLE_PINNED_HOST_BUFS") == "" { + buf, err := newPinnedFrBuffer(n) + if err == nil { + hb.pinned = append(hb.pinned, buf) + return buf.data + } + log.Printf("gpu: pinned host buffer %s unavailable (%v), using heap", name, err) + } + return make([]fr.Element, n) + } + + hb = hostBufs{ + lCanonical: make(fr.Vector, n), + rCanonical: make(fr.Vector, n), + oCanonical: make(fr.Vector, n), + zLagrange: make(fr.Vector, n), + qkCoeffs: make(fr.Vector, n), + openZBuf: make([]fr.Element, n+1+orderBlindingZ), + } + hb.lBlinded = allocPinnedHotBuffer("lBlinded", n+1+orderBlindingL) + hb.rBlinded = allocPinnedHotBuffer("rBlinded", n+1+orderBlindingR) + hb.oBlinded = allocPinnedHotBuffer("oBlinded", n+1+orderBlindingO) + hb.zBlinded = allocPinnedHotBuffer("zBlinded", n+1+orderBlindingZ) + hSize := 4 * n + if needed := 3 * (n + 2); needed > hSize { + hSize = needed + } + hb.hFull = allocPinnedHotBuffer("hFull", hSize) + inst.hBufs = hb +} + +func (hb *hostBufs) free() { + for i := range hb.pinned { + hb.pinned[i].free() + } + *hb = hostBufs{} +} + +func newGPUInstance(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, hooks ...gpuInstanceReadyHooks) (*gpuInstance, error) { + inst := &gpuInstance{dev: dev, vk: gpk.Vk, n: gpk.n, canonicalReady: make(chan struct{})} + var hook gpuInstanceReadyHooks + if len(hooks) > 0 { + hook = hooks[0] + } + commitPublished := false + msmPublished := false + tracePublished := false + publishMSMReady := func() { + if hook.msm != nil && !msmPublished { + msmPublished = true + hook.msm(inst) + } + } + publishCommitReady := func() { + if hook.commit != nil && !commitPublished { + commitPublished = true + hook.commit(inst) + } + } + publishTraceReady := func() { + if hook.trace != nil && !tracePublished { + tracePublished = true + hook.trace(inst) + } + } + var traceErrCh chan error + + fail := func(msg string, err error) (*gpuInstance, error) { + wrapped := fmt.Errorf("%s: %w", msg, err) + if traceErrCh != nil { + <-traceErrCh + traceErrCh = nil + } + inst.publishCanonicalReady(wrapped) + if !msmPublished && !commitPublished && !tracePublished { + inst.close() + } + return nil, wrapped + } + + if err := inst.initCircuitShape(spr); err != nil { + return fail("init circuit shape", err) + } + inst.lowMemory = selectLowMemoryMode(dev, inst.n) + traceErrCh = make(chan error, 1) + go func() { + traceErrCh <- inst.initTraceData(spr) + }() + waitTrace := func() error { + if traceErrCh == nil { + return nil + } + err := <-traceErrCh + traceErrCh = nil + return err + } + + var err error + msmSize := inst.n + msmExtraPoints + pts := gpk.srsPoints + if msmSize > len(pts) { + msmSize = len(pts) + } + if secondaryID, ok, cfgErr := secondaryMSMDeviceID(dev.DeviceID()); cfgErr != nil { + return fail("configure secondary MSM GPU", cfgErr) + } else if ok { + split := inst.n / 2 + if split <= 0 || split >= msmSize { + return fail("configure secondary MSM GPU", fmt.Errorf("invalid split %d for MSM size %d", split, msmSize)) + } + secondary, err := gpu.New(gpu.WithDeviceID(secondaryID)) + if err != nil { + return fail("create secondary GPU device", err) + } + inst.splitMSM = &splitMSMBackend{secondary: secondary, split: split} + inst.splitMSM.msm0, err = NewG1MSM(dev, pts[:split], 0) + if err != nil { + return fail("create primary split MSM", err) + } + inst.splitMSM.msm1, err = NewG1MSM(secondary, pts[split:msmSize], 0) + if err != nil { + return fail("create secondary split MSM", err) + } + } else { + inst.msm, err = NewG1MSM(dev, pts[:msmSize], 0) + if err != nil { + return fail("create MSM", err) + } + } + gpk.srsPoints = nil // ownership transferred; free heap copy + + if !inst.lowMemory { + if perr := inst.pinMSMWorkBuffers(); perr != nil { + return fail("pin MSM work buffers", perr) + } + } + + if inst.lowMemory { + if err := inst.offloadMSMPoints(); err != nil { + return fail("offload MSM points", err) + } + } + + inst.fftDom, err = NewFFTDomain(dev, inst.n) + if err != nil { + return fail("create FFT domain", err) + } + + if inst.lowMemory { + inst.gpuWork, err = NewFrVector(dev, inst.n) + if err != nil { + return fail("alloc low-memory GPU work buffer", err) + } + if err := dev.InitMultiStream(); err != nil { + return fail("init multi-stream", err) + } + publishMSMReady() + inst.initHostBufs() + publishCommitReady() + } + + if err := waitTrace(); err != nil { + return fail("init circuit data", err) + } + + inst.dPerm, err = DeviceAllocCopyInt64(dev, inst.permutation) + if err != nil { + return fail("upload permutation", err) + } + + if inst.lowMemory { + publishTraceReady() + } + + if err := inst.initCanonicalGPU(); err != nil { + return fail("init canonical", err) + } + + if inst.lowMemory { + inst.publishCanonicalReady(nil) + return inst, nil + } + + if err := inst.uploadPolynomials(); err != nil { + return fail("upload polynomials", err) + } + + if err := inst.allocPersistentBufs(); err != nil { + return fail("alloc persistent GPU buffers", err) + } + + inst.initHostBufs() + publishMSMReady() + publishCommitReady() + publishTraceReady() + inst.publishCanonicalReady(nil) + return inst, nil +} + +func (inst *gpuInstance) publishCanonicalReady(err error) { + inst.canonicalOnce.Do(func() { + inst.canonicalErr = err + close(inst.canonicalReady) + }) +} + +func (inst *gpuInstance) waitCanonicalReady() error { + if inst.canonicalReady == nil { + return nil + } + <-inst.canonicalReady + return inst.canonicalErr +} + +func selectLowMemoryMode(dev *gpu.Device, n int) bool { + if os.Getenv("GNARK_GPU_PLONK2_FORCE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode forced for n=%d", n) + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode disabled for n=%d", n) + return false + } + free, total, err := dev.MemGetInfo() + if err != nil { + low := n >= 1<<25 + log.Printf("plonk2: low-memory GPU mode=%t for n=%d; mem query failed: %v", low, n, err) + return low + } + vecBytes := uint64(n) * uint64(fr.Bytes) + estimatedResident := vecBytes * 24 + low := estimatedResident > total/2 + log.Printf( + "plonk2: low-memory GPU mode=%t n=%d vecBytes=%d estimatedResident=%d freeVRAM=%d totalVRAM=%d", + low, n, vecBytes, estimatedResident, free, total, + ) + return low +} + +func secondaryMSMDeviceID(primaryID int) (int, bool, error) { + raw := os.Getenv("GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID") + if raw == "" { + return 0, false, nil + } + id, err := strconv.Atoi(raw) + if err != nil { + return 0, false, fmt.Errorf("invalid GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID %q: %w", raw, err) + } + if id == primaryID { + return 0, false, fmt.Errorf("secondary device matches primary device %d", primaryID) + } + if id < 0 { + return 0, false, fmt.Errorf("secondary device id must be non-negative, got %d", id) + } + return id, true, nil +} + +func (inst *gpuInstance) pinMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.PinWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.PinWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.PinWorkBuffers() +} + +func (inst *gpuInstance) releaseMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReleaseWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReleaseWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReleaseWorkBuffers() +} + +func (inst *gpuInstance) offloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.OffloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.OffloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.OffloadPoints() +} + +func (inst *gpuInstance) reloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReloadPoints() +} + +// allocPersistentBufs allocates GPU work buffers that persist across proofs. +// Avoids per-proof cudaMalloc/Free overhead (~3 ms per 64 MB alloc × 20 bufs). +func (inst *gpuInstance) allocPersistentBufs() error { + n := inst.n + alloc := func() (*FrVector, error) { + return NewFrVector(inst.dev, n) + } + wb := &inst.qWb + // Flat list mirrors the free loop in close() — keep in sync. + named := []*(*FrVector){ + &inst.gpuWork, + &wb.L, &wb.R, &wb.O, &wb.Z, + &wb.S1, &wb.S2, &wb.S3, &wb.Result, + &wb.LCan, &wb.RCan, &wb.OCan, &wb.ZCan, + &wb.QkSrc, &wb.LinResult, &wb.LinW, + } + for _, p := range named { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + *p = v + } + for k := range wb.CosetBlock { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.CosetBlock[k] = v + } + if len(inst.commitmentInfo) > 0 { + wb.Pi2Src = make([]*FrVector, len(inst.commitmentInfo)) + for i := range wb.Pi2Src { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.Pi2Src[i] = v + } + } + // Create multi-stream upfront so the quotient pipeline can use it immediately. + return inst.dev.InitMultiStream() +} + +func (inst *gpuInstance) initCircuitShape(spr *cs.SparseR1CS) error { + nbConstraints := spr.GetNbConstraints() + sizeSystem := uint64(nbConstraints + len(spr.Public)) + inst.domain0 = fft.NewDomain(sizeSystem, fft.WithoutPrecompute()) + n := int(inst.domain0.Cardinality) + if n != inst.n { + return fmt.Errorf("domain size mismatch: spr=%d SRS=%d", n, inst.n) + } + inst.log2n = uint(bits.TrailingZeros(uint(n))) + inst.nbPublicVariables = len(spr.Public) + inst.commitmentInfo = inst.vk.CommitmentConstraintIndexes + return nil +} + +func (inst *gpuInstance) initTraceData(spr *cs.SparseR1CS) error { + trace := curplonk.NewTrace(spr, inst.domain0) + inst.qlCanonical = fr.Vector(trace.Ql.Coefficients()) + inst.qrCanonical = fr.Vector(trace.Qr.Coefficients()) + inst.qmCanonical = fr.Vector(trace.Qm.Coefficients()) + inst.qoCanonical = fr.Vector(trace.Qo.Coefficients()) + inst.s1Canonical = fr.Vector(trace.S1.Coefficients()) + inst.s2Canonical = fr.Vector(trace.S2.Coefficients()) + inst.s3Canonical = fr.Vector(trace.S3.Coefficients()) + + inst.qkLagrange = make(fr.Vector, inst.n) + copy(inst.qkLagrange, trace.Qk.Coefficients()) + inst.qkFixedCanonical = fr.Vector(trace.Qk.Coefficients()) + + inst.qcpCanonical = make([]fr.Vector, len(trace.Qcp)) + for i, p := range trace.Qcp { + inst.qcpCanonical[i] = fr.Vector(p.Coefficients()) + } + inst.permutation = trace.S + return nil +} + +func (inst *gpuInstance) initCanonicalGPU() error { + n := inst.n + gpuWork, err := NewFrVector(inst.dev, n) + if err != nil { + return fmt.Errorf("alloc work vector: %w", err) + } + defer gpuWork.Free() + + iFFTSelector := func(v fr.Vector) { + gpuWork.CopyFromHost(v) + inst.fftDom.BitReverse(gpuWork) + inst.fftDom.FFTInverse(gpuWork) + gpuWork.CopyToHost(v) + } + + for _, v := range []fr.Vector{ + inst.qlCanonical, inst.qrCanonical, inst.qmCanonical, inst.qoCanonical, + inst.qkFixedCanonical, inst.s1Canonical, inst.s2Canonical, inst.s3Canonical, + } { + iFFTSelector(v) + } + for _, v := range inst.qcpCanonical { + iFFTSelector(v) + } + + return inst.dev.Sync() +} + +func (inst *gpuInstance) uploadPolynomials() error { + upload := func(data fr.Vector) (*FrVector, error) { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, err + } + v.CopyFromHost(data) + return v, nil + } + var err error + if inst.dQl, err = upload(inst.qlCanonical); err != nil { + return fmt.Errorf("upload ql: %w", err) + } + if inst.dQr, err = upload(inst.qrCanonical); err != nil { + return fmt.Errorf("upload qr: %w", err) + } + if inst.dQm, err = upload(inst.qmCanonical); err != nil { + return fmt.Errorf("upload qm: %w", err) + } + if inst.dQo, err = upload(inst.qoCanonical); err != nil { + return fmt.Errorf("upload qo: %w", err) + } + if inst.dS1, err = upload(inst.s1Canonical); err != nil { + return fmt.Errorf("upload s1: %w", err) + } + if inst.dS2, err = upload(inst.s2Canonical); err != nil { + return fmt.Errorf("upload s2: %w", err) + } + if inst.dS3, err = upload(inst.s3Canonical); err != nil { + return fmt.Errorf("upload s3: %w", err) + } + if inst.dQkFixed, err = upload(inst.qkFixedCanonical); err != nil { + return fmt.Errorf("upload qkFixed: %w", err) + } + inst.dQcp = make([]*FrVector, len(inst.qcpCanonical)) + for i, v := range inst.qcpCanonical { + if inst.dQcp[i], err = upload(v); err != nil { + return fmt.Errorf("upload qcp[%d]: %w", i, err) + } + } + return nil +} + +func (inst *gpuInstance) close() { + if inst.msm != nil { + inst.msm.Close() + inst.msm = nil + } + if inst.splitMSM != nil { + if inst.splitMSM.msm0 != nil { + inst.splitMSM.msm0.Close() + } + if inst.splitMSM.msm1 != nil { + inst.splitMSM.msm1.Close() + } + if inst.splitMSM.secondary != nil { + _ = inst.splitMSM.secondary.Close() + } + inst.splitMSM = nil + } + if inst.fftDom != nil { + inst.fftDom.Close() + inst.fftDom = nil + } + if inst.dPerm != nil { + DeviceFreePtr(inst.dPerm) + inst.dPerm = nil + } + for _, v := range []*FrVector{inst.dQl, inst.dQr, inst.dQm, inst.dQo, + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed} { + if v != nil { + v.Free() + } + } + inst.dQl, inst.dQr, inst.dQm, inst.dQo = nil, nil, nil, nil + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed = nil, nil, nil, nil + for _, v := range inst.dQcp { + if v != nil { + v.Free() + } + } + inst.dQcp = nil + // Free persistent work buffers (mirrors the alloc list in allocPersistentBufs). + wb := &inst.qWb + for _, v := range []*FrVector{ + inst.gpuWork, + wb.L, wb.R, wb.O, wb.Z, wb.S1, wb.S2, wb.S3, wb.Result, + wb.LCan, wb.RCan, wb.OCan, wb.ZCan, wb.QkSrc, wb.LinResult, wb.LinW, + } { + if v != nil { + v.Free() + } + } + for k := range wb.CosetBlock { + if wb.CosetBlock[k] != nil { + wb.CosetBlock[k].Free() + } + } + for _, v := range wb.Pi2Src { + if v != nil { + v.Free() + } + } + inst.gpuWork = nil + inst.qWb = quotientWorkBufs{} + inst.hBufs.free() +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuProver — per-proof mutable state +// ───────────────────────────────────────────────────────────────────────────── + +type gpuProver struct { + inst *gpuInstance + instMu sync.Mutex + waitInst func() (*gpuInstance, error) + waitMSMInst func() (*gpuInstance, error) + waitCommitInst func() (*gpuInstance, error) + + proof curplonk.Proof + fs *fiatshamir.Transcript + + commitmentInfo constraint.PlonkCommitments + commitmentVal []fr.Element + pi2Canonical [][]fr.Element + pi2DeviceReady []bool + solverOpts []solver.Option + kzgFoldingHash hash.Hash + htfFunc hash.Hash + + evalL, evalR, evalO fr.Vector + wWitness fr.Vector + bpL, bpR, bpO, bpZ *iop.Polynomial + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + h1, h2, h3 []fr.Element + gamma, beta, alpha, zeta fr.Element + + logTime func(string) +} + +// ─── Prove phases ───────────────────────────────────────────────────────────── + +func (p *gpuProver) ensureInst() (*gpuInstance, error) { + p.instMu.Lock() + if p.inst != nil { + inst := p.inst + p.instMu.Unlock() + return inst, nil + } + waitInst := p.waitInst + p.instMu.Unlock() + if waitInst == nil { + return nil, errors.New("gpu instance is not initialized") + } + inst, err := waitInst() + if err != nil { + return nil, err + } + p.instMu.Lock() + if p.inst == nil { + p.inst = inst + } + inst = p.inst + p.instMu.Unlock() + return inst, nil +} + +func (p *gpuProver) initBlindingPolynomials() { + p.bpL = getRandomPolynomial(orderBlindingL) + p.bpR = getRandomPolynomial(orderBlindingR) + p.bpO = getRandomPolynomial(orderBlindingO) + p.bpZ = getRandomPolynomial(orderBlindingZ) +} + +func (p *gpuProver) solve(spr *cs.SparseR1CS, fullWitness witness.Witness) error { + solverOpts := append([]solver.Option(nil), p.solverOpts...) + if len(p.commitmentInfo) > 0 { + bsb22ID := solver.GetHintID(fcs.Bsb22CommitmentComputePlaceholder) + solverOpts = append(solverOpts, solver.OverrideHint(bsb22ID, func(_ *big.Int, ins, outs []*big.Int) error { + waitMSMInst := p.waitMSMInst + if waitMSMInst == nil { + waitMSMInst = p.waitCommitInst + } + if waitMSMInst == nil { + waitMSMInst = p.ensureInst + } + inst, err := waitMSMInst() + if err != nil { + return err + } + n := inst.n + commDepth := int(ins[0].Int64()) + ins = ins[1:] + ci := p.commitmentInfo[commDepth] + committedValues := make([]fr.Element, inst.domain0.Cardinality) + offset := inst.nbPublicVariables + for i := range ins { + committedValues[offset+ci.Committed[i]].SetBigInt(ins[i]) + } + committedValues[offset+ci.CommitmentIndex].SetRandom() + committedValues[offset+spr.GetNbConstraints()-1].SetRandom() + + inst.gpuWork.CopyFromHost(fr.Vector(committedValues[:n])) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if commDepth < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[commDepth] != nil { + inst.qWb.Pi2Src[commDepth].CopyFromDevice(inst.gpuWork) + p.pi2DeviceReady[commDepth] = true + } + canonicalBuf := make(fr.Vector, n) + inst.gpuWork.CopyToHost(canonicalBuf) + p.pi2Canonical[commDepth] = canonicalBuf + + commitment, err := inst.commit(canonicalBuf) + if err != nil { + return err + } + p.proof.Bsb22Commitments[commDepth] = commitment + + p.htfFunc.Write(p.proof.Bsb22Commitments[commDepth].Marshal()) + hashBts := p.htfFunc.Sum(nil) + p.htfFunc.Reset() + nbBuf := fr.Bytes + if p.htfFunc.Size() < fr.Bytes { + nbBuf = p.htfFunc.Size() + } + p.commitmentVal[commDepth].SetBytes(hashBts[:nbBuf]) + p.commitmentVal[commDepth].BigInt(outs[0]) + return nil + })) + } + + solution_, err := spr.Solve(fullWitness, solverOpts...) + if err != nil { + return fmt.Errorf("solve: %w", err) + } + solution := solution_.(*cs.SparseR1CSSolution) + p.evalL = fr.Vector(solution.L) + p.evalR = fr.Vector(solution.R) + p.evalO = fr.Vector(solution.O) + + var ok bool + p.wWitness, ok = fullWitness.Vector().(fr.Vector) + if !ok { + return errors.New("invalid witness type") + } + return nil +} + +func (p *gpuProver) completeQk() { + inst, err := p.ensureInst() + if err != nil { + panic(err) + } + p.qkCoeffs = inst.hBufs.qkCoeffs + copy(p.qkCoeffs, inst.qkLagrange) + copy(p.qkCoeffs, p.wWitness[:inst.nbPublicVariables]) + for i := range p.commitmentInfo { + p.qkCoeffs[inst.nbPublicVariables+p.commitmentInfo[i].CommitmentIndex] = p.commitmentVal[i] + } +} + +// commitToLRO overlaps the iFFT of L,R,O with Qk patching (via waitQk) and +// blinding-polynomial generation (via waitBlinding), both of which complete +// concurrently in sibling goroutines. +func (p *gpuProver) commitToLRO(inst *gpuInstance, waitQk, waitBlinding func() error) error { + hb := &inst.hBufs + + gpuToCanonical := func(lagrange, dst fr.Vector, dstDevice *FrVector) { + inst.gpuWork.CopyFromHost(lagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if dstDevice != nil { + dstDevice.CopyFromDevice(inst.gpuWork) + } + inst.gpuWork.CopyToHost(dst) + } + + if inst.lowMemory { + gpuToCanonical(p.evalL, hb.lCanonical, nil) + gpuToCanonical(p.evalR, hb.rCanonical, nil) + gpuToCanonical(p.evalO, hb.oCanonical, nil) + } else { + gpuToCanonical(p.evalL, hb.lCanonical, inst.qWb.LCan) + gpuToCanonical(p.evalR, hb.rCanonical, inst.qWb.RCan) + gpuToCanonical(p.evalO, hb.oCanonical, inst.qWb.OCan) + } + + if err := waitQk(); err != nil { + return err + } + inst.gpuWork.CopyFromHost(p.qkCoeffs) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if inst.lowMemory { + inst.gpuWork.CopyToHost(p.qkCoeffs) + } else { + inst.qWb.QkSrc.CopyFromDevice(inst.gpuWork) + p.qkCoeffs = nil + } + + if err := waitBlinding(); err != nil { + return err + } + + var blindWG sync.WaitGroup + blindWG.Add(3) + go func() { defer blindWG.Done(); p.lBlinded = blindInto(hb.lBlinded, hb.lCanonical, p.bpL) }() + go func() { defer blindWG.Done(); p.rBlinded = blindInto(hb.rBlinded, hb.rCanonical, p.bpR) }() + go func() { defer blindWG.Done(); p.oBlinded = blindInto(hb.oBlinded, hb.oCanonical, p.bpO) }() + blindWG.Wait() + if !inst.lowMemory { + SubtractBlindingHead(inst.qWb.LCan, p.bpL.Coefficients()) + SubtractBlindingHead(inst.qWb.RCan, p.bpR.Coefficients()) + SubtractBlindingHead(inst.qWb.OCan, p.bpO.Coefficients()) + } + + p.logTime("iFFT L,R,O,Qk + blind") + + lroCommits, err := inst.commitN(p.lBlinded, p.rBlinded, p.oBlinded) + if err != nil { + return err + } + p.proof.LRO[0] = lroCommits[0] + p.proof.LRO[1] = lroCommits[1] + p.proof.LRO[2] = lroCommits[2] + + p.logTime("MSM commit L,R,O") + return nil +} + +func (p *gpuProver) deriveGammaBeta() error { + inst := p.inst + if err := bindPublicData(p.fs, "gamma", inst.vk, p.wWitness[:inst.nbPublicVariables]); err != nil { + return err + } + var err error + p.gamma, err = deriveRandomness(p.fs, "gamma", &p.proof.LRO[0], &p.proof.LRO[1], &p.proof.LRO[2]) + if err != nil { + return err + } + p.beta, err = deriveRandomness(p.fs, "beta") + if err != nil { + return err + } + p.wWitness = nil + p.logTime("derive gamma,beta") + return nil +} + +func (p *gpuProver) buildZAndCommit() error { + inst := p.inst + + zLagrange, err := buildZGPU(inst, inst.gpuWork, p.evalL, p.evalR, p.evalO, p.beta, p.gamma) + if err != nil { + return fmt.Errorf("build Z: %w", err) + } + p.evalL, p.evalR, p.evalO = nil, nil, nil + p.logTime("build Z") + + hb := &inst.hBufs + inst.gpuWork.CopyFromHost(zLagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + inst.gpuWork.CopyToHost(hb.zLagrange) + p.zBlinded = blindInto(hb.zBlinded, hb.zLagrange, p.bpZ) + if !inst.lowMemory { + inst.qWb.ZCan.CopyFromDevice(inst.gpuWork) + SubtractBlindingHead(inst.qWb.ZCan, p.bpZ.Coefficients()) + } + + zCommit, err := inst.commit(p.zBlinded) + if err != nil { + return err + } + p.proof.Z = zCommit + p.logTime("iFFT+commit Z") + + alphaDeps := make([]*curve.G1Affine, len(p.proof.Bsb22Commitments)+1) + for i := range p.proof.Bsb22Commitments { + alphaDeps[i] = &p.proof.Bsb22Commitments[i] + } + alphaDeps[len(alphaDeps)-1] = &p.proof.Z + var aerr error + p.alpha, aerr = deriveRandomness(p.fs, "alpha", alphaDeps...) + if aerr != nil { + return aerr + } + p.logTime("derive alpha") + return nil +} + +func (p *gpuProver) computeQuotientAndCommit() error { + inst := p.inst + if err := inst.waitCanonicalReady(); err != nil { + return fmt.Errorf("initialize canonical selector data: %w", err) + } + + pointsOffloaded := false + if inst.shouldOffloadMSMForQuotient() { + if err := inst.offloadMSMPoints(); err != nil { + return fmt.Errorf("offload MSM points: %w", err) + } + pointsOffloaded = true + if err := inst.releaseMSMWorkBuffers(); err != nil { + return fmt.Errorf("release MSM work buffers: %w", err) + } + } + defer func() { + if pointsOffloaded { + _ = inst.reloadMSMPoints() + if !inst.lowMemory { + _ = inst.pinMSMWorkBuffers() + } + } + }() + + var qErr error + p.h1, p.h2, p.h3, qErr = computeNumeratorGPU( + inst, inst.gpuWork, + p.lBlinded, p.rBlinded, p.oBlinded, p.zBlinded, + p.qkCoeffs, p.pi2Canonical, p.pi2DeviceReady, + p.alpha, p.beta, p.gamma, + ) + if qErr != nil { + return fmt.Errorf("compute quotient: %w", qErr) + } + + p.logTime("quotient GPU") + + if pointsOffloaded { + if err := inst.reloadMSMPoints(); err != nil { + return fmt.Errorf("reload MSM points: %w", err) + } + if !inst.lowMemory { + if err := inst.pinMSMWorkBuffers(); err != nil { + return fmt.Errorf("re-pin MSM work buffers: %w", err) + } + } + pointsOffloaded = false + } + hCommits, err := inst.commitN(p.h1, p.h2, p.h3) + if err != nil { + return err + } + p.proof.H[0] = hCommits[0] + p.proof.H[1] = hCommits[1] + p.proof.H[2] = hCommits[2] + p.logTime("MSM commit h1,h2,h3") + + var zetaErr error + p.zeta, zetaErr = deriveRandomness(p.fs, "zeta", &p.proof.H[0], &p.proof.H[1], &p.proof.H[2]) + if zetaErr != nil { + return zetaErr + } + return nil +} + +func (inst *gpuInstance) shouldOffloadMSMForQuotient() bool { + if inst.lowMemory { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_FORCE_MSM_OFFLOAD") != "" { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_MSM_OFFLOAD") != "" { + return false + } + free, _, err := inst.dev.MemGetInfo() + if err != nil { + return true + } + reserve := uint64(inst.n) * uint64(fr.Bytes) * 8 + const minReserve = 2 << 30 + if reserve < minReserve { + reserve = minReserve + } + return free < reserve +} + +func (p *gpuProver) openAndFinalize() error { + inst := p.inst + + var zetaShifted fr.Element + zetaShifted.Mul(&p.zeta, &inst.domain0.Generator) + + openZPoly := inst.hBufs.openZBuf[:len(p.zBlinded)] + copy(openZPoly, p.zBlinded) + bzuzetaCh := make(chan fr.Element, 1) + go func() { + parallelHornerQuotient(openZPoly, zetaShifted) + bzuzetaCh <- openZPoly[0] + }() + + // Evaluate host-only blinded polys on CPU while GPU-resident selector polys + // are evaluated on device. + var blzeta, brzeta, bozeta, s1Zeta, s2Zeta fr.Element + var evalWG sync.WaitGroup + evalWG.Add(3) + go func() { defer evalWG.Done(); blzeta = polyEvalParallel(p.lBlinded, p.zeta) }() + go func() { defer evalWG.Done(); brzeta = polyEvalParallel(p.rBlinded, p.zeta) }() + go func() { defer evalWG.Done(); bozeta = polyEvalParallel(p.oBlinded, p.zeta) }() + + if inst.lowMemory { + s1Zeta = polyEvalParallel(inst.s1Canonical, p.zeta) + s2Zeta = polyEvalParallel(inst.s2Canonical, p.zeta) + } else { + s1Zeta = PolyEvalGPU(inst.dev, inst.dS1, p.zeta) + s2Zeta = PolyEvalGPU(inst.dev, inst.dS2, p.zeta) + } + + qcpzeta := make([]fr.Element, len(p.commitmentInfo)) + for i := range p.commitmentInfo { + if inst.lowMemory { + qcpzeta[i] = polyEvalParallel(inst.qcpCanonical[i], p.zeta) + } else { + qcpzeta[i] = PolyEvalGPU(inst.dev, inst.dQcp[i], p.zeta) + } + } + evalWG.Wait() + + bzuzeta := <-bzuzetaCh + p.proof.ZShiftedOpening.ClaimedValue.Set(&bzuzeta) + + var linPol []fr.Element + if inst.lowMemory { + linPol = innerComputeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.h1, p.h2, p.h3, + ) + } else { + linPol = computeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.pi2DeviceReady, p.h1, p.h2, p.h3, + ) + } + p.h1, p.h2, p.h3, p.pi2Canonical, p.pi2DeviceReady = nil, nil, nil, nil, nil + + zOpenCommit, err := inst.commit(openZPoly[1:]) + if err != nil { + return err + } + p.proof.ZShiftedOpening.H = zOpenCommit + p.logTime("eval+linearize+open Z") + + linPolZetaCh := make(chan fr.Element, 1) + go func() { + linPolZetaCh <- polyEvalParallel(linPol, p.zeta) + }() + + linPolDigest, err := inst.commit(linPol) + if err != nil { + return err + } + p.logTime("MSM commit linPol") + + nPolysToOpen := 6 + len(inst.qcpCanonical) + claimedValues := make([]fr.Element, nPolysToOpen) + claimedValues[0] = <-linPolZetaCh + claimedValues[1] = blzeta + claimedValues[2] = brzeta + claimedValues[3] = bozeta + claimedValues[4] = s1Zeta + claimedValues[5] = s2Zeta + for i := range inst.qcpCanonical { + claimedValues[6+i] = qcpzeta[i] + } + + polysToOpen := make([][]fr.Element, nPolysToOpen) + polysToOpen[0] = linPol + polysToOpen[1] = p.lBlinded + polysToOpen[2] = p.rBlinded + polysToOpen[3] = p.oBlinded + polysToOpen[4] = inst.s1Canonical + polysToOpen[5] = inst.s2Canonical + for i := range inst.qcpCanonical { + polysToOpen[6+i] = inst.qcpCanonical[i] + } + + digestsToOpen := make([]curve.G1Affine, nPolysToOpen) + digestsToOpen[0] = linPolDigest + digestsToOpen[1] = p.proof.LRO[0] + digestsToOpen[2] = p.proof.LRO[1] + digestsToOpen[3] = p.proof.LRO[2] + digestsToOpen[4] = inst.vk.S[0] + digestsToOpen[5] = inst.vk.S[1] + copy(digestsToOpen[6:], inst.vk.Qcp) + + p.proof.BatchedProof, err = gpuBatchOpen( + inst.commit, + polysToOpen, digestsToOpen, claimedValues, + p.zeta, + p.kzgFoldingHash, + p.proof.ZShiftedOpening.ClaimedValue.Marshal(), + ) + if err != nil { + return fmt.Errorf("batch opening: %w", err) + } + p.logTime("batch opening") + return nil +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProve — top-level prove API +// ───────────────────────────────────────────────────────────────────────────── + +func GPUProve(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, fullWitness witness.Witness, opts ...backend.ProverOption) (*curplonk.Proof, error) { + proverCfg, err := backend.NewProverConfig(opts...) + if err != nil { + return nil, fmt.Errorf("create prover config: %w", err) + } + if proverCfg.HashToFieldFn == nil { + proverCfg.HashToFieldFn = newHTF([]byte("BSB22-Plonk")) + } + + gpk.mu.Lock() + defer gpk.mu.Unlock() + + if gpk.Vk == nil { + return nil, errors.New("gpu: proving key missing verifying key") + } + + proveStart := time.Now() + logTime := func(label string) { + log.Printf(" [GPUProve n=%d] %s: %v", gpk.n, label, time.Since(proveStart)) + } + + var commitmentInfo constraint.PlonkCommitments + if spr.CommitmentInfo != nil { + commitmentInfo = spr.CommitmentInfo.(constraint.PlonkCommitments) + } + + nbCommitments := len(commitmentInfo) + newProof := &curplonk.Proof{ + Bsb22Commitments: make([]curve.G1Affine, nbCommitments), + } + + msmInstReady := make(chan struct{}) + commitInstReady := make(chan struct{}) + traceInstReady := make(chan struct{}) + var ( + msmInstPublishOnce sync.Once + commitInstPublishOnce sync.Once + traceInstPublishOnce sync.Once + msmInst *gpuInstance + commitInst *gpuInstance + traceInst *gpuInstance + msmInstErr error + commitInstErr error + traceInstErr error + ) + publishMSMInst := func(inst *gpuInstance, err error) { + msmInstPublishOnce.Do(func() { + if err != nil { + msmInstErr = err + } else { + msmInst = inst + } + close(msmInstReady) + }) + } + waitMSMInst := func() (*gpuInstance, error) { + <-msmInstReady + if msmInstErr != nil { + return nil, msmInstErr + } + if msmInst == nil { + return nil, errors.New("gpu instance initialization did not publish an MSM-ready instance") + } + return msmInst, nil + } + publishCommitInst := func(inst *gpuInstance, err error) { + commitInstPublishOnce.Do(func() { + if err != nil { + commitInstErr = err + } else { + commitInst = inst + } + close(commitInstReady) + }) + } + waitCommitInst := func() (*gpuInstance, error) { + <-commitInstReady + if commitInstErr != nil { + return nil, commitInstErr + } + if commitInst == nil { + return nil, errors.New("gpu instance initialization did not publish a commitment-ready instance") + } + return commitInst, nil + } + publishTraceInst := func(inst *gpuInstance, err error) { + traceInstPublishOnce.Do(func() { + if err != nil { + traceInstErr = err + } else { + traceInst = inst + gpk.inst = inst + } + close(traceInstReady) + }) + } + waitInst := func() (*gpuInstance, error) { + <-traceInstReady + if traceInstErr != nil { + return nil, traceInstErr + } + if traceInst == nil { + return nil, errors.New("gpu instance initialization did not publish a trace-ready instance") + } + return traceInst, nil + } + + p := &gpuProver{ + proof: *newProof, + fs: fiatshamir.NewTranscript(proverCfg.ChallengeHash, "gamma", "beta", "alpha", "zeta"), + commitmentInfo: commitmentInfo, + commitmentVal: make([]fr.Element, nbCommitments), + pi2Canonical: make([][]fr.Element, nbCommitments), + pi2DeviceReady: make([]bool, nbCommitments), + solverOpts: proverCfg.SolverOpts, + kzgFoldingHash: proverCfg.KZGFoldingHash, + htfFunc: proverCfg.HashToFieldFn, + logTime: logTime, + waitInst: waitInst, + waitMSMInst: waitMSMInst, + waitCommitInst: waitCommitInst, + } + + // Overlap CPU solve with blinding-polynomial init and Qk patching, then + // feed results into a sequential GPU pipeline. Hides the solve latency + // (~400 ms at n=2^18) behind unrelated work; recovers ~20-30% end-to-end. + chSolved := make(chan struct{}) + chBlinding := make(chan struct{}) + chQk := make(chan struct{}) + + g, gctx := errgroup.WithContext(context.Background()) + + waitCh := func(ch <-chan struct{}) error { + select { + case <-gctx.Done(): + return gctx.Err() + case <-ch: + return nil + } + } + safeGo := func(label string, fn func() error) { + g.Go(func() error { return proveStep(label, fn) }) + } + + safeGo("initGPUInstance", func() error { + if gpk.inst != nil && gpk.inst.dev == dev { + publishMSMInst(gpk.inst, nil) + publishCommitInst(gpk.inst, nil) + publishTraceInst(gpk.inst, nil) + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + msmPublished := false + commitPublished := false + tracePublished := false + inst, err := newGPUInstance(dev, gpk, spr, gpuInstanceReadyHooks{ + msm: func(inst *gpuInstance) { + msmPublished = true + publishMSMInst(inst, nil) + }, + commit: func(inst *gpuInstance) { + commitPublished = true + publishCommitInst(inst, nil) + }, + trace: func(inst *gpuInstance) { + tracePublished = true + publishTraceInst(inst, nil) + logTime("trace-ready GPU instance") + }, + }) + if err != nil { + err = fmt.Errorf("init GPU instance: %w", err) + if !msmPublished { + publishMSMInst(nil, err) + } + if !commitPublished { + publishCommitInst(nil, err) + } + if !tracePublished { + publishTraceInst(nil, err) + } + return err + } + if !msmPublished { + publishMSMInst(inst, nil) + } + if !commitPublished { + publishCommitInst(inst, nil) + } + if !tracePublished { + publishTraceInst(inst, nil) + } + logTime("init GPU instance") + return nil + }) + + safeGo("initBlinding", func() error { + p.initBlindingPolynomials() + close(chBlinding) + return nil + }) + + safeGo("solve", func() error { + if err := p.solve(spr, fullWitness); err != nil { + return err + } + logTime("solve") + close(chSolved) + return nil + }) + + safeGo("completeQk", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + p.completeQk() + close(chQk) + return nil + }) + + safeGo("pipeline", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + commitInst, err := waitCommitInst() + if err != nil { + return err + } + if err := p.commitToLRO( + commitInst, + func() error { return waitCh(chQk) }, + func() error { return waitCh(chBlinding) }, + ); err != nil { + return err + } + if _, err := p.ensureInst(); err != nil { + return err + } + if err := p.deriveGammaBeta(); err != nil { + return err + } + if err := p.buildZAndCommit(); err != nil { + return err + } + if err := p.computeQuotientAndCommit(); err != nil { + return err + } + return p.openAndFinalize() // inst.gpuWork persists (owned by gpuInstance) + }) + + if err := g.Wait(); err != nil { + return nil, err + } + + logTime("total") + result := p.proof + return &result, nil +} + +// proveStep converts a panic in fn to a labeled error so goroutines +// surface panics as normal errors through the errgroup. +func proveStep(label string, fn func() error) (err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("%s panic: %v", label, r) + } + }() + return fn() +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helper functions (ported from gpu/plonk/prove.go) +// ───────────────────────────────────────────────────────────────────────────── + +func buildZGPU( + inst *gpuInstance, gpuWork *FrVector, + evalL, evalR, evalO fr.Vector, beta, gamma fr.Element, +) (fr.Vector, error) { + dev := inst.dev + domain0 := inst.domain0 + + gpuR := inst.qWb.R + gpuO := inst.qWb.O + if inst.lowMemory { + var err error + gpuR, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z R buffer: %w", err) + } + defer gpuR.Free() + gpuO, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z O buffer: %w", err) + } + defer gpuO.Free() + } + + gpuWork.CopyFromHost(evalL) + gpuR.CopyFromHost(evalR) + gpuO.CopyFromHost(evalO) + + gMul := domain0.FrMultiplicativeGen + var gSq fr.Element + gSq.Mul(&gMul, &gMul) + + PlonkZComputeFactors(gpuWork, gpuR, gpuO, inst.dPerm, + beta, gamma, gMul, gSq, inst.log2n, inst.fftDom) + gpuR.BatchInvert(gpuO) + gpuWork.Mul(gpuWork, gpuR) + ZPrefixProduct(dev, gpuR, gpuWork, gpuO) + gpuR.CopyToHost(inst.hBufs.zLagrange) + return inst.hBufs.zLagrange, nil +} + +func computeNumeratorGPU( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + if inst.lowMemory { + return computeNumeratorGPULowMemory( + inst, gpuWork, + lBlinded, rBlinded, oBlinded, zBlinded, + qkCanonical, pi2Canonical, + alpha, beta, gamma, + ) + } + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + // Pre-allocated buffers from gpuInstance (avoids per-proof cudaMalloc/Free). + wb := &inst.qWb + gpuL, gpuR, gpuO, gpuZ := wb.L, wb.R, wb.O, wb.Z + gpuS1, gpuS2, gpuS3 := wb.S1, wb.S2, wb.S3 + gpuResult := wb.Result + gpuLCan, gpuRCan, gpuOCan, gpuZCan := wb.LCan, wb.RCan, wb.OCan, wb.ZCan + gpuCosetBlocks := wb.CosetBlock + + // Event IDs used for cross-stream synchronisation in the 4-coset loop. + const ( + evS123Done gpu.EventID = 0 // StreamTransfer → StreamCompute: S1/S2/S3 D2D done + evPermDone gpu.EventID = 1 // StreamCompute → StreamTransfer: safe to overwrite gate buffers + evCosetDone gpu.EventID = 3 // StreamCompute → StreamTransfer: full coset k done + ) + + // L/R/O/Z canonical heads were produced on-device by the iFFT phases and + // adjusted for blinding there. Keep them resident for the quotient loop. + for j := range pi2Canonical { + if j >= len(pi2DeviceReady) || pi2DeviceReady[j] { + continue + } + if j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil && len(pi2Canonical[j]) == n { + wb.Pi2Src[j].CopyFromHost(fr.Vector(pi2Canonical[j])) + pi2DeviceReady[j] = true + } + } + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + // Stream 1 must finish before overwriting gpuS1/S2/S3 with the next coset's + // selectors. PermBoundary (end of previous coset) still holds reads on S1/S2/S3. + if k > 0 { + dev.WaitEvent(gpu.StreamTransfer, evCosetDone) + } + + // Stream 1: D2D perm selectors concurrent with L/R/O/Z reduce+FFT on stream 0. + gpuS1.CopyFromDeviceStream(inst.dS1, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dS2, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dS3, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + // Stream 0: reduce blinded canonicals and FFT while D2D runs concurrently. + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + // L₁ denominator inverse: gpuWork[i] = 1/(cosetGen·ω^i - 1) + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) // result is temp; inverses stored in gpuWork + + // l1Scalar = (cosetGen^n - 1) / n = zhZeta / n at this coset + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + // Gate selectors: overlap transfer-stream D2D copies with compute-stream FFTs. + dev.RecordEvent(gpu.StreamCompute, evPermDone) + + dev.WaitEvent(gpu.StreamTransfer, evPermDone) + gpuS1.CopyFromDeviceStream(inst.dQr, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dQm, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dQo, gpu.StreamTransfer) + gpuWork.CopyFromDeviceStream(wb.QkSrc, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + gpuZ.CopyFromDevice(inst.dQl) + fftDom.CosetFFT(gpuZ, cosetGen) + + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + gpuZ.CopyFromDevice(inst.dQcp[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil { + gpuWork.CopyFromDevice(wb.Pi2Src[j]) + } else { + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + // Store the first three coset results on GPU. Keep the fourth in gpuResult. + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + dev.RecordEvent(gpu.StreamCompute, evCosetDone) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func newLowMemorySelectorCache(inst *gpuInstance, allocated *[]*FrVector) lowMemorySelectorCache { + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY_SELECTOR_CACHE") != "" { + return lowMemorySelectorCache{} + } + + upload := func(name string, data fr.Vector) *FrVector { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + log.Printf("plonk2: low-memory selector cache stopped at %s: %v", name, err) + return nil + } + *allocated = append(*allocated, v) + v.CopyFromHost(data) + return v + } + + cache := lowMemorySelectorCache{ + ql: upload("ql", inst.qlCanonical), + qr: upload("qr", inst.qrCanonical), + qm: upload("qm", inst.qmCanonical), + qo: upload("qo", inst.qoCanonical), + s1: upload("s1", inst.s1Canonical), + s2: upload("s2", inst.s2Canonical), + s3: upload("s3", inst.s3Canonical), + } + if len(inst.qcpCanonical) > 0 { + cache.qcp = make([]*FrVector, len(inst.qcpCanonical)) + for i := range inst.qcpCanonical { + cache.qcp[i] = upload(fmt.Sprintf("qcp[%d]", i), inst.qcpCanonical[i]) + } + } + + qcpCached := 0 + for i := range cache.qcp { + if cache.qcp[i] != nil { + qcpCached++ + } + } + log.Printf( + "plonk2: low-memory selector cache ql=%t qr=%t qm=%t qo=%t s1=%t s2=%t s3=%t qcp=%d/%d", + cache.ql != nil, cache.qr != nil, cache.qm != nil, cache.qo != nil, + cache.s1 != nil, cache.s2 != nil, cache.s3 != nil, + qcpCached, len(inst.qcpCanonical), + ) + return cache +} + +func computeNumeratorGPULowMemory( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + if len(qkCanonical) < n { + return nil, nil, nil, fmt.Errorf("low-memory quotient: qk canonical length %d < %d", len(qkCanonical), n) + } + + var allocated []*FrVector + alloc := func(name string) (*FrVector, error) { + v, err := NewFrVector(inst.dev, n) + if err != nil { + return nil, fmt.Errorf("alloc %s: %w", name, err) + } + allocated = append(allocated, v) + return v, nil + } + defer func() { + for _, v := range allocated { + v.Free() + } + }() + + gpuL, err := alloc("L") + if err != nil { + return nil, nil, nil, err + } + gpuR, err := alloc("R") + if err != nil { + return nil, nil, nil, err + } + gpuO, err := alloc("O") + if err != nil { + return nil, nil, nil, err + } + gpuZ, err := alloc("Z") + if err != nil { + return nil, nil, nil, err + } + gpuS1, err := alloc("S1") + if err != nil { + return nil, nil, nil, err + } + gpuS2, err := alloc("S2") + if err != nil { + return nil, nil, nil, err + } + gpuS3, err := alloc("S3") + if err != nil { + return nil, nil, nil, err + } + gpuResult, err := alloc("Result") + if err != nil { + return nil, nil, nil, err + } + gpuLCan, err := alloc("LCan") + if err != nil { + return nil, nil, nil, err + } + gpuRCan, err := alloc("RCan") + if err != nil { + return nil, nil, nil, err + } + gpuOCan, err := alloc("OCan") + if err != nil { + return nil, nil, nil, err + } + gpuZCan, err := alloc("ZCan") + if err != nil { + return nil, nil, nil, err + } + gpuQkSrc, err := alloc("QkSrc") + if err != nil { + return nil, nil, nil, err + } + var gpuCosetBlocks [3]*FrVector + for k := range gpuCosetBlocks { + gpuCosetBlocks[k], err = alloc(fmt.Sprintf("CosetBlock%d", k)) + if err != nil { + return nil, nil, nil, err + } + } + selectorCache := newLowMemorySelectorCache(inst, &allocated) + copySelector := func(dst, device *FrVector, host fr.Vector) { + if device != nil { + dst.CopyFromDevice(device) + return + } + dst.CopyFromHost(host) + } + + gpuLCan.CopyFromHost(fr.Vector(lBlinded[:n])) + gpuRCan.CopyFromHost(fr.Vector(rBlinded[:n])) + gpuOCan.CopyFromHost(fr.Vector(oBlinded[:n])) + gpuZCan.CopyFromHost(fr.Vector(zBlinded[:n])) + gpuQkSrc.CopyFromHost(fr.Vector(qkCanonical[:n])) + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + copySelector(gpuS1, selectorCache.s1, inst.s1Canonical) + copySelector(gpuS2, selectorCache.s2, inst.s2Canonical) + copySelector(gpuS3, selectorCache.s3, inst.s3Canonical) + + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) + + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + copySelector(gpuS1, selectorCache.qr, inst.qrCanonical) + copySelector(gpuS2, selectorCache.qm, inst.qmCanonical) + copySelector(gpuS3, selectorCache.qo, inst.qoCanonical) + gpuWork.CopyFromDevice(gpuQkSrc) + copySelector(gpuZ, selectorCache.ql, inst.qlCanonical) + + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + var qcpDevice *FrVector + if j < len(selectorCache.qcp) { + qcpDevice = selectorCache.qcp[j] + } + copySelector(gpuZ, qcpDevice, inst.qcpCanonical[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("low-memory quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func gpuCommit(msm *G1MSM, coeffs []fr.Element) (curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + var aff curve.G1Affine + aff.FromJacobian(&jacs[0]) + return aff, nil +} + +func gpuCommitN(msm *G1MSM, coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffSets...) + if err != nil { + return nil, err + } + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) commit(coeffs []fr.Element) (curve.G1Affine, error) { + commits, err := inst.commitN(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + return commits[0], nil +} + +func (inst *gpuInstance) commitN(coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + if inst.lowMemory { + if err := inst.reloadMSMPoints(); err != nil { + return nil, fmt.Errorf("reload MSM points: %w", err) + } + defer func() { + _ = inst.releaseMSMWorkBuffers() + _ = inst.offloadMSMPoints() + }() + } + var jacs []curve.G1Jac + var err error + if inst.splitMSM != nil { + jacs, err = MultiExpSplitBatchAt(inst.splitMSM.msm0, inst.splitMSM.msm1, inst.splitMSM.split, coeffSets...) + } else { + jacs, err = inst.msm.MultiExp(coeffSets...) + } + if err != nil { + return nil, err + } + inst.logMSMPhaseTimings(coeffSets...) + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) logMSMPhaseTimings(coeffSets ...[]fr.Element) { + if os.Getenv("GNARK_GPU_PLONK2_LOG_MSM_PHASES") == "" { + return + } + counts := make([]int, len(coeffSets)) + for i := range coeffSets { + counts[i] = len(coeffSets[i]) + } + if inst.splitMSM != nil { + primaryCounts := make([]int, len(coeffSets)) + secondaryCounts := make([]int, len(coeffSets)) + for i, count := range counts { + primaryCounts[i] = inst.splitMSM.split + if count < primaryCounts[i] { + primaryCounts[i] = count + } + secondaryCounts[i] = count - primaryCounts[i] + } + logMSMPhaseTimings(inst.n, "primary", inst.splitMSM.msm0.LastBatchPhaseTimings(), primaryCounts) + logMSMPhaseTimings(inst.n, "secondary", inst.splitMSM.msm1.LastBatchPhaseTimings(), secondaryCounts) + return + } + logMSMPhaseTimings(inst.n, "single", inst.msm.LastBatchPhaseTimings(), counts) +} + +func logMSMPhaseTimings(n int, device string, timings [][9]float32, scalarCounts []int) { + names := [...]string{ + "h2d", "build_pairs", "sort", "boundaries", "accum_seq", + "accum_par", "reduce_partial", "reduce_finalize", "d2h", + } + for i, phase := range timings { + total := float32(0) + for _, ms := range phase { + total += ms + } + scalars := 0 + if i < len(scalarCounts) { + scalars = scalarCounts[i] + } + log.Printf( + " [GPUProve n=%d] MSM phases device=%s set=%d scalars=%d total=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms", + n, device, i, scalars, total, + names[0], phase[0], names[1], phase[1], names[2], phase[2], + names[3], phase[3], names[4], phase[4], names[5], phase[5], + names[6], phase[6], names[7], phase[7], names[8], phase[8], + ) + } +} + +func gpuBatchOpen( + commit func([]fr.Element) (curve.G1Affine, error), + polys [][]fr.Element, + digests []curve.G1Affine, + claimedValues []fr.Element, + point fr.Element, + kzgFoldingHash hash.Hash, + dataTranscript []byte, +) (kzg.BatchOpeningProof, error) { + var res kzg.BatchOpeningProof + res.ClaimedValues = claimedValues + + fsGamma := fiatshamir.NewTranscript(kzgFoldingHash, "gamma") + if err := fsGamma.Bind("gamma", point.Marshal()); err != nil { + return res, err + } + for i := range digests { + if err := fsGamma.Bind("gamma", digests[i].Marshal()); err != nil { + return res, err + } + } + for i := range claimedValues { + if err := fsGamma.Bind("gamma", claimedValues[i].Marshal()); err != nil { + return res, err + } + } + if len(dataTranscript) > 0 { + if err := fsGamma.Bind("gamma", dataTranscript); err != nil { + return res, err + } + } + gammaByte, err := fsGamma.ComputeChallenge("gamma") + if err != nil { + return res, err + } + var gammaChallenge fr.Element + gammaChallenge.SetBytes(gammaByte) + + nbPolys := len(polys) + largestPoly := 0 + for _, p := range polys { + if len(p) > largestPoly { + largestPoly = len(p) + } + } + + gammas := make([]fr.Element, nbPolys) + gammas[0].SetOne() + for i := 1; i < nbPolys; i++ { + gammas[i].Mul(&gammas[i-1], &gammaChallenge) + } + + folded := make(fr.Vector, largestPoly) + nCPU := runtime.NumCPU() + chunkSize := (largestPoly + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < largestPoly; c += chunkSize { + start := c + end := start + chunkSize + if end > largestPoly { + end = largestPoly + } + wg.Add(1) + go func() { + defer wg.Done() + temp := make(fr.Vector, end-start) + for i := range nbPolys { + effEnd := end + if effEnd > len(polys[i]) { + effEnd = len(polys[i]) + } + if start >= effEnd { + continue + } + n := effEnd - start + t := fr.Vector(temp[:n]) + t.ScalarMul(fr.Vector(polys[i][start:effEnd]), &gammas[i]) + f := fr.Vector(folded[start:effEnd]) + f.Add(f, t) + } + }() + } + wg.Wait() + + var foldedEval fr.Element + for i := nbPolys - 1; i >= 0; i-- { + foldedEval.Mul(&foldedEval, &gammaChallenge).Add(&foldedEval, &claimedValues[i]) + } + folded[0].Sub(&folded[0], &foldedEval) + parallelHornerQuotient(folded, point) + h := folded[1:] + + res.H, err = commit(h) + if err != nil { + return res, err + } + return res, nil +} + +func computeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + h1, h2, h3 []fr.Element, +) []fr.Element { + n := inst.n + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + // Pre-allocated GPU buffers from gpuInstance (guaranteed non-nil after newGPUInstance). + gpuResult := inst.qWb.LinResult + gpuW := inst.qWb.LinW + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + PlonkLinearizeStatic( + gpuResult, inst.qWb.ZCan, inst.dS3, + inst.dQl, inst.dQr, inst.dQm, inst.dQo, inst.dQkFixed, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta, + ) + + for j := range qcpZeta { + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[j] != nil { + gpuW.CopyFromDevice(inst.qWb.Pi2Src[j]) + } else { + gpuW.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + gpuResult.AddScalarMul(gpuW, qcpZeta[j]) + } + + var negCoeff fr.Element + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Mul(&negCoeff, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h3[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h2[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Neg(&zhZeta) + gpuW.CopyFromHost(fr.Vector(h1[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + gpuResult.CopyToHost(fr.Vector(blindedZCanonical[:n])) + + for i := n; i < len(blindedZCanonical); i++ { + var t fr.Element + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + return blindedZCanonical +} + +func innerComputeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, + h1, h2, h3 []fr.Element, +) []fr.Element { + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + s3can := []fr.Element(inst.s3Canonical) + cql := []fr.Element(inst.qlCanonical) + cqr := []fr.Element(inst.qrCanonical) + cqm := []fr.Element(inst.qmCanonical) + cqo := []fr.Element(inst.qoCanonical) + cqk := []fr.Element(inst.qkFixedCanonical) + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + + total := len(blindedZCanonical) + nCPU := runtime.NumCPU() + chunkSize := (total + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < total; c += chunkSize { + start := c + end := start + chunkSize + if end > total { + end = total + } + wg.Add(1) + go func() { + defer wg.Done() + var t, t0, t1 fr.Element + for i := start; i < end; i++ { + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(s3can) { + t0.Mul(&s3can[i], &s1) + t.Add(&t, &t0) + } + if i < len(cqm) { + t1.Mul(&cqm[i], &rl) + t.Add(&t, &t1) + t0.Mul(&cql[i], &lZeta) + t.Add(&t, &t0) + t0.Mul(&cqr[i], &rZeta) + t.Add(&t, &t0) + t0.Mul(&cqo[i], &oZeta) + t.Add(&t, &t0) + t.Add(&t, &cqk[i]) + } + for j := range qcpZeta { + if i < len(pi2Canonical[j]) { + t0.Mul(&pi2Canonical[j][i], &qcpZeta[j]) + t.Add(&t, &t0) + } + } + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + }() + } + wg.Wait() + return blindedZCanonical +} + +// ─── Polynomial helpers ─────────────────────────────────────────────────────── + +func blindInto(dst []fr.Element, canonical []fr.Element, bp *iop.Polynomial) []fr.Element { + cbp := bp.Coefficients() + result := dst[:len(canonical)+len(cbp)] + copy(result, canonical) + copy(result[len(canonical):], cbp) + for i := 0; i < len(cbp); i++ { + result[i].Sub(&result[i], &cbp[i]) + } + return result +} + +func getRandomPolynomial(degree int) *iop.Polynomial { + coeffs := make([]fr.Element, degree+1) + for i := range coeffs { + coeffs[i].SetRandom() + } + return iop.NewPolynomial(&coeffs, iop.Form{Basis: iop.Canonical, Layout: iop.Regular}) +} + +func parallelHornerQuotient(poly []fr.Element, z fr.Element) { + n := len(poly) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + for i := n - 2; i >= 0; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + return + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + var wg sync.WaitGroup + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + wg.Add(1) + go func(lo, hi int) { + defer wg.Done() + for i := hi - 2; i >= lo; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + }(lo, hi) + } + wg.Wait() + zk := expElement(z, chunkSize) + carries := make([]fr.Element, numChunks) + for c := numChunks - 2; c >= 0; c-- { + nextLo := (c + 1) * chunkSize + nextLen := chunkSize + if nextLo+nextLen > n { + nextLen = n - nextLo + } + zkc := zk + if nextLen != chunkSize { + zkc = expElement(z, nextLen) + } + var tmp fr.Element + tmp.Mul(&carries[c+1], &zkc) + carries[c].Add(&poly[nextLo], &tmp) + } + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + if carries[c].IsZero() { + continue + } + wg.Add(1) + go func(lo, hi, c int) { + defer wg.Done() + var zPow fr.Element + zPow.Set(&z) + for i := hi - 1; i >= lo; i-- { + var corr fr.Element + corr.Mul(&zPow, &carries[c]) + poly[i].Add(&poly[i], &corr) + zPow.Mul(&zPow, &z) + } + }(lo, hi, c) + } + wg.Wait() +} + +func expElement(z fr.Element, exp int) fr.Element { + var base, acc fr.Element + base.Set(&z) + acc.SetOne() + for exp > 0 { + if exp&1 != 0 { + acc.Mul(&acc, &base) + } + base.Square(&base) + exp >>= 1 + } + return acc +} + +// ─── Fiat-Shamir helpers ────────────────────────────────────────────────────── + +func bindPublicData(fs *fiatshamir.Transcript, challenge string, vk *curplonk.VerifyingKey, publicInputs []fr.Element) error { + for _, f := range []func() []byte{ + func() []byte { return vk.S[0].Marshal() }, + func() []byte { return vk.S[1].Marshal() }, + func() []byte { return vk.S[2].Marshal() }, + func() []byte { return vk.Ql.Marshal() }, + func() []byte { return vk.Qr.Marshal() }, + func() []byte { return vk.Qm.Marshal() }, + func() []byte { return vk.Qo.Marshal() }, + func() []byte { return vk.Qk.Marshal() }, + } { + if err := fs.Bind(challenge, f()); err != nil { + return err + } + } + for i := range vk.Qcp { + if err := fs.Bind(challenge, vk.Qcp[i].Marshal()); err != nil { + return err + } + } + for i := range publicInputs { + if err := fs.Bind(challenge, publicInputs[i].Marshal()); err != nil { + return err + } + } + return nil +} + +func deriveRandomness(fs *fiatshamir.Transcript, challenge string, points ...*curve.G1Affine) (fr.Element, error) { + var buf [curve.SizeOfG1AffineUncompressed]byte + var r fr.Element + for _, p := range points { + buf = p.RawBytes() + if err := fs.Bind(challenge, buf[:]); err != nil { + return r, err + } + } + b, err := fs.ComputeChallenge(challenge) + if err != nil { + return r, err + } + r.SetBytes(b) + return r, nil +} + +func newHTF(domain []byte) hash.Hash { + return htf.New(domain) +} + +// ─── suppress unused imports ────────────────────────────────────────────────── +var _ = bits.TrailingZeros +var _ = unsafe.Pointer(nil) diff --git a/prover/gpu/plonk2/bls12377/prove_stub.go b/prover/gpu/plonk2/bls12377/prove_stub.go new file mode 100644 index 00000000000..ac6484e154c --- /dev/null +++ b/prover/gpu/plonk2/bls12377/prove_stub.go @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bls12377 + +import ( + "errors" + + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + "github.com/consensys/gnark/backend" + curplonk "github.com/consensys/gnark/backend/plonk/bls12-377" + "github.com/consensys/gnark/backend/witness" + cs "github.com/consensys/gnark/constraint/bls12-377" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +type GPUProvingKey struct { + Vk *curplonk.VerifyingKey +} + +func NewGPUProvingKey(_ []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + return &GPUProvingKey{Vk: vk} +} + +func (gpk *GPUProvingKey) Size() int { return 0 } +func (gpk *GPUProvingKey) Prepare(_ *gpu.Device, _ *cs.SparseR1CS) error { + return errors.New("gpu: cuda required") +} +func (gpk *GPUProvingKey) Close() {} + +func GPUProve(_ *gpu.Device, _ *GPUProvingKey, _ *cs.SparseR1CS, _ witness.Witness, _ ...backend.ProverOption) (*curplonk.Proof, error) { + return nil, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bn254/cgo.go b/prover/gpu/plonk2/bn254/cgo.go new file mode 100644 index 00000000000..c2c179f3a68 --- /dev/null +++ b/prover/gpu/plonk2/bn254/cgo.go @@ -0,0 +1,44 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#cgo LDFLAGS: -L${SRCDIR}/../../cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../../cuda/include + +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// curve returns the C curve identifier for bn254, baked in at generation time. +func curveID() C.gnark_gpu_plonk2_curve_id_t { + return C.gnark_gpu_plonk2_curve_id_t(1) +} + +func devCtx(d *gpu.Device) C.gnark_gpu_context_t { + return C.gnark_gpu_context_t(d.Handle()) +} + +func toError(code C.gnark_gpu_error_t) error { + switch code { + case C.GNARK_GPU_SUCCESS: + return nil + case C.GNARK_GPU_ERROR_CUDA: + return &gpu.Error{Code: int(code), Message: "CUDA error"} + case C.GNARK_GPU_ERROR_INVALID_ARG: + return &gpu.Error{Code: int(code), Message: "invalid argument"} + case C.GNARK_GPU_ERROR_OUT_OF_MEMORY: + return &gpu.Error{Code: int(code), Message: "out of GPU memory"} + case C.GNARK_GPU_ERROR_SIZE_MISMATCH: + return &gpu.Error{Code: int(code), Message: "vector size mismatch"} + default: + return &gpu.Error{Code: int(code), Message: "unknown error"} + } +} diff --git a/prover/gpu/plonk2/bn254/doc.go b/prover/gpu/plonk2/bn254/doc.go new file mode 100644 index 00000000000..388f3f1661f --- /dev/null +++ b/prover/gpu/plonk2/bn254/doc.go @@ -0,0 +1,7 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +// Package bn254 provides GPU-accelerated PlonK operations for the bn254 curve. +// +// Generated from gpu/internal/generator. Do not edit by hand. +// Re-generate with: cd gpu/internal/generator && go run . +package bn254 diff --git a/prover/gpu/plonk2/bn254/fft.go b/prover/gpu/plonk2/bn254/fft.go new file mode 100644 index 00000000000..b93ecb70bcd --- /dev/null +++ b/prover/gpu/plonk2/bn254/fft.go @@ -0,0 +1,211 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "math/big" + "runtime" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain holds GPU-resident twiddle factors for NTT operations over the +// bn254 scalar field. +// +// All NTT operations accept an optional StreamID. When provided, the operation +// is dispatched on that CUDA stream (non-blocking). When omitted, the default +// stream (stream 0) is used. +type GPUFFTDomain struct { + handle C.gnark_gpu_plonk2_ntt_domain_t + dev *gpu.Device + size int +} + +// NewFFTDomain creates a GPU NTT domain of the given size (must be a power of 2). +// +// Twiddle factors are computed using gnark-crypto's fft.Domain, then uploaded +// to GPU in AoS format. This is a one-time cost per domain size. +func NewFFTDomain(dev *gpu.Device, size int) (*GPUFFTDomain, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if size <= 0 || (size&(size-1)) != 0 { + return nil, &gpu.Error{Code: -1, Message: "size must be a positive power of 2"} + } + + domain := fft.NewDomain(uint64(size)) + halfN := size / 2 + + fwdTwiddles := make([]fr.Element, halfN) + invTwiddles := make([]fr.Element, halfN) + if halfN > 0 { + fwdTwiddles[0].SetOne() + invTwiddles[0].SetOne() + for i := 1; i < halfN; i++ { + fwdTwiddles[i].Mul(&fwdTwiddles[i-1], &domain.Generator) + invTwiddles[i].Mul(&invTwiddles[i-1], &domain.GeneratorInv) + } + } + + invN := domain.CardinalityInv + + var fwdPtr, invPtr *C.uint64_t + if halfN > 0 { + fwdPtr = (*C.uint64_t)(unsafe.Pointer(&fwdTwiddles[0])) + invPtr = (*C.uint64_t)(unsafe.Pointer(&invTwiddles[0])) + } + + var handle C.gnark_gpu_plonk2_ntt_domain_t + if err := toError(C.gnark_gpu_plonk2_ntt_domain_create( + devCtx(dev), + curveID(), + C.size_t(size), + fwdPtr, + invPtr, + (*C.uint64_t)(unsafe.Pointer(&invN)), + &handle, + )); err != nil { + return nil, err + } + + dom := &GPUFFTDomain{handle: handle, dev: dev, size: size} + runtime.SetFinalizer(dom, (*GPUFFTDomain).Close) + return dom, nil +} + +// Size returns the domain size. +func (f *GPUFFTDomain) Size() int { return f.size } + +// Close releases GPU resources. Safe to call multiple times. +func (f *GPUFFTDomain) Close() { + if f.handle != nil { + C.gnark_gpu_plonk2_ntt_domain_destroy(f.handle) + f.handle = nil + runtime.SetFinalizer(f, nil) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Forward / Inverse FFT +// ───────────────────────────────────────────────────────────────────────────── + +// FFT performs a forward NTT (DIF): natural-order input → bit-reversed output. +func (f *GPUFFTDomain) FFT(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFT size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_forward_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_forward(f.handle, v.handle)); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } +} + +// FFTInverse performs an inverse NTT (DIT): bit-reversed input → natural-order output. +// The result is scaled by 1/n. +func (f *GPUFFTDomain) FFTInverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFTInverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_inverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_inverse(f.handle, v.handle)); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } +} + +// BitReverse applies the bit-reversal permutation. +func (f *GPUFFTDomain) BitReverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: BitReverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse(f.handle, v.handle)); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Coset FFT +// +// CosetFFT evaluates p(X) on coset g·H = {g·ω^i : i=0..n-1}. +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// ───────────────────────────────────────────────────────────────────────────── + +// CosetFFT evaluates a polynomial in canonical form on coset g·H. +// Input: v holds canonical coefficients in natural order. +// Output: v holds p(g·ω⁰), p(g·ω¹), …, p(g·ωⁿ⁻¹) in natural order. +// +// Implemented as: ScaleByPowers(g) → FFT → BitReverse. +func (f *GPUFFTDomain) CosetFFT(v *FrVector, g fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFT size mismatch") + } + v.ScaleByPowers(g, stream...) + f.FFT(v, stream...) + f.BitReverse(v, stream...) +} + +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// gInv must be the inverse of the coset generator g. +// +// Implemented as: BitReverse → FFTInverse → ScaleByPowers(gInv). +func (f *GPUFFTDomain) CosetFFTInverse(v *FrVector, gInv fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFTInverse size mismatch") + } + f.BitReverse(v, stream...) + f.FFTInverse(v, stream...) + v.ScaleByPowers(gInv, stream...) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Butterfly4Inverse — decomposed iFFT(4n) for quotient computation +// ───────────────────────────────────────────────────────────────────────────── + +// Butterfly4Inverse applies a size-4 inverse DFT butterfly across 4 FrVectors. +// +// omega4Inv: inverse of the primitive 4th root of unity. +// quarter: 1/4 in Montgomery form. +func Butterfly4Inverse(b0, b1, b2, b3 *FrVector, omega4Inv, quarter fr.Element) { + if b0.n != b1.n || b1.n != b2.n || b2.n != b3.n { + panic("gpu: Butterfly4Inverse size mismatch") + } + if b0.dev != b1.dev || b1.dev != b2.dev || b2.dev != b3.dev { + panic("gpu: Butterfly4Inverse device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_butterfly4_inverse( + devCtx(b0.dev), + b0.handle, b1.handle, b2.handle, b3.handle, + (*C.uint64_t)(unsafe.Pointer(&omega4Inv)), + (*C.uint64_t)(unsafe.Pointer(&quarter)), + )); err != nil { + panic("gpu: Butterfly4Inverse failed: " + err.Error()) + } +} + +// ─── suppress unused import ─────────────────────────────────────────────────── +var _ = big.NewInt diff --git a/prover/gpu/plonk2/bn254/fft_stub.go b/prover/gpu/plonk2/bn254/fft_stub.go new file mode 100644 index 00000000000..9b64af8e6bd --- /dev/null +++ b/prover/gpu/plonk2/bn254/fft_stub.go @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bn254 + +import ( + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain is a stub for non-CUDA builds. +type GPUFFTDomain struct{} + +func NewFFTDomain(_ *gpu.Device, _ int) (*GPUFFTDomain, error) { + return nil, gpu.ErrDeviceClosed +} + +func (f *GPUFFTDomain) Size() int { return 0 } +func (f *GPUFFTDomain) Close() {} +func (f *GPUFFTDomain) FFT(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) FFTInverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) BitReverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFT(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFTInverse(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} + +func Butterfly4Inverse(_, _, _, _ *FrVector, _, _ fr.Element) { panic("gpu: cuda required") } diff --git a/prover/gpu/plonk2/bn254/fft_test.go b/prover/gpu/plonk2/bn254/fft_test.go new file mode 100644 index 00000000000..95e8a7d8057 --- /dev/null +++ b/prover/gpu/plonk2/bn254/fft_test.go @@ -0,0 +1,188 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254_test + +import ( + "fmt" + "testing" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bn254" + "github.com/stretchr/testify/require" +) + +func newDomain(t testing.TB, dev *gpu.Device, size int) *bn254.GPUFFTDomain { + t.Helper() + dom, err := bn254.NewFFTDomain(dev, size) + require.NoError(t, err) + t.Cleanup(func() { dom.Close() }) + return dom +} + +// TestFFTRoundtrip verifies FFT(FFTInverse(v)) == v. +func TestFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16, 20} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + gV := newGPUVec(t, dev, orig) + + dom.FFT(gV) + dom.FFTInverse(gV) + dom.BitReverse(gV) // FFTInverse expects bit-reversed input; FFT output is bit-reversed + dev.Sync() + + // Actually test FFTInverse(FFT(v)) == v: + // FFT: natural → bit-reversed + // FFTInverse: bit-reversed → natural (scaled by 1/n) + // So we need FFTInverse after FFT directly. + gV2 := newGPUVec(t, dev, orig) + dom.FFT(gV2) + dom.FFTInverse(gV2) + dev.Sync() + + result := make(fr.Vector, n) + gV2.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "FFTInverse(FFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestCosetFFTRoundtrip verifies CosetFFT(CosetFFTInverse(v)) == v. +func TestCosetFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + var gInv fr.Element + gInv.Inverse(&g) + + gV := newGPUVec(t, dev, orig) + dom.CosetFFT(gV, g) + dom.CosetFFTInverse(gV, gInv) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "CosetFFTInverse(CosetFFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestFFTMatchesCPU verifies GPU FFT output matches gnark-crypto CPU FFT. +func TestFFTMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const logN = 14 + n := 1 << logN + + dom := newDomain(t, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + + orig := randFrVec(n) + cpuCopy := make(fr.Vector, n) + copy(cpuCopy, orig) + + // CPU FFT + cpuDom.FFT(cpuCopy, fft.DIF) + fft.BitReverse(cpuCopy) + + // GPU FFT (DIF: natural → bit-reversed, then BitReverse → natural) + gV := newGPUVec(t, dev, orig) + dom.FFT(gV) // natural → bit-reversed + dom.BitReverse(gV) // bit-reversed → natural + dev.Sync() + + gpuResult := make(fr.Vector, n) + gV.CopyToHost(gpuResult) + + for i := range cpuCopy { + require.True(t, cpuCopy[i].Equal(&gpuResult[i]), + "FFT mismatch at i=%d", i) + } +} + +// BenchmarkFFTForward benchmarks GPU forward NTT. +func BenchmarkFFTForward(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFT(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkFFTInverse benchmarks GPU inverse NTT. +func BenchmarkFFTInverse(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + dom.FFT(gV) // put into bit-reversed form first + dev.Sync() + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFTInverse(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkCosetFFT benchmarks GPU coset FFT. +func BenchmarkCosetFFT(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Re-upload canonical coefficients before each run + gV.CopyFromHost(src) + dom.CosetFFT(gV, g) + dev.Sync() + } + }) + } +} diff --git a/prover/gpu/plonk2/bn254/fr.go b/prover/gpu/plonk2/bn254/fr.go new file mode 100644 index 00000000000..e338052fbb1 --- /dev/null +++ b/prover/gpu/plonk2/bn254/fr.go @@ -0,0 +1,270 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "runtime" + "sync" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector holds a vector of bn254 scalar-field (Fr) elements on the GPU +// in Structure-of-Arrays (SoA) layout for coalesced memory access. +// +// All elements are in Montgomery form. GPU memory is SoA by limb; host memory +// uses gnark-crypto AoS Montgomery layout. +// +// All operations accept an optional gpu.StreamID. When omitted, the default +// stream (stream 0) is used. +type FrVector struct { + handle C.gnark_gpu_plonk2_fr_vector_t + dev *gpu.Device + n int +} + +var hostTransferMu sync.Mutex + +// NewFrVector allocates GPU memory for n Fr elements on dev. +// A finalizer is installed; call Free for deterministic VRAM release. +func NewFrVector(dev *gpu.Device, n int) (*FrVector, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if n <= 0 { + return nil, &gpu.Error{Code: -1, Message: "count must be positive"} + } + + var handle C.gnark_gpu_plonk2_fr_vector_t + if err := toError(C.gnark_gpu_plonk2_fr_vector_alloc( + devCtx(dev), curveID(), C.size_t(n), &handle, + )); err != nil { + return nil, err + } + + v := &FrVector{handle: handle, dev: dev, n: n} + runtime.SetFinalizer(v, (*FrVector).Free) + return v, nil +} + +// Free releases GPU memory. Safe to call multiple times. +func (v *FrVector) Free() { + if v.handle != nil { + v.bind() + C.gnark_gpu_plonk2_fr_vector_free(v.handle) + v.handle = nil + runtime.SetFinalizer(v, nil) + } +} + +// Len returns the number of elements. +func (v *FrVector) Len() int { return v.n } + +func (v *FrVector) bind() { + if err := v.dev.Bind(); err != nil { + panic("gpu: bind device failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Host ↔ Device transfers +// ───────────────────────────────────────────────────────────────────────────── + +// CopyFromHost copies host data (AoS) to GPU (SoA). Panics on size mismatch. +func (v *FrVector) CopyFromHost(src fr.Vector, _ ...gpu.StreamID) { + if len(src) != v.n { + panic("gpu: CopyFromHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_device( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&src[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyFromHost failed: " + err.Error()) + } +} + +// CopyToHost copies GPU data (SoA) back to host (AoS). Panics on size mismatch. +func (v *FrVector) CopyToHost(dst fr.Vector, _ ...gpu.StreamID) { + if len(dst) != v.n { + panic("gpu: CopyToHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_host( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&dst[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyToHost failed: " + err.Error()) + } +} + +// CopyFromDevice copies src to v (GPU-to-GPU). Panics on size or device mismatch. +func (v *FrVector) CopyFromDevice(src *FrVector, _ ...gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDevice size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDevice device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d( + devCtx(v.dev), v.handle, src.handle, + )); err != nil { + panic("gpu: CopyFromDevice failed: " + err.Error()) + } +} + +// CopyFromDeviceStream copies src to v (GPU-to-GPU) on a specific stream. +// Panics on size or device mismatch. +func (v *FrVector) CopyFromDeviceStream(src *FrVector, streamID gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDeviceStream size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDeviceStream device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d_stream( + devCtx(v.dev), v.handle, src.handle, C.int(streamID), + )); err != nil { + panic("gpu: CopyFromDeviceStream failed: " + err.Error()) + } +} + +// SetZero sets all elements to zero. +func (v *FrVector) SetZero(_ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_set_zero( + devCtx(v.dev), v.handle, + )); err != nil { + panic("gpu: SetZero failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Element-wise arithmetic (all async on the default stream) +// ───────────────────────────────────────────────────────────────────────────── + +func mustSameDeviceAndSize(v, a, b *FrVector) { + if v.n != a.n || a.n != b.n { + panic("gpu: vector size mismatch") + } + if v.dev != a.dev || a.dev != b.dev { + panic("gpu: vectors from different devices") + } +} + +// Mul computes v[i] = a[i] · b[i] (mod r). +func (v *FrVector) Mul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_mul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Mul failed: " + err.Error()) + } +} + +// Add computes v[i] = a[i] + b[i] (mod r). +func (v *FrVector) Add(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_add( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Add failed: " + err.Error()) + } +} + +// Sub computes v[i] = a[i] - b[i] (mod r). +func (v *FrVector) Sub(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_sub( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Sub failed: " + err.Error()) + } +} + +// AddMul computes v[i] += a[i] · b[i] (mod r). +func (v *FrVector) AddMul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_addmul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: AddMul failed: " + err.Error()) + } +} + +// AddScalarMul computes v[i] += a[i] · scalar (mod r). +func (v *FrVector) AddScalarMul(a *FrVector, scalar fr.Element, _ ...gpu.StreamID) { + if v.n != a.n { + panic("gpu: AddScalarMul size mismatch") + } + if v.dev != a.dev { + panic("gpu: AddScalarMul device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_add_scalar_mul( + devCtx(v.dev), v.handle, a.handle, + (*C.uint64_t)(unsafe.Pointer(&scalar)), + )); err != nil { + panic("gpu: AddScalarMul failed: " + err.Error()) + } +} + +// ScalarMul computes v[i] *= c (mod r) for all i. +func (v *FrVector) ScalarMul(c fr.Element, _ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scalar_mul( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&c)), + )); err != nil { + panic("gpu: ScalarMul failed: " + err.Error()) + } +} + +// ScaleByPowers computes v[i] *= g^i for i in [0, n). +// Used for coset FFT shifting. +func (v *FrVector) ScaleByPowers(g fr.Element, streams ...gpu.StreamID) { + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers_stream( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + C.int(streams[0]), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } +} + +// BatchInvert computes v[i] = 1/v[i] using Montgomery batch inversion. +// temp must be a separate FrVector of the same size used as scratch space. +func (v *FrVector) BatchInvert(temp *FrVector, _ ...gpu.StreamID) { + if v.n != temp.n { + panic("gpu: BatchInvert size mismatch") + } + if v.dev != temp.dev { + panic("gpu: BatchInvert device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_batch_invert( + devCtx(v.dev), v.handle, temp.handle, + )); err != nil { + panic("gpu: BatchInvert failed: " + err.Error()) + } +} diff --git a/prover/gpu/plonk2/bn254/fr_stub.go b/prover/gpu/plonk2/bn254/fr_stub.go new file mode 100644 index 00000000000..993094f1333 --- /dev/null +++ b/prover/gpu/plonk2/bn254/fr_stub.go @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bn254 + +import ( + "errors" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector is a stub for non-CUDA builds. +type FrVector struct{} + +func NewFrVector(_ *gpu.Device, _ int) (*FrVector, error) { + return nil, errors.New("gpu: cuda required") +} + +func (v *FrVector) Free() {} +func (v *FrVector) Len() int { return 0 } +func (v *FrVector) CopyFromHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyToHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDevice(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDeviceStream(_ *FrVector, _ gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) SetZero(_ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Mul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Add(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Sub(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddMul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddScalarMul(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (v *FrVector) ScalarMul(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) ScaleByPowers(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) BatchInvert(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } diff --git a/prover/gpu/plonk2/bn254/fr_test.go b/prover/gpu/plonk2/bn254/fr_test.go new file mode 100644 index 00000000000..0498e8c78ba --- /dev/null +++ b/prover/gpu/plonk2/bn254/fr_test.go @@ -0,0 +1,275 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254_test + +import ( + "fmt" + "testing" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bn254" + "github.com/leanovate/gopter" + "github.com/leanovate/gopter/prop" + "github.com/stretchr/testify/require" +) + +func requireGPUDev(t testing.TB) *gpu.Device { + t.Helper() + dev, err := gpu.New() + require.NoError(t, err) + t.Cleanup(func() { dev.Close() }) + return dev +} + +func genFrElem() gopter.Gen { + return func(_ *gopter.GenParameters) *gopter.GenResult { + var e fr.Element + e.MustSetRandom() + return gopter.NewGenResult(e, gopter.NoShrinker) + } +} + +func randFrVec(n int) fr.Vector { + v := make(fr.Vector, n) + for i := range v { + v[i].MustSetRandom() + } + return v +} + +func newGPUVec(t testing.TB, dev *gpu.Device, data fr.Vector) *bn254.FrVector { + t.Helper() + gv, err := bn254.NewFrVector(dev, len(data)) + require.NoError(t, err) + t.Cleanup(func() { gv.Free() }) + gv.CopyFromHost(data) + dev.Sync() + return gv +} + +// TestFrVectorRoundtrip verifies CopyFromHost → CopyToHost is identity. +func TestFrVectorRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + const n = 1024 + src := randFrVec(n) + gv := newGPUVec(t, dev, src) + dst := make(fr.Vector, n) + gv.CopyToHost(dst) + for i := range src { + require.True(t, src[i].Equal(&dst[i]), "mismatch at %d", i) + } +} + +// TestFrVectorAddCommutative checks GPU Add(a,b) == GPU Add(b,a). +func TestFrVectorAddCommutative(t *testing.T) { + dev := requireGPUDev(t) + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 50 + properties := gopter.NewProperties(parameters) + + properties.Property("Add is commutative", prop.ForAll( + func(a, b fr.Element) bool { + n := 16 + aVec := make(fr.Vector, n) + bVec := make(fr.Vector, n) + for i := range aVec { + aVec[i] = a + bVec[i] = b + } + + gA, err := bn254.NewFrVector(dev, n) + if err != nil { + return false + } + gB, _ := bn254.NewFrVector(dev, n) + gAB, _ := bn254.NewFrVector(dev, n) + gBA, _ := bn254.NewFrVector(dev, n) + defer gA.Free() + defer gB.Free() + defer gAB.Free() + defer gBA.Free() + + gA.CopyFromHost(aVec) + gB.CopyFromHost(bVec) + gAB.Add(gA, gB) + gBA.Add(gB, gA) + dev.Sync() + + ab := make(fr.Vector, n) + ba := make(fr.Vector, n) + gAB.CopyToHost(ab) + gBA.CopyToHost(ba) + for i := range ab { + if !ab[i].Equal(&ba[i]) { + return false + } + } + return true + }, + genFrElem(), genFrElem(), + )) + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + +// TestFrVectorBatchInvert verifies v[i] * inv(v[i]) == 1. +func TestFrVectorBatchInvert(t *testing.T) { + dev := requireGPUDev(t) + const n = 256 + + orig := make(fr.Vector, n) + for i := range orig { + orig[i].MustSetRandom() + if orig[i].IsZero() { + orig[i].SetOne() + } + } + + gV := newGPUVec(t, dev, orig) + gTemp, err := bn254.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + inv := make(fr.Vector, n) + gV.CopyToHost(inv) + + var one fr.Element + one.SetOne() + for i := range orig { + var product fr.Element + product.Mul(&orig[i], &inv[i]) + require.True(t, product.Equal(&one), "BatchInvert: v[%d]*inv[%d] != 1", i, i) + } +} + +// TestFrVectorScaleByPowers checks GPU ScaleByPowers matches CPU loop. +func TestFrVectorScaleByPowers(t *testing.T) { + dev := requireGPUDev(t) + const n = 512 + + var omega fr.Element + omega.MustSetRandom() + + ones := make(fr.Vector, n) + for i := range ones { + ones[i].SetOne() + } + + gV := newGPUVec(t, dev, ones) + gV.ScaleByPowers(omega) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + + expected := make(fr.Vector, n) + expected[0].SetOne() + for i := 1; i < n; i++ { + expected[i].Mul(&expected[i-1], &omega) + } + + for i := range result { + require.True(t, result[i].Equal(&expected[i]), "ScaleByPowers mismatch at %d", i) + } +} + +// TestFrVectorBatchInvertMatchesCPU verifies BatchInvert matches scalar CPU inversion. +func TestFrVectorBatchInvertMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const n = 128 + + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + + cpuInv := make(fr.Vector, n) + for i := range src { + cpuInv[i].Inverse(&src[i]) + } + + gV := newGPUVec(t, dev, src) + gTemp, err := bn254.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + gpuInv := make(fr.Vector, n) + gV.CopyToHost(gpuInv) + + for i := range cpuInv { + require.True(t, cpuInv[i].Equal(&gpuInv[i]), + "BatchInvert mismatch at %d", i) + } +} + +// BenchmarkFrVectorAdd benchmarks GPU element-wise addition. +func BenchmarkFrVectorAdd(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20, 1 << 22} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + gA := newGPUVec(b, dev, src) + gB := newGPUVec(b, dev, src) + gC, _ := bn254.NewFrVector(dev, n) + defer gC.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gC.Add(gA, gB) + dev.Sync() + } + }) + } +} + +// BenchmarkFrVectorBatchInvert benchmarks GPU batch inversion. +func BenchmarkFrVectorBatchInvert(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + gV := newGPUVec(b, dev, src) + gTemp, _ := bn254.NewFrVector(dev, n) + defer gTemp.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gV.CopyFromHost(src) + gV.BatchInvert(gTemp) + dev.Sync() + } + }) + } +} + +func fmtPow2(n int) string { + switch { + case n >= 1<<20: + return fmt.Sprintf("%dM", n>>20) + case n >= 1<<10: + return fmt.Sprintf("%dK", n>>10) + default: + return fmt.Sprintf("%d", n) + } +} diff --git a/prover/gpu/plonk2/bn254/kernels.go b/prover/gpu/plonk2/bn254/kernels.go new file mode 100644 index 00000000000..9af9b279a0e --- /dev/null +++ b/prover/gpu/plonk2/bn254/kernels.go @@ -0,0 +1,316 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "math/big" + "runtime" + "sync" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// ZPrefixProduct computes Z[i] = product(ratio[0..i-1]) on GPU with CPU chunk scan. +func ZPrefixProduct(dev *gpu.Device, zVec, ratioVec, tempVec *FrVector) { + if zVec.n != ratioVec.n || zVec.n != tempVec.n { + panic("gpu: ZPrefixProduct size mismatch") + } + n := ratioVec.n + maxChunks := (n + 1023) / 1024 + cpHost := make([]uint64, maxChunks*4) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase1( + devCtx(dev), zVec.handle, ratioVec.handle, + (*C.uint64_t)(unsafe.Pointer(&cpHost[0])), &numChunks, + )); err != nil { + panic("gpu: ZPrefixProduct phase1 failed: " + err.Error()) + } + + nc := int(numChunks) + spHost := make([]uint64, nc*4) + copy(spHost[:4], cpHost[:4]) + for i := 1; i < nc; i++ { + prev := *(*fr.Element)(unsafe.Pointer(&spHost[(i-1)*4])) + cur := *(*fr.Element)(unsafe.Pointer(&cpHost[i*4])) + var prod fr.Element + prod.Mul(&prev, &cur) + *(*fr.Element)(unsafe.Pointer(&spHost[i*4])) = prod + } + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase3( + devCtx(dev), zVec.handle, tempVec.handle, + (*C.uint64_t)(unsafe.Pointer(&spHost[0])), C.size_t(nc), + )); err != nil { + panic("gpu: ZPrefixProduct phase3 failed: " + err.Error()) + } +} + +// PlonkZComputeFactors computes per-element Z ratio factors on GPU. +// On exit L contains numerators, R contains denominators. +func PlonkZComputeFactors( + L, R, O *FrVector, dPerm unsafe.Pointer, + beta, gamma, gMul, gSq fr.Element, + log2n uint, domain *GPUFFTDomain, +) { + n := L.n + if R.n != n || O.n != n || domain.size != n { + panic("gpu: PlonkZComputeFactors size mismatch") + } + params := [4]fr.Element{beta, gamma, gMul, gSq} + if err := toError(C.gnark_gpu_plonk2_z_compute_factors( + devCtx(L.dev), L.handle, R.handle, O.handle, + dPerm, (*C.uint64_t)(unsafe.Pointer(¶ms[0])), + C.uint(log2n), domain.handle, + )); err != nil { + panic("gpu: PlonkZComputeFactors failed: " + err.Error()) + } +} + +// PlonkGateAccum computes the fused gate constraint accumulation. +func PlonkGateAccum(result, Ql, Qr, Qm, Qo, Qk, L, R, O *FrVector, zhKInv fr.Element) { + n := result.n + if Ql.n != n || Qr.n != n || Qm.n != n || Qo.n != n || Qk.n != n || + L.n != n || R.n != n || O.n != n { + panic("gpu: PlonkGateAccum size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_gate_accum( + devCtx(result.dev), + result.handle, Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + L.handle, R.handle, O.handle, + (*C.uint64_t)(unsafe.Pointer(&zhKInv)), + )); err != nil { + panic("gpu: PlonkGateAccum failed: " + err.Error()) + } +} + +// PlonkLinearizeStatic computes the fixed-selector part of the linearized polynomial. +func PlonkLinearizeStatic( + result, Z, S3, Ql, Qr, Qm, Qo, Qk *FrVector, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta fr.Element, +) { + n := result.n + if Z.n != n || S3.n != n || Ql.n != n || Qr.n != n || Qm.n != n || + Qo.n != n || Qk.n != n { + panic("gpu: PlonkLinearizeStatic size mismatch") + } + scalars := [6]fr.Element{combinedZCoeff, s1, lZeta, rZeta, rl, oZeta} + if err := toError(C.gnark_gpu_plonk2_linearize_static( + devCtx(result.dev), + result.handle, Z.handle, S3.handle, + Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + (*C.uint64_t)(unsafe.Pointer(&scalars[0])), + )); err != nil { + panic("gpu: PlonkLinearizeStatic failed: " + err.Error()) + } +} + +// PlonkPermBoundary computes the fused permutation + boundary constraint. +func PlonkPermBoundary( + result, L, R, O, Z, S1, S2, S3, L1DenInv *FrVector, + alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen fr.Element, + domain *GPUFFTDomain, _ ...gpu.StreamID, +) { + n := result.n + if L.n != n || R.n != n || O.n != n || Z.n != n || + S1.n != n || S2.n != n || S3.n != n || L1DenInv.n != n || domain.size != n { + panic("gpu: PlonkPermBoundary size mismatch") + } + params := [7]fr.Element{alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen} + if err := toError(C.gnark_gpu_plonk2_perm_boundary( + devCtx(result.dev), + result.handle, L.handle, R.handle, O.handle, Z.handle, + S1.handle, S2.handle, S3.handle, L1DenInv.handle, + (*C.uint64_t)(unsafe.Pointer(¶ms[0])), domain.handle, + )); err != nil { + panic("gpu: PlonkPermBoundary failed: " + err.Error()) + } +} + +// ComputeL1Den computes out[i] = cosetGen·ω^i - 1 for all i. +func ComputeL1Den(out *FrVector, cosetGen fr.Element, domain *GPUFFTDomain, _ ...gpu.StreamID) { + if domain.size != out.n { + panic("gpu: ComputeL1Den domain size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_compute_l1_den( + domain.handle, out.handle, + (*C.uint64_t)(unsafe.Pointer(&cosetGen)), + )); err != nil { + panic("gpu: ComputeL1Den failed: " + err.Error()) + } +} + +// ReduceBlindedCoset reduces a blinded polynomial for coset evaluation on GPU. +func ReduceBlindedCoset(dst, src *FrVector, tail []fr.Element, cosetPowN fr.Element) { + if dst.n != src.n { + panic("gpu: ReduceBlindedCoset size mismatch") + } + var tailPtr *C.uint64_t + if len(tail) > 0 { + tailPtr = (*C.uint64_t)(unsafe.Pointer(&tail[0])) + } + if err := toError(C.gnark_gpu_plonk2_reduce_blinded_coset( + devCtx(dst.dev), dst.handle, src.handle, + tailPtr, C.size_t(len(tail)), + (*C.uint64_t)(unsafe.Pointer(&cosetPowN)), + )); err != nil { + panic("gpu: ReduceBlindedCoset failed: " + err.Error()) + } +} + +// SubtractBlindingHead subtracts tail[i] from v[i] for the blinding tail. +func SubtractBlindingHead(v *FrVector, tail []fr.Element) { + if len(tail) == 0 { + return + } + if len(tail) > v.n { + panic("gpu: SubtractBlindingHead size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_subtract_head( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&tail[0])), + C.size_t(len(tail)), + )); err != nil { + panic("gpu: SubtractBlindingHead failed: " + err.Error()) + } +} + +// DeviceAllocCopyInt64 uploads an int64 slice to GPU device memory. +func DeviceAllocCopyInt64(dev *gpu.Device, data []int64) (unsafe.Pointer, error) { + var dPtr unsafe.Pointer + if err := toError(C.gnark_gpu_device_alloc_copy_int64( + devCtx(dev), + (*C.int64_t)(unsafe.Pointer(&data[0])), + C.size_t(len(data)), + &dPtr, + )); err != nil { + return nil, err + } + return dPtr, nil +} + +// DeviceFreePtr frees device memory allocated by DeviceAllocCopyInt64. +func DeviceFreePtr(ptr unsafe.Pointer) { + if ptr != nil { + C.gnark_gpu_device_free_ptr(ptr) + } +} + +// PolyEvalGPU evaluates a GPU-resident polynomial at z using chunked Horner on +// device and a small CPU combine over chunk partials. +func PolyEvalGPU(dev *gpu.Device, v *FrVector, z fr.Element) fr.Element { + n := v.n + if n == 0 { + return fr.Element{} + } + + maxChunks := (n + 1023) / 1024 + partialsHost := make([]uint64, maxChunks*4) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_poly_eval_chunks( + devCtx(dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&z)), + (*C.uint64_t)(unsafe.Pointer(&partialsHost[0])), + &numChunks, + )); err != nil { + panic("gpu: PolyEvalGPU failed: " + err.Error()) + } + + return combinePolyEvalPartials(partialsHost, int(numChunks), z) +} + +// PolyEvalFromDevice downloads a GPU FrVector and evaluates at z using CPU Horner. +func PolyEvalFromDevice(v *FrVector, z fr.Element) fr.Element { + n := v.n + coeffs := make(fr.Vector, n) + v.CopyToHost(coeffs) + return polyEvalParallel(coeffs, z) +} + +func combinePolyEvalPartials(partialsHost []uint64, numChunks int, z fr.Element) fr.Element { + if numChunks == 0 { + return fr.Element{} + } + readPartial := func(chunk int) fr.Element { + var r fr.Element + for limb := range r { + r[limb] = partialsHost[chunk*4+limb] + } + return r + } + if numChunks == 1 { + return readPartial(0) + } + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(1024)) + result := readPartial(numChunks - 1) + for j := numChunks - 2; j >= 0; j-- { + p := readPartial(j) + result.Mul(&result, &zChunk).Add(&result, &p) + } + return result +} + +// polyEvalParallel evaluates p(z) = Σ c[i]·z^i using multi-core Horner. +func polyEvalParallel(coeffs []fr.Element, z fr.Element) fr.Element { + n := len(coeffs) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + return hornerEval(coeffs, z) + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + partials := make([]fr.Element, numChunks) + var wg sync.WaitGroup + for c := range numChunks { + start := c * chunkSize + if start >= n { + break + } + end := start + chunkSize + if end > n { + end = n + } + wg.Add(1) + go func(idx, s, e int) { + defer wg.Done() + partials[idx] = hornerEval(coeffs[s:e], z) + }(c, start, end) + } + wg.Wait() + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(int64(chunkSize))) + var result, zPow fr.Element + zPow.SetOne() + for c := range numChunks { + if c*chunkSize >= n { + break + } + var t fr.Element + t.Mul(&partials[c], &zPow) + result.Add(&result, &t) + zPow.Mul(&zPow, &zChunk) + } + return result +} + +func hornerEval(coeffs []fr.Element, z fr.Element) fr.Element { + var r fr.Element + for i := len(coeffs) - 1; i >= 0; i-- { + r.Mul(&r, &z).Add(&r, &coeffs[i]) + } + return r +} diff --git a/prover/gpu/plonk2/bn254/kernels_stub.go b/prover/gpu/plonk2/bn254/kernels_stub.go new file mode 100644 index 00000000000..2ecd53bce30 --- /dev/null +++ b/prover/gpu/plonk2/bn254/kernels_stub.go @@ -0,0 +1,36 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bn254 + +import ( + "errors" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +func ZPrefixProduct(_ *gpu.Device, _, _, _ *FrVector) { panic("gpu: cuda required") } +func PlonkZComputeFactors(_, _, _ *FrVector, _ unsafe.Pointer, _, _, _, _ fr.Element, _ uint, _ *GPUFFTDomain) { + panic("gpu: cuda required") +} +func PlonkGateAccum(_, _, _, _, _, _, _, _, _ *FrVector, _ fr.Element) { panic("gpu: cuda required") } +func PlonkPermBoundary(_, _, _, _, _, _, _, _, _ *FrVector, _, _, _, _, _, _, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ComputeL1Den(_ *FrVector, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ReduceBlindedCoset(_, _ *FrVector, _ []fr.Element, _ fr.Element) { panic("gpu: cuda required") } +func DeviceAllocCopyInt64(_ *gpu.Device, _ []int64) (unsafe.Pointer, error) { + return nil, errors.New("gpu: cuda required") +} +func DeviceFreePtr(_ unsafe.Pointer) {} +func PolyEvalGPU(_ *gpu.Device, _ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} +func PolyEvalFromDevice(_ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bn254/msm.go b/prover/gpu/plonk2/bn254/msm.go new file mode 100644 index 00000000000..7b32693cf22 --- /dev/null +++ b/prover/gpu/plonk2/bn254/msm.go @@ -0,0 +1,390 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "fmt" + "log" + "math/big" + "os" + "runtime" + "strconv" + "unsafe" + + curve "github.com/consensys/gnark-crypto/ecc/bn254" + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// frRInv is R^{-1} mod r where R = 2^{FrLimbs*64} (the Fr Montgomery constant). +// The GPU MSM uses Montgomery-form scalars without fr_from_mont, so the result +// is R * correct_result. Multiplying by frRInv corrects this. +var frRInv big.Int + +func init() { + var rInv fr.Element + rInv[0] = 1 // Montgomery representation of R^{-1}: stores R^{-1} mod r + rInv.BigInt(&frRInv) +} + +// msmDefaultWindowBits selects the Pippenger window size for n points. +func msmDefaultWindowBits(n int) int { + switch { + case n > 1<<26: + return 19 + case n > 1<<22: + return 17 + case n > 1<<18: + return 15 + case n > 1<<12: + return 13 + default: + return 11 + } +} + +// G1MSM holds a GPU MSM context with uploaded affine base points. +// +// Points are uploaded once at construction. The context supports multiple +// MultiExp calls sharing the same base points. +type G1MSM struct { + handle C.gnark_gpu_plonk2_msm_t + dev *gpu.Device + n int + windowBits int + hostPoints []curve.G1Affine + hostPointsPtr unsafe.Pointer + lastBatchPhaseTimings [][9]float32 +} + +// NewG1MSM creates a G1MSM context by uploading affine points to the GPU. +// window_bits=0 selects a default based on point count. +func NewG1MSM(dev *gpu.Device, points []curve.G1Affine, windowBits int) (*G1MSM, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if err := dev.Bind(); err != nil { + return nil, err + } + n := len(points) + if n == 0 { + return nil, &gpu.Error{Code: -1, Message: "points must not be empty"} + } + if windowBits == 0 { + windowBits = msmDefaultWindowBits(n) + } + if override := os.Getenv("GNARK_GPU_PLONK2_MSM_WINDOW_BITS"); override != "" { + parsed, err := strconv.Atoi(override) + if err != nil { + return nil, fmt.Errorf("gpu: invalid GNARK_GPU_PLONK2_MSM_WINDOW_BITS %q: %w", override, err) + } + windowBits = parsed + } + if windowBits < 2 || windowBits > 24 { + return nil, fmt.Errorf("gpu: window bits must be in [2,24], got %d", windowBits) + } + + hostPoints := points + var hostPointsPtr unsafe.Pointer + if os.Getenv("GNARK_GPU_DISABLE_PINNED_MSM_POINTS") == "" { + nbytes := C.size_t(n) * C.size_t(unsafe.Sizeof(curve.G1Affine{})) + if err := toError(C.gnark_gpu_alloc_pinned(&hostPointsPtr, nbytes)); err == nil { + hostPoints = unsafe.Slice((*curve.G1Affine)(hostPointsPtr), n) + copy(hostPoints, points) + } else { + log.Printf("gpu: pinned MSM points unavailable (%v), using heap", err) + hostPointsPtr = nil + } + } + + var handle C.gnark_gpu_plonk2_msm_t + if err := toError(C.gnark_gpu_plonk2_msm_create( + devCtx(dev), + curveID(), + (*C.uint64_t)(unsafe.Pointer(&hostPoints[0])), + C.size_t(n), + C.int(windowBits), + &handle, + )); err != nil { + if hostPointsPtr != nil { + C.gnark_gpu_free_pinned(hostPointsPtr) + } + return nil, err + } + + m := &G1MSM{ + handle: handle, + dev: dev, + n: n, + windowBits: windowBits, + hostPoints: hostPoints, + hostPointsPtr: hostPointsPtr, + } + runtime.SetFinalizer(m, (*G1MSM).Close) + return m, nil +} + +// Close releases GPU resources. Safe to call multiple times. +func (m *G1MSM) Close() { + if m.handle != nil { + C.gnark_gpu_plonk2_msm_destroy(m.handle) + m.handle = nil + if m.hostPointsPtr != nil { + C.gnark_gpu_free_pinned(m.hostPointsPtr) + m.hostPointsPtr = nil + } + m.hostPoints = nil + runtime.SetFinalizer(m, nil) + } +} + +// Len returns the number of base points. +func (m *G1MSM) Len() int { return m.n } + +// PinWorkBuffers keeps MSM scratch buffers resident across MultiExp calls, +// amortizing cudaMalloc/Free overhead over a wave of MSMs. +func (m *G1MSM) PinWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_pin_work_buffers(m.handle)) +} + +// ReleaseWorkBuffers frees pinned scratch buffers. Subsequent MultiExp calls +// re-allocate lazily. +func (m *G1MSM) ReleaseWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_release_work_buffers(m.handle)) +} + +// OffloadPoints frees the GPU-resident base points. Call ReloadPoints before +// the next MultiExp. +func (m *G1MSM) OffloadPoints() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_offload_points(m.handle)) +} + +// ReloadPoints uploads the retained host base points after OffloadPoints. +func (m *G1MSM) ReloadPoints() error { + if len(m.hostPoints) < m.n { + return fmt.Errorf("gpu: MSM host points unavailable") + } + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_reload_points( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&m.hostPoints[0])), + C.size_t(m.n), + )) +} + +// MultiExp computes Q[i] = Σⱼ scalars[i][j] · P[j] for each scalar set. +// Each scalars[i] must have length ≤ m.Len(). +// Returns Jacobian results. +func (m *G1MSM) MultiExp(scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if err := m.dev.Bind(); err != nil { + return nil, err + } + k := len(scalars) + if k == 0 { + return nil, nil + } + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: MSM scalar set %d is empty", i) + } + if len(s) > m.n { + return nil, fmt.Errorf("gpu: MSM scalar set %d has %d elements, exceeds %d points", i, len(s), m.n) + } + } + + results := make([]curve.G1Jac, k) + m.lastBatchPhaseTimings = make([][9]float32, k) + for i, s := range scalars { + if err := toError(C.gnark_gpu_plonk2_msm_run( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&s[0])), + C.size_t(len(s)), + (*C.uint64_t)(unsafe.Pointer(&results[i])), + )); err != nil { + return nil, fmt.Errorf("gpu: MSM set %d failed: %w", i, err) + } + m.lastBatchPhaseTimings[i] = m.LastPhaseTimings() + // Montgomery correction: GPU skips fr_from_mont on scalars, so result = R * correct. + results[i].ScalarMultiplication(&results[i], &frRInv) + } + return results, nil +} + +// LastPhaseTimings returns per-phase timings (ms) from the most recent MultiExp call. +func (m *G1MSM) LastPhaseTimings() [9]float32 { + var out [9]C.float + C.gnark_gpu_plonk2_msm_get_phase_timings(m.handle, (*C.float)(unsafe.Pointer(&out[0]))) + var result [9]float32 + for i := range result { + result[i] = float32(out[i]) + } + return result +} + +// LastBatchPhaseTimings returns per-set MSM phase timings from the most recent +// MultiExp call. +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { + if len(m.lastBatchPhaseTimings) == 0 { + return nil + } + out := make([][9]float32, len(m.lastBatchPhaseTimings)) + copy(out, m.lastBatchPhaseTimings) + return out +} + +// MultiExpSplit runs the MSM split across 2 devices for ~2x speedup. +// msm0 must hold points[:n/2] and msm1 must hold points[n/2:]. +// This is an advanced API; use MultiExp for single-GPU operation. +func MultiExpSplit(msm0, msm1 *G1MSM, scalars []fr.Element) (curve.G1Jac, error) { + return MultiExpSplitAt(msm0, msm1, len(scalars)/2, scalars) +} + +// MultiExpSplitAt runs one MSM split across 2 devices at a fixed scalar index. +// msm0 must hold points[:split], and msm1 must hold points[split:]. +func MultiExpSplitAt(msm0, msm1 *G1MSM, split int, scalars []fr.Element) (curve.G1Jac, error) { + if msm0 == nil || msm1 == nil || len(scalars) == 0 { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: nil MSM or empty scalars") + } + n := len(scalars) + if split <= 0 || split >= n { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: invalid split %d for %d scalars", split, n) + } + if split > msm0.Len() || n-split > msm1.Len() { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: split exceeds MSM point capacity") + } + + type result struct { + jac curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(scalars[:split]) + if err != nil { + ch0 <- result{err: err} + return + } + ch0 <- result{jac: jacs[0]} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(scalars[split:]) + if err != nil { + ch1 <- result{err: err} + return + } + ch1 <- result{jac: jacs[0]} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return curve.G1Jac{}, r0.err + } + if r1.err != nil { + return curve.G1Jac{}, r1.err + } + r0.jac.AddAssign(&r1.jac) + return r0.jac, nil +} + +// MultiExpSplitBatchAt runs several MSMs split across 2 devices. Each device +// executes its half-batch sequentially on its own stream, and the host combines +// matching partials. +func MultiExpSplitBatchAt(msm0, msm1 *G1MSM, split int, scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if len(scalars) == 0 { + return nil, nil + } + first := make([][]fr.Element, len(scalars)) + second := make([][]fr.Element, len(scalars)) + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: split MSM scalar set %d is empty", i) + } + if split <= 0 || split >= len(s) { + return nil, fmt.Errorf("gpu: split MSM scalar set %d has invalid split %d for %d scalars", i, split, len(s)) + } + if split > msm0.Len() || len(s)-split > msm1.Len() { + return nil, fmt.Errorf("gpu: split MSM scalar set %d exceeds MSM point capacity", i) + } + first[i] = s[:split] + second[i] = s[split:] + } + + type result struct { + jacs []curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(first...) + ch0 <- result{jacs: jacs, err: err} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(second...) + ch1 <- result{jacs: jacs, err: err} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return nil, r0.err + } + if r1.err != nil { + return nil, r1.err + } + if len(r0.jacs) != len(scalars) || len(r1.jacs) != len(scalars) { + return nil, fmt.Errorf("gpu: split MSM result length mismatch") + } + for i := range r0.jacs { + r0.jacs[i].AddAssign(&r1.jacs[i]) + } + return r0.jacs, nil +} diff --git a/prover/gpu/plonk2/bn254/msm_stub.go b/prover/gpu/plonk2/bn254/msm_stub.go new file mode 100644 index 00000000000..0163e41461f --- /dev/null +++ b/prover/gpu/plonk2/bn254/msm_stub.go @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bn254 + +import ( + "errors" + + curve "github.com/consensys/gnark-crypto/ecc/bn254" + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// G1MSM is a stub for non-CUDA builds. +type G1MSM struct{} + +func NewG1MSM(_ *gpu.Device, _ []curve.G1Affine, _ int) (*G1MSM, error) { + return nil, errors.New("gpu: cuda required") +} + +func (m *G1MSM) Close() {} +func (m *G1MSM) Len() int { return 0 } +func (m *G1MSM) PinWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) ReleaseWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) MultiExp(_ ...[]fr.Element) ([]curve.G1Jac, error) { + return nil, errors.New("gpu: cuda required") +} +func (m *G1MSM) LastPhaseTimings() [9]float32 { return [9]float32{} } +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { return nil } + +func MultiExpSplit(_, _ *G1MSM, _ []fr.Element) (curve.G1Jac, error) { + return curve.G1Jac{}, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bn254/msm_test.go b/prover/gpu/plonk2/bn254/msm_test.go new file mode 100644 index 00000000000..ec2fe37a1c2 --- /dev/null +++ b/prover/gpu/plonk2/bn254/msm_test.go @@ -0,0 +1,139 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254_test + +import ( + "fmt" + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bn254" + "github.com/stretchr/testify/require" +) + +func makeTestPoints(n int) []curve.G1Affine { + _, _, g1, _ := curve.Generators() + pts := make([]curve.G1Affine, n) + pts[0] = g1 + for i := 1; i < n; i++ { + pts[i].Add(&pts[i-1], &g1) + } + return pts +} + +// TestMSMMatchesCPU verifies GPU MSM matches gnark-crypto CPU MultiExp. +func TestMSMMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + + for _, n := range []int{1, 16, 100, 1000} { + n := n + t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + + // CPU reference + var cpuResult curve.G1Affine + cpuResult.MultiExp(pts, scalars, ecc.MultiExpConfig{}) + + // GPU + msm, err := bn254.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars) + require.NoError(t, err) + require.Len(t, results, 1) + + var gpuAffine curve.G1Affine + gpuAffine.FromJacobian(&results[0]) + + require.True(t, cpuResult.Equal(&gpuAffine), + "MSM mismatch at n=%d", n) + }) + } +} + +// TestMSMBatchScalarSets tests MultiExp with multiple scalar sets. +func TestMSMBatchScalarSets(t *testing.T) { + dev := requireGPUDev(t) + const n = 100 + + pts := makeTestPoints(n) + scalars1 := randFrVec(n) + scalars2 := randFrVec(n) + + // CPU references + var cpu1, cpu2 curve.G1Affine + cpu1.MultiExp(pts, scalars1, ecc.MultiExpConfig{}) + cpu2.MultiExp(pts, scalars2, ecc.MultiExpConfig{}) + + // GPU batch + msm, err := bn254.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars1, scalars2) + require.NoError(t, err) + require.Len(t, results, 2) + + var gpu1, gpu2 curve.G1Affine + gpu1.FromJacobian(&results[0]) + gpu2.FromJacobian(&results[1]) + + require.True(t, cpu1.Equal(&gpu1), "MSM set 0 mismatch") + require.True(t, cpu2.Equal(&gpu2), "MSM set 1 mismatch") +} + +// TestMSMWorkBuffers verifies PinWorkBuffers/ReleaseWorkBuffers are idempotent. +func TestMSMWorkBuffers(t *testing.T) { + dev := requireGPUDev(t) + const n = 64 + + pts := makeTestPoints(n) + scalars := randFrVec(n) + + msm, err := bn254.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + require.NoError(t, msm.PinWorkBuffers()) + r1, err := msm.MultiExp(scalars) + require.NoError(t, err) + + require.NoError(t, msm.ReleaseWorkBuffers()) + r2, err := msm.MultiExp(scalars) + require.NoError(t, err) + + var a1, a2 curve.G1Affine + a1.FromJacobian(&r1[0]) + a2.FromJacobian(&r2[0]) + require.True(t, a1.Equal(&a2), "result changed after work buffer release") +} + +// BenchmarkMSM benchmarks GPU MSM at various sizes. +func BenchmarkMSM(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + msm, err := bn254.NewG1MSM(dev, pts, 0) + require.NoError(b, err) + defer msm.Close() + require.NoError(b, msm.PinWorkBuffers()) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := msm.MultiExp(scalars) + require.NoError(b, err) + } + }) + } +} diff --git a/prover/gpu/plonk2/bn254/pinned_fr.go b/prover/gpu/plonk2/bn254/pinned_fr.go new file mode 100644 index 00000000000..07781d266f2 --- /dev/null +++ b/prover/gpu/plonk2/bn254/pinned_fr.go @@ -0,0 +1,41 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" +) + +type pinnedFrBuffer struct { + ptr unsafe.Pointer + data []fr.Element +} + +func newPinnedFrBuffer(n int) (pinnedFrBuffer, error) { + var ptr unsafe.Pointer + nbytes := C.size_t(n) * C.size_t(fr.Bytes) + if err := toError(C.gnark_gpu_alloc_pinned(&ptr, nbytes)); err != nil { + return pinnedFrBuffer{}, err + } + return pinnedFrBuffer{ + ptr: ptr, + data: unsafe.Slice((*fr.Element)(ptr), n), + }, nil +} + +func (b *pinnedFrBuffer) free() { + if b.ptr != nil { + C.gnark_gpu_free_pinned(b.ptr) + b.ptr = nil + b.data = nil + } +} diff --git a/prover/gpu/plonk2/bn254/plonk_test.go b/prover/gpu/plonk2/bn254/plonk_test.go new file mode 100644 index 00000000000..93e142c199e --- /dev/null +++ b/prover/gpu/plonk2/bn254/plonk_test.go @@ -0,0 +1,169 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254_test + +import ( + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "github.com/consensys/gnark-crypto/ecc/bn254" + kzg "github.com/consensys/gnark-crypto/ecc/bn254/kzg" + gnarkplonk "github.com/consensys/gnark/backend/plonk" + curplonk "github.com/consensys/gnark/backend/plonk/bn254" + cs "github.com/consensys/gnark/constraint/bn254" + "github.com/consensys/gnark/frontend" + "github.com/consensys/gnark/frontend/cs/scs" + "github.com/consensys/gnark/test/unsafekzg" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bn254" + "github.com/stretchr/testify/require" +) + +// addCircuit has enough constraints for sizeSystem >= 6 (avoiding gnark's 8-coset edge case for tiny circuits). +// Circuit: a*b + c*d + e*f = out (out is public) +type addCircuit struct { + A, B, C, D, F, G frontend.Variable + Out frontend.Variable `gnark:",public"` +} + +func (c *addCircuit) Define(api frontend.API) error { + ab := api.Mul(c.A, c.B) + cd := api.Mul(c.C, c.D) + fg := api.Mul(c.F, c.G) + sum := api.Add(ab, cd) + sum2 := api.Add(sum, fg) + api.AssertIsEqual(sum2, c.Out) + return nil +} + +type commitCircuit struct { + A, B, Out frontend.Variable +} + +func (c *commitCircuit) Define(api frontend.API) error { + commitment, err := api.(frontend.Committer).Commit(c.A, c.B) + if err != nil { + return err + } + product := api.Mul(c.A, c.B) + api.AssertIsDifferent(commitment, product) + api.AssertIsEqual(api.Add(c.A, c.B), c.Out) + return nil +} + +func setupAddCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &addCircuit{}) +} + +func setupCommitCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &commitCircuit{}) +} + +func setupCircuit(t testing.TB, circuit frontend.Circuit) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + ccs, err := frontend.Compile(ecc.BN254.ScalarField(), scs.NewBuilder, circuit) + require.NoError(t, err) + + srs, srsLag, err := unsafekzg.NewSRS(ccs) + require.NoError(t, err) + + _, vkIface, err := gnarkplonk.Setup(ccs, srs, srsLag) + require.NoError(t, err) + vk := vkIface.(*curplonk.VerifyingKey) + + // Extract canonical G1 SRS points from the concrete KZG SRS type. + concreteSRS := srs.(*kzg.SRS) + srsPoints := make([]curve.G1Affine, len(concreteSRS.Pk.G1)) + copy(srsPoints, concreteSRS.Pk.G1) + + return ccs.(*cs.SparseR1CS), vk, srsPoints +} + +// TestGPUProveVerify proves a small circuit with the GPU and verifies with gnark CPU. +func TestGPUProveVerify(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := bn254.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15 + 77 + 8} + fullW, err := frontend.NewWitness(assignment, ecc.BN254.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bn254.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +// TestGPUProveMultipleProofs tests that multiple proofs can be generated from the same key. +func TestGPUProveMultipleProofs(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := bn254.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + for i := range 3 { + a := int64(i + 1) + _ = int64(i + 2) + assignment := &addCircuit{A: a, B: a + 1, C: a + 2, D: a + 3, F: a + 4, G: a + 5, Out: a*(a+1) + (a+2)*(a+3) + (a+4)*(a+5)} + fullW, err := frontend.NewWitness(assignment, ecc.BN254.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bn254.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err, "proof %d failed", i) + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "proof %d verification failed", i) + } +} + +func TestGPUProveVerify_BSB22Commitment(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupCommitCircuit(t) + + gpk := bn254.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &commitCircuit{A: 3, B: 5, Out: 8} + fullW, err := frontend.NewWitness(assignment, ecc.BN254.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bn254.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +// BenchmarkGPUProve benchmarks GPU proof generation. +func BenchmarkGPUProve(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + spr, vk, srsPoints := setupAddCircuit(b) + gpk := bn254.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15 + 77 + 8} + fullW, err := frontend.NewWitness(assignment, ecc.BN254.ScalarField()) + require.NoError(b, err) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := bn254.GPUProve(dev, gpk, spr, fullW) + require.NoError(b, err) + } +} diff --git a/prover/gpu/plonk2/bn254/prove.go b/prover/gpu/plonk2/bn254/prove.go new file mode 100644 index 00000000000..34d507d0ccb --- /dev/null +++ b/prover/gpu/plonk2/bn254/prove.go @@ -0,0 +1,2618 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "context" + "errors" + "fmt" + "hash" + "log" + "math/big" + "math/bits" + "os" + "runtime" + "strconv" + "sync" + "time" + "unsafe" + + curve "github.com/consensys/gnark-crypto/ecc/bn254" + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + htf "github.com/consensys/gnark-crypto/ecc/bn254/fr/hash_to_field" + iop "github.com/consensys/gnark-crypto/ecc/bn254/fr/iop" + kzg "github.com/consensys/gnark-crypto/ecc/bn254/kzg" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + + "github.com/consensys/gnark/backend" + curplonk "github.com/consensys/gnark/backend/plonk/bn254" + "github.com/consensys/gnark/backend/witness" + "github.com/consensys/gnark/constraint" + cs "github.com/consensys/gnark/constraint/bn254" + "github.com/consensys/gnark/constraint/solver" + fcs "github.com/consensys/gnark/frontend/cs" + + "github.com/consensys/linea-monorepo/prover/gpu" + "golang.org/x/sync/errgroup" +) + +const ( + id_L int = iota + id_R + id_O + id_Z + + orderBlindingL = 1 + orderBlindingR = 1 + orderBlindingO = 1 + orderBlindingZ = 2 + msmExtraPoints = 6 +) + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProvingKey — slim wrapper: VerifyingKey + lazy gpuInstance +// ───────────────────────────────────────────────────────────────────────────── + +type GPUProvingKey struct { + mu sync.Mutex + Vk *curplonk.VerifyingKey + n int + + // SRS data (consumed during instance init) + srsPoints []curve.G1Affine + pinnedN int + + inst *gpuInstance +} + +// NewGPUProvingKey creates a GPUProvingKey from affine SRS points. +func NewGPUProvingKey(srsPoints []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + n := 0 + if vk != nil { + n = int(vk.Size) + } + return &GPUProvingKey{Vk: vk, n: n, srsPoints: srsPoints} +} + +// Size returns the domain size n. +func (gpk *GPUProvingKey) Size() int { return gpk.n } + +// Prepare performs one-time GPU setup. +func (gpk *GPUProvingKey) Prepare(dev *gpu.Device, spr *cs.SparseR1CS) error { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil && gpk.inst.dev == dev { + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + inst, err := newGPUInstance(dev, gpk, spr) + if err != nil { + return err + } + gpk.inst = inst + return nil +} + +// Close releases all GPU resources. +func (gpk *GPUProvingKey) Close() { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuInstance — persistent GPU resources + circuit data +// ───────────────────────────────────────────────────────────────────────────── + +// quotientWorkBufs holds pre-allocated GPU buffers for computeNumeratorGPU and +// computeLinearizedPoly, avoiding per-proof cudaMalloc/Free overhead. +type quotientWorkBufs struct { + L, R, O, Z *FrVector // wire poly working buffers (reused per coset) + S1, S2, S3 *FrVector // perm selector buffers + Result *FrVector // coset numerator accumulator + LCan, RCan, OCan, ZCan *FrVector // canonical wire polys (uploaded once per proof) + QkSrc *FrVector // Qk canonical source (D2D per coset, avoids H2D) + Pi2Src []*FrVector // per-proof BSB22 pi2 sources (D2D per coset) + CosetBlock [3]*FrVector // GPU-resident coset results; Result keeps block 4 + LinResult, LinW *FrVector // linearized poly GPU scratch +} + +type lowMemorySelectorCache struct { + ql, qr, qm, qo *FrVector + s1, s2, s3 *FrVector + qcp []*FrVector +} + +type splitMSMBackend struct { + secondary *gpu.Device + msm0 *G1MSM + msm1 *G1MSM + split int +} + +type gpuInstance struct { + dev *gpu.Device + vk *curplonk.VerifyingKey + n int + log2n uint + lowMemory bool + canonicalReady chan struct{} + canonicalErr error + canonicalOnce sync.Once + + domain0 *fft.Domain + + msm *G1MSM + splitMSM *splitMSMBackend + fftDom *GPUFFTDomain + dPerm unsafe.Pointer + + dQl, dQr, dQm, dQo *FrVector + dS1, dS2, dS3 *FrVector + dQkFixed *FrVector + dQcp []*FrVector + + qlCanonical, qrCanonical, qmCanonical, qoCanonical fr.Vector + qkFixedCanonical fr.Vector + s1Canonical, s2Canonical, s3Canonical fr.Vector + qcpCanonical []fr.Vector + qkLagrange fr.Vector + permutation []int64 + nbPublicVariables int + commitmentInfo []uint64 + + gpuWork *FrVector // shared scratch buffer (persists for prover lifetime) + qWb quotientWorkBufs + + hBufs hostBufs +} + +type gpuInstanceReadyHooks struct { + msm func(*gpuInstance) + commit func(*gpuInstance) + trace func(*gpuInstance) +} + +type hostBufs struct { + lCanonical, rCanonical, oCanonical fr.Vector + zLagrange fr.Vector + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + hFull []fr.Element + openZBuf []fr.Element + pinned []pinnedFrBuffer +} + +func (inst *gpuInstance) initHostBufs() { + n := inst.n + var hb hostBufs + + allocPinnedHotBuffer := func(name string, n int) []fr.Element { + if os.Getenv("GNARK_GPU_DISABLE_PINNED_HOST_BUFS") == "" { + buf, err := newPinnedFrBuffer(n) + if err == nil { + hb.pinned = append(hb.pinned, buf) + return buf.data + } + log.Printf("gpu: pinned host buffer %s unavailable (%v), using heap", name, err) + } + return make([]fr.Element, n) + } + + hb = hostBufs{ + lCanonical: make(fr.Vector, n), + rCanonical: make(fr.Vector, n), + oCanonical: make(fr.Vector, n), + zLagrange: make(fr.Vector, n), + qkCoeffs: make(fr.Vector, n), + openZBuf: make([]fr.Element, n+1+orderBlindingZ), + } + hb.lBlinded = allocPinnedHotBuffer("lBlinded", n+1+orderBlindingL) + hb.rBlinded = allocPinnedHotBuffer("rBlinded", n+1+orderBlindingR) + hb.oBlinded = allocPinnedHotBuffer("oBlinded", n+1+orderBlindingO) + hb.zBlinded = allocPinnedHotBuffer("zBlinded", n+1+orderBlindingZ) + hSize := 4 * n + if needed := 3 * (n + 2); needed > hSize { + hSize = needed + } + hb.hFull = allocPinnedHotBuffer("hFull", hSize) + inst.hBufs = hb +} + +func (hb *hostBufs) free() { + for i := range hb.pinned { + hb.pinned[i].free() + } + *hb = hostBufs{} +} + +func newGPUInstance(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, hooks ...gpuInstanceReadyHooks) (*gpuInstance, error) { + inst := &gpuInstance{dev: dev, vk: gpk.Vk, n: gpk.n, canonicalReady: make(chan struct{})} + var hook gpuInstanceReadyHooks + if len(hooks) > 0 { + hook = hooks[0] + } + commitPublished := false + msmPublished := false + tracePublished := false + publishMSMReady := func() { + if hook.msm != nil && !msmPublished { + msmPublished = true + hook.msm(inst) + } + } + publishCommitReady := func() { + if hook.commit != nil && !commitPublished { + commitPublished = true + hook.commit(inst) + } + } + publishTraceReady := func() { + if hook.trace != nil && !tracePublished { + tracePublished = true + hook.trace(inst) + } + } + var traceErrCh chan error + + fail := func(msg string, err error) (*gpuInstance, error) { + wrapped := fmt.Errorf("%s: %w", msg, err) + if traceErrCh != nil { + <-traceErrCh + traceErrCh = nil + } + inst.publishCanonicalReady(wrapped) + if !msmPublished && !commitPublished && !tracePublished { + inst.close() + } + return nil, wrapped + } + + if err := inst.initCircuitShape(spr); err != nil { + return fail("init circuit shape", err) + } + inst.lowMemory = selectLowMemoryMode(dev, inst.n) + traceErrCh = make(chan error, 1) + go func() { + traceErrCh <- inst.initTraceData(spr) + }() + waitTrace := func() error { + if traceErrCh == nil { + return nil + } + err := <-traceErrCh + traceErrCh = nil + return err + } + + var err error + msmSize := inst.n + msmExtraPoints + pts := gpk.srsPoints + if msmSize > len(pts) { + msmSize = len(pts) + } + if secondaryID, ok, cfgErr := secondaryMSMDeviceID(dev.DeviceID()); cfgErr != nil { + return fail("configure secondary MSM GPU", cfgErr) + } else if ok { + split := inst.n / 2 + if split <= 0 || split >= msmSize { + return fail("configure secondary MSM GPU", fmt.Errorf("invalid split %d for MSM size %d", split, msmSize)) + } + secondary, err := gpu.New(gpu.WithDeviceID(secondaryID)) + if err != nil { + return fail("create secondary GPU device", err) + } + inst.splitMSM = &splitMSMBackend{secondary: secondary, split: split} + inst.splitMSM.msm0, err = NewG1MSM(dev, pts[:split], 0) + if err != nil { + return fail("create primary split MSM", err) + } + inst.splitMSM.msm1, err = NewG1MSM(secondary, pts[split:msmSize], 0) + if err != nil { + return fail("create secondary split MSM", err) + } + } else { + inst.msm, err = NewG1MSM(dev, pts[:msmSize], 0) + if err != nil { + return fail("create MSM", err) + } + } + gpk.srsPoints = nil // ownership transferred; free heap copy + + if !inst.lowMemory { + if perr := inst.pinMSMWorkBuffers(); perr != nil { + return fail("pin MSM work buffers", perr) + } + } + + if inst.lowMemory { + if err := inst.offloadMSMPoints(); err != nil { + return fail("offload MSM points", err) + } + } + + inst.fftDom, err = NewFFTDomain(dev, inst.n) + if err != nil { + return fail("create FFT domain", err) + } + + if inst.lowMemory { + inst.gpuWork, err = NewFrVector(dev, inst.n) + if err != nil { + return fail("alloc low-memory GPU work buffer", err) + } + if err := dev.InitMultiStream(); err != nil { + return fail("init multi-stream", err) + } + publishMSMReady() + inst.initHostBufs() + publishCommitReady() + } + + if err := waitTrace(); err != nil { + return fail("init circuit data", err) + } + + inst.dPerm, err = DeviceAllocCopyInt64(dev, inst.permutation) + if err != nil { + return fail("upload permutation", err) + } + + if inst.lowMemory { + publishTraceReady() + } + + if err := inst.initCanonicalGPU(); err != nil { + return fail("init canonical", err) + } + + if inst.lowMemory { + inst.publishCanonicalReady(nil) + return inst, nil + } + + if err := inst.uploadPolynomials(); err != nil { + return fail("upload polynomials", err) + } + + if err := inst.allocPersistentBufs(); err != nil { + return fail("alloc persistent GPU buffers", err) + } + + inst.initHostBufs() + publishMSMReady() + publishCommitReady() + publishTraceReady() + inst.publishCanonicalReady(nil) + return inst, nil +} + +func (inst *gpuInstance) publishCanonicalReady(err error) { + inst.canonicalOnce.Do(func() { + inst.canonicalErr = err + close(inst.canonicalReady) + }) +} + +func (inst *gpuInstance) waitCanonicalReady() error { + if inst.canonicalReady == nil { + return nil + } + <-inst.canonicalReady + return inst.canonicalErr +} + +func selectLowMemoryMode(dev *gpu.Device, n int) bool { + if os.Getenv("GNARK_GPU_PLONK2_FORCE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode forced for n=%d", n) + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode disabled for n=%d", n) + return false + } + free, total, err := dev.MemGetInfo() + if err != nil { + low := n >= 1<<25 + log.Printf("plonk2: low-memory GPU mode=%t for n=%d; mem query failed: %v", low, n, err) + return low + } + vecBytes := uint64(n) * uint64(fr.Bytes) + estimatedResident := vecBytes * 24 + low := estimatedResident > total/2 + log.Printf( + "plonk2: low-memory GPU mode=%t n=%d vecBytes=%d estimatedResident=%d freeVRAM=%d totalVRAM=%d", + low, n, vecBytes, estimatedResident, free, total, + ) + return low +} + +func secondaryMSMDeviceID(primaryID int) (int, bool, error) { + raw := os.Getenv("GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID") + if raw == "" { + return 0, false, nil + } + id, err := strconv.Atoi(raw) + if err != nil { + return 0, false, fmt.Errorf("invalid GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID %q: %w", raw, err) + } + if id == primaryID { + return 0, false, fmt.Errorf("secondary device matches primary device %d", primaryID) + } + if id < 0 { + return 0, false, fmt.Errorf("secondary device id must be non-negative, got %d", id) + } + return id, true, nil +} + +func (inst *gpuInstance) pinMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.PinWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.PinWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.PinWorkBuffers() +} + +func (inst *gpuInstance) releaseMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReleaseWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReleaseWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReleaseWorkBuffers() +} + +func (inst *gpuInstance) offloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.OffloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.OffloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.OffloadPoints() +} + +func (inst *gpuInstance) reloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReloadPoints() +} + +// allocPersistentBufs allocates GPU work buffers that persist across proofs. +// Avoids per-proof cudaMalloc/Free overhead (~3 ms per 64 MB alloc × 20 bufs). +func (inst *gpuInstance) allocPersistentBufs() error { + n := inst.n + alloc := func() (*FrVector, error) { + return NewFrVector(inst.dev, n) + } + wb := &inst.qWb + // Flat list mirrors the free loop in close() — keep in sync. + named := []*(*FrVector){ + &inst.gpuWork, + &wb.L, &wb.R, &wb.O, &wb.Z, + &wb.S1, &wb.S2, &wb.S3, &wb.Result, + &wb.LCan, &wb.RCan, &wb.OCan, &wb.ZCan, + &wb.QkSrc, &wb.LinResult, &wb.LinW, + } + for _, p := range named { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + *p = v + } + for k := range wb.CosetBlock { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.CosetBlock[k] = v + } + if len(inst.commitmentInfo) > 0 { + wb.Pi2Src = make([]*FrVector, len(inst.commitmentInfo)) + for i := range wb.Pi2Src { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.Pi2Src[i] = v + } + } + // Create multi-stream upfront so the quotient pipeline can use it immediately. + return inst.dev.InitMultiStream() +} + +func (inst *gpuInstance) initCircuitShape(spr *cs.SparseR1CS) error { + nbConstraints := spr.GetNbConstraints() + sizeSystem := uint64(nbConstraints + len(spr.Public)) + inst.domain0 = fft.NewDomain(sizeSystem, fft.WithoutPrecompute()) + n := int(inst.domain0.Cardinality) + if n != inst.n { + return fmt.Errorf("domain size mismatch: spr=%d SRS=%d", n, inst.n) + } + inst.log2n = uint(bits.TrailingZeros(uint(n))) + inst.nbPublicVariables = len(spr.Public) + inst.commitmentInfo = inst.vk.CommitmentConstraintIndexes + return nil +} + +func (inst *gpuInstance) initTraceData(spr *cs.SparseR1CS) error { + trace := curplonk.NewTrace(spr, inst.domain0) + inst.qlCanonical = fr.Vector(trace.Ql.Coefficients()) + inst.qrCanonical = fr.Vector(trace.Qr.Coefficients()) + inst.qmCanonical = fr.Vector(trace.Qm.Coefficients()) + inst.qoCanonical = fr.Vector(trace.Qo.Coefficients()) + inst.s1Canonical = fr.Vector(trace.S1.Coefficients()) + inst.s2Canonical = fr.Vector(trace.S2.Coefficients()) + inst.s3Canonical = fr.Vector(trace.S3.Coefficients()) + + inst.qkLagrange = make(fr.Vector, inst.n) + copy(inst.qkLagrange, trace.Qk.Coefficients()) + inst.qkFixedCanonical = fr.Vector(trace.Qk.Coefficients()) + + inst.qcpCanonical = make([]fr.Vector, len(trace.Qcp)) + for i, p := range trace.Qcp { + inst.qcpCanonical[i] = fr.Vector(p.Coefficients()) + } + inst.permutation = trace.S + return nil +} + +func (inst *gpuInstance) initCanonicalGPU() error { + n := inst.n + gpuWork, err := NewFrVector(inst.dev, n) + if err != nil { + return fmt.Errorf("alloc work vector: %w", err) + } + defer gpuWork.Free() + + iFFTSelector := func(v fr.Vector) { + gpuWork.CopyFromHost(v) + inst.fftDom.BitReverse(gpuWork) + inst.fftDom.FFTInverse(gpuWork) + gpuWork.CopyToHost(v) + } + + for _, v := range []fr.Vector{ + inst.qlCanonical, inst.qrCanonical, inst.qmCanonical, inst.qoCanonical, + inst.qkFixedCanonical, inst.s1Canonical, inst.s2Canonical, inst.s3Canonical, + } { + iFFTSelector(v) + } + for _, v := range inst.qcpCanonical { + iFFTSelector(v) + } + + return inst.dev.Sync() +} + +func (inst *gpuInstance) uploadPolynomials() error { + upload := func(data fr.Vector) (*FrVector, error) { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, err + } + v.CopyFromHost(data) + return v, nil + } + var err error + if inst.dQl, err = upload(inst.qlCanonical); err != nil { + return fmt.Errorf("upload ql: %w", err) + } + if inst.dQr, err = upload(inst.qrCanonical); err != nil { + return fmt.Errorf("upload qr: %w", err) + } + if inst.dQm, err = upload(inst.qmCanonical); err != nil { + return fmt.Errorf("upload qm: %w", err) + } + if inst.dQo, err = upload(inst.qoCanonical); err != nil { + return fmt.Errorf("upload qo: %w", err) + } + if inst.dS1, err = upload(inst.s1Canonical); err != nil { + return fmt.Errorf("upload s1: %w", err) + } + if inst.dS2, err = upload(inst.s2Canonical); err != nil { + return fmt.Errorf("upload s2: %w", err) + } + if inst.dS3, err = upload(inst.s3Canonical); err != nil { + return fmt.Errorf("upload s3: %w", err) + } + if inst.dQkFixed, err = upload(inst.qkFixedCanonical); err != nil { + return fmt.Errorf("upload qkFixed: %w", err) + } + inst.dQcp = make([]*FrVector, len(inst.qcpCanonical)) + for i, v := range inst.qcpCanonical { + if inst.dQcp[i], err = upload(v); err != nil { + return fmt.Errorf("upload qcp[%d]: %w", i, err) + } + } + return nil +} + +func (inst *gpuInstance) close() { + if inst.msm != nil { + inst.msm.Close() + inst.msm = nil + } + if inst.splitMSM != nil { + if inst.splitMSM.msm0 != nil { + inst.splitMSM.msm0.Close() + } + if inst.splitMSM.msm1 != nil { + inst.splitMSM.msm1.Close() + } + if inst.splitMSM.secondary != nil { + _ = inst.splitMSM.secondary.Close() + } + inst.splitMSM = nil + } + if inst.fftDom != nil { + inst.fftDom.Close() + inst.fftDom = nil + } + if inst.dPerm != nil { + DeviceFreePtr(inst.dPerm) + inst.dPerm = nil + } + for _, v := range []*FrVector{inst.dQl, inst.dQr, inst.dQm, inst.dQo, + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed} { + if v != nil { + v.Free() + } + } + inst.dQl, inst.dQr, inst.dQm, inst.dQo = nil, nil, nil, nil + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed = nil, nil, nil, nil + for _, v := range inst.dQcp { + if v != nil { + v.Free() + } + } + inst.dQcp = nil + // Free persistent work buffers (mirrors the alloc list in allocPersistentBufs). + wb := &inst.qWb + for _, v := range []*FrVector{ + inst.gpuWork, + wb.L, wb.R, wb.O, wb.Z, wb.S1, wb.S2, wb.S3, wb.Result, + wb.LCan, wb.RCan, wb.OCan, wb.ZCan, wb.QkSrc, wb.LinResult, wb.LinW, + } { + if v != nil { + v.Free() + } + } + for k := range wb.CosetBlock { + if wb.CosetBlock[k] != nil { + wb.CosetBlock[k].Free() + } + } + for _, v := range wb.Pi2Src { + if v != nil { + v.Free() + } + } + inst.gpuWork = nil + inst.qWb = quotientWorkBufs{} + inst.hBufs.free() +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuProver — per-proof mutable state +// ───────────────────────────────────────────────────────────────────────────── + +type gpuProver struct { + inst *gpuInstance + instMu sync.Mutex + waitInst func() (*gpuInstance, error) + waitMSMInst func() (*gpuInstance, error) + waitCommitInst func() (*gpuInstance, error) + + proof curplonk.Proof + fs *fiatshamir.Transcript + + commitmentInfo constraint.PlonkCommitments + commitmentVal []fr.Element + pi2Canonical [][]fr.Element + pi2DeviceReady []bool + solverOpts []solver.Option + kzgFoldingHash hash.Hash + htfFunc hash.Hash + + evalL, evalR, evalO fr.Vector + wWitness fr.Vector + bpL, bpR, bpO, bpZ *iop.Polynomial + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + h1, h2, h3 []fr.Element + gamma, beta, alpha, zeta fr.Element + + logTime func(string) +} + +// ─── Prove phases ───────────────────────────────────────────────────────────── + +func (p *gpuProver) ensureInst() (*gpuInstance, error) { + p.instMu.Lock() + if p.inst != nil { + inst := p.inst + p.instMu.Unlock() + return inst, nil + } + waitInst := p.waitInst + p.instMu.Unlock() + if waitInst == nil { + return nil, errors.New("gpu instance is not initialized") + } + inst, err := waitInst() + if err != nil { + return nil, err + } + p.instMu.Lock() + if p.inst == nil { + p.inst = inst + } + inst = p.inst + p.instMu.Unlock() + return inst, nil +} + +func (p *gpuProver) initBlindingPolynomials() { + p.bpL = getRandomPolynomial(orderBlindingL) + p.bpR = getRandomPolynomial(orderBlindingR) + p.bpO = getRandomPolynomial(orderBlindingO) + p.bpZ = getRandomPolynomial(orderBlindingZ) +} + +func (p *gpuProver) solve(spr *cs.SparseR1CS, fullWitness witness.Witness) error { + solverOpts := append([]solver.Option(nil), p.solverOpts...) + if len(p.commitmentInfo) > 0 { + bsb22ID := solver.GetHintID(fcs.Bsb22CommitmentComputePlaceholder) + solverOpts = append(solverOpts, solver.OverrideHint(bsb22ID, func(_ *big.Int, ins, outs []*big.Int) error { + waitMSMInst := p.waitMSMInst + if waitMSMInst == nil { + waitMSMInst = p.waitCommitInst + } + if waitMSMInst == nil { + waitMSMInst = p.ensureInst + } + inst, err := waitMSMInst() + if err != nil { + return err + } + n := inst.n + commDepth := int(ins[0].Int64()) + ins = ins[1:] + ci := p.commitmentInfo[commDepth] + committedValues := make([]fr.Element, inst.domain0.Cardinality) + offset := inst.nbPublicVariables + for i := range ins { + committedValues[offset+ci.Committed[i]].SetBigInt(ins[i]) + } + committedValues[offset+ci.CommitmentIndex].SetRandom() + committedValues[offset+spr.GetNbConstraints()-1].SetRandom() + + inst.gpuWork.CopyFromHost(fr.Vector(committedValues[:n])) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if commDepth < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[commDepth] != nil { + inst.qWb.Pi2Src[commDepth].CopyFromDevice(inst.gpuWork) + p.pi2DeviceReady[commDepth] = true + } + canonicalBuf := make(fr.Vector, n) + inst.gpuWork.CopyToHost(canonicalBuf) + p.pi2Canonical[commDepth] = canonicalBuf + + commitment, err := inst.commit(canonicalBuf) + if err != nil { + return err + } + p.proof.Bsb22Commitments[commDepth] = commitment + + p.htfFunc.Write(p.proof.Bsb22Commitments[commDepth].Marshal()) + hashBts := p.htfFunc.Sum(nil) + p.htfFunc.Reset() + nbBuf := fr.Bytes + if p.htfFunc.Size() < fr.Bytes { + nbBuf = p.htfFunc.Size() + } + p.commitmentVal[commDepth].SetBytes(hashBts[:nbBuf]) + p.commitmentVal[commDepth].BigInt(outs[0]) + return nil + })) + } + + solution_, err := spr.Solve(fullWitness, solverOpts...) + if err != nil { + return fmt.Errorf("solve: %w", err) + } + solution := solution_.(*cs.SparseR1CSSolution) + p.evalL = fr.Vector(solution.L) + p.evalR = fr.Vector(solution.R) + p.evalO = fr.Vector(solution.O) + + var ok bool + p.wWitness, ok = fullWitness.Vector().(fr.Vector) + if !ok { + return errors.New("invalid witness type") + } + return nil +} + +func (p *gpuProver) completeQk() { + inst, err := p.ensureInst() + if err != nil { + panic(err) + } + p.qkCoeffs = inst.hBufs.qkCoeffs + copy(p.qkCoeffs, inst.qkLagrange) + copy(p.qkCoeffs, p.wWitness[:inst.nbPublicVariables]) + for i := range p.commitmentInfo { + p.qkCoeffs[inst.nbPublicVariables+p.commitmentInfo[i].CommitmentIndex] = p.commitmentVal[i] + } +} + +// commitToLRO overlaps the iFFT of L,R,O with Qk patching (via waitQk) and +// blinding-polynomial generation (via waitBlinding), both of which complete +// concurrently in sibling goroutines. +func (p *gpuProver) commitToLRO(inst *gpuInstance, waitQk, waitBlinding func() error) error { + hb := &inst.hBufs + + gpuToCanonical := func(lagrange, dst fr.Vector, dstDevice *FrVector) { + inst.gpuWork.CopyFromHost(lagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if dstDevice != nil { + dstDevice.CopyFromDevice(inst.gpuWork) + } + inst.gpuWork.CopyToHost(dst) + } + + if inst.lowMemory { + gpuToCanonical(p.evalL, hb.lCanonical, nil) + gpuToCanonical(p.evalR, hb.rCanonical, nil) + gpuToCanonical(p.evalO, hb.oCanonical, nil) + } else { + gpuToCanonical(p.evalL, hb.lCanonical, inst.qWb.LCan) + gpuToCanonical(p.evalR, hb.rCanonical, inst.qWb.RCan) + gpuToCanonical(p.evalO, hb.oCanonical, inst.qWb.OCan) + } + + if err := waitQk(); err != nil { + return err + } + inst.gpuWork.CopyFromHost(p.qkCoeffs) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if inst.lowMemory { + inst.gpuWork.CopyToHost(p.qkCoeffs) + } else { + inst.qWb.QkSrc.CopyFromDevice(inst.gpuWork) + p.qkCoeffs = nil + } + + if err := waitBlinding(); err != nil { + return err + } + + var blindWG sync.WaitGroup + blindWG.Add(3) + go func() { defer blindWG.Done(); p.lBlinded = blindInto(hb.lBlinded, hb.lCanonical, p.bpL) }() + go func() { defer blindWG.Done(); p.rBlinded = blindInto(hb.rBlinded, hb.rCanonical, p.bpR) }() + go func() { defer blindWG.Done(); p.oBlinded = blindInto(hb.oBlinded, hb.oCanonical, p.bpO) }() + blindWG.Wait() + if !inst.lowMemory { + SubtractBlindingHead(inst.qWb.LCan, p.bpL.Coefficients()) + SubtractBlindingHead(inst.qWb.RCan, p.bpR.Coefficients()) + SubtractBlindingHead(inst.qWb.OCan, p.bpO.Coefficients()) + } + + p.logTime("iFFT L,R,O,Qk + blind") + + lroCommits, err := inst.commitN(p.lBlinded, p.rBlinded, p.oBlinded) + if err != nil { + return err + } + p.proof.LRO[0] = lroCommits[0] + p.proof.LRO[1] = lroCommits[1] + p.proof.LRO[2] = lroCommits[2] + + p.logTime("MSM commit L,R,O") + return nil +} + +func (p *gpuProver) deriveGammaBeta() error { + inst := p.inst + if err := bindPublicData(p.fs, "gamma", inst.vk, p.wWitness[:inst.nbPublicVariables]); err != nil { + return err + } + var err error + p.gamma, err = deriveRandomness(p.fs, "gamma", &p.proof.LRO[0], &p.proof.LRO[1], &p.proof.LRO[2]) + if err != nil { + return err + } + p.beta, err = deriveRandomness(p.fs, "beta") + if err != nil { + return err + } + p.wWitness = nil + p.logTime("derive gamma,beta") + return nil +} + +func (p *gpuProver) buildZAndCommit() error { + inst := p.inst + + zLagrange, err := buildZGPU(inst, inst.gpuWork, p.evalL, p.evalR, p.evalO, p.beta, p.gamma) + if err != nil { + return fmt.Errorf("build Z: %w", err) + } + p.evalL, p.evalR, p.evalO = nil, nil, nil + p.logTime("build Z") + + hb := &inst.hBufs + inst.gpuWork.CopyFromHost(zLagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + inst.gpuWork.CopyToHost(hb.zLagrange) + p.zBlinded = blindInto(hb.zBlinded, hb.zLagrange, p.bpZ) + if !inst.lowMemory { + inst.qWb.ZCan.CopyFromDevice(inst.gpuWork) + SubtractBlindingHead(inst.qWb.ZCan, p.bpZ.Coefficients()) + } + + zCommit, err := inst.commit(p.zBlinded) + if err != nil { + return err + } + p.proof.Z = zCommit + p.logTime("iFFT+commit Z") + + alphaDeps := make([]*curve.G1Affine, len(p.proof.Bsb22Commitments)+1) + for i := range p.proof.Bsb22Commitments { + alphaDeps[i] = &p.proof.Bsb22Commitments[i] + } + alphaDeps[len(alphaDeps)-1] = &p.proof.Z + var aerr error + p.alpha, aerr = deriveRandomness(p.fs, "alpha", alphaDeps...) + if aerr != nil { + return aerr + } + p.logTime("derive alpha") + return nil +} + +func (p *gpuProver) computeQuotientAndCommit() error { + inst := p.inst + if err := inst.waitCanonicalReady(); err != nil { + return fmt.Errorf("initialize canonical selector data: %w", err) + } + + pointsOffloaded := false + if inst.shouldOffloadMSMForQuotient() { + if err := inst.offloadMSMPoints(); err != nil { + return fmt.Errorf("offload MSM points: %w", err) + } + pointsOffloaded = true + if err := inst.releaseMSMWorkBuffers(); err != nil { + return fmt.Errorf("release MSM work buffers: %w", err) + } + } + defer func() { + if pointsOffloaded { + _ = inst.reloadMSMPoints() + if !inst.lowMemory { + _ = inst.pinMSMWorkBuffers() + } + } + }() + + var qErr error + p.h1, p.h2, p.h3, qErr = computeNumeratorGPU( + inst, inst.gpuWork, + p.lBlinded, p.rBlinded, p.oBlinded, p.zBlinded, + p.qkCoeffs, p.pi2Canonical, p.pi2DeviceReady, + p.alpha, p.beta, p.gamma, + ) + if qErr != nil { + return fmt.Errorf("compute quotient: %w", qErr) + } + + p.logTime("quotient GPU") + + if pointsOffloaded { + if err := inst.reloadMSMPoints(); err != nil { + return fmt.Errorf("reload MSM points: %w", err) + } + if !inst.lowMemory { + if err := inst.pinMSMWorkBuffers(); err != nil { + return fmt.Errorf("re-pin MSM work buffers: %w", err) + } + } + pointsOffloaded = false + } + hCommits, err := inst.commitN(p.h1, p.h2, p.h3) + if err != nil { + return err + } + p.proof.H[0] = hCommits[0] + p.proof.H[1] = hCommits[1] + p.proof.H[2] = hCommits[2] + p.logTime("MSM commit h1,h2,h3") + + var zetaErr error + p.zeta, zetaErr = deriveRandomness(p.fs, "zeta", &p.proof.H[0], &p.proof.H[1], &p.proof.H[2]) + if zetaErr != nil { + return zetaErr + } + return nil +} + +func (inst *gpuInstance) shouldOffloadMSMForQuotient() bool { + if inst.lowMemory { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_FORCE_MSM_OFFLOAD") != "" { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_MSM_OFFLOAD") != "" { + return false + } + free, _, err := inst.dev.MemGetInfo() + if err != nil { + return true + } + reserve := uint64(inst.n) * uint64(fr.Bytes) * 8 + const minReserve = 2 << 30 + if reserve < minReserve { + reserve = minReserve + } + return free < reserve +} + +func (p *gpuProver) openAndFinalize() error { + inst := p.inst + + var zetaShifted fr.Element + zetaShifted.Mul(&p.zeta, &inst.domain0.Generator) + + openZPoly := inst.hBufs.openZBuf[:len(p.zBlinded)] + copy(openZPoly, p.zBlinded) + bzuzetaCh := make(chan fr.Element, 1) + go func() { + parallelHornerQuotient(openZPoly, zetaShifted) + bzuzetaCh <- openZPoly[0] + }() + + // Evaluate host-only blinded polys on CPU while GPU-resident selector polys + // are evaluated on device. + var blzeta, brzeta, bozeta, s1Zeta, s2Zeta fr.Element + var evalWG sync.WaitGroup + evalWG.Add(3) + go func() { defer evalWG.Done(); blzeta = polyEvalParallel(p.lBlinded, p.zeta) }() + go func() { defer evalWG.Done(); brzeta = polyEvalParallel(p.rBlinded, p.zeta) }() + go func() { defer evalWG.Done(); bozeta = polyEvalParallel(p.oBlinded, p.zeta) }() + + if inst.lowMemory { + s1Zeta = polyEvalParallel(inst.s1Canonical, p.zeta) + s2Zeta = polyEvalParallel(inst.s2Canonical, p.zeta) + } else { + s1Zeta = PolyEvalGPU(inst.dev, inst.dS1, p.zeta) + s2Zeta = PolyEvalGPU(inst.dev, inst.dS2, p.zeta) + } + + qcpzeta := make([]fr.Element, len(p.commitmentInfo)) + for i := range p.commitmentInfo { + if inst.lowMemory { + qcpzeta[i] = polyEvalParallel(inst.qcpCanonical[i], p.zeta) + } else { + qcpzeta[i] = PolyEvalGPU(inst.dev, inst.dQcp[i], p.zeta) + } + } + evalWG.Wait() + + bzuzeta := <-bzuzetaCh + p.proof.ZShiftedOpening.ClaimedValue.Set(&bzuzeta) + + var linPol []fr.Element + if inst.lowMemory { + linPol = innerComputeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.h1, p.h2, p.h3, + ) + } else { + linPol = computeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.pi2DeviceReady, p.h1, p.h2, p.h3, + ) + } + p.h1, p.h2, p.h3, p.pi2Canonical, p.pi2DeviceReady = nil, nil, nil, nil, nil + + zOpenCommit, err := inst.commit(openZPoly[1:]) + if err != nil { + return err + } + p.proof.ZShiftedOpening.H = zOpenCommit + p.logTime("eval+linearize+open Z") + + linPolZetaCh := make(chan fr.Element, 1) + go func() { + linPolZetaCh <- polyEvalParallel(linPol, p.zeta) + }() + + linPolDigest, err := inst.commit(linPol) + if err != nil { + return err + } + p.logTime("MSM commit linPol") + + nPolysToOpen := 6 + len(inst.qcpCanonical) + claimedValues := make([]fr.Element, nPolysToOpen) + claimedValues[0] = <-linPolZetaCh + claimedValues[1] = blzeta + claimedValues[2] = brzeta + claimedValues[3] = bozeta + claimedValues[4] = s1Zeta + claimedValues[5] = s2Zeta + for i := range inst.qcpCanonical { + claimedValues[6+i] = qcpzeta[i] + } + + polysToOpen := make([][]fr.Element, nPolysToOpen) + polysToOpen[0] = linPol + polysToOpen[1] = p.lBlinded + polysToOpen[2] = p.rBlinded + polysToOpen[3] = p.oBlinded + polysToOpen[4] = inst.s1Canonical + polysToOpen[5] = inst.s2Canonical + for i := range inst.qcpCanonical { + polysToOpen[6+i] = inst.qcpCanonical[i] + } + + digestsToOpen := make([]curve.G1Affine, nPolysToOpen) + digestsToOpen[0] = linPolDigest + digestsToOpen[1] = p.proof.LRO[0] + digestsToOpen[2] = p.proof.LRO[1] + digestsToOpen[3] = p.proof.LRO[2] + digestsToOpen[4] = inst.vk.S[0] + digestsToOpen[5] = inst.vk.S[1] + copy(digestsToOpen[6:], inst.vk.Qcp) + + p.proof.BatchedProof, err = gpuBatchOpen( + inst.commit, + polysToOpen, digestsToOpen, claimedValues, + p.zeta, + p.kzgFoldingHash, + p.proof.ZShiftedOpening.ClaimedValue.Marshal(), + ) + if err != nil { + return fmt.Errorf("batch opening: %w", err) + } + p.logTime("batch opening") + return nil +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProve — top-level prove API +// ───────────────────────────────────────────────────────────────────────────── + +func GPUProve(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, fullWitness witness.Witness, opts ...backend.ProverOption) (*curplonk.Proof, error) { + proverCfg, err := backend.NewProverConfig(opts...) + if err != nil { + return nil, fmt.Errorf("create prover config: %w", err) + } + if proverCfg.HashToFieldFn == nil { + proverCfg.HashToFieldFn = newHTF([]byte("BSB22-Plonk")) + } + + gpk.mu.Lock() + defer gpk.mu.Unlock() + + if gpk.Vk == nil { + return nil, errors.New("gpu: proving key missing verifying key") + } + + proveStart := time.Now() + logTime := func(label string) { + log.Printf(" [GPUProve n=%d] %s: %v", gpk.n, label, time.Since(proveStart)) + } + + var commitmentInfo constraint.PlonkCommitments + if spr.CommitmentInfo != nil { + commitmentInfo = spr.CommitmentInfo.(constraint.PlonkCommitments) + } + + nbCommitments := len(commitmentInfo) + newProof := &curplonk.Proof{ + Bsb22Commitments: make([]curve.G1Affine, nbCommitments), + } + + msmInstReady := make(chan struct{}) + commitInstReady := make(chan struct{}) + traceInstReady := make(chan struct{}) + var ( + msmInstPublishOnce sync.Once + commitInstPublishOnce sync.Once + traceInstPublishOnce sync.Once + msmInst *gpuInstance + commitInst *gpuInstance + traceInst *gpuInstance + msmInstErr error + commitInstErr error + traceInstErr error + ) + publishMSMInst := func(inst *gpuInstance, err error) { + msmInstPublishOnce.Do(func() { + if err != nil { + msmInstErr = err + } else { + msmInst = inst + } + close(msmInstReady) + }) + } + waitMSMInst := func() (*gpuInstance, error) { + <-msmInstReady + if msmInstErr != nil { + return nil, msmInstErr + } + if msmInst == nil { + return nil, errors.New("gpu instance initialization did not publish an MSM-ready instance") + } + return msmInst, nil + } + publishCommitInst := func(inst *gpuInstance, err error) { + commitInstPublishOnce.Do(func() { + if err != nil { + commitInstErr = err + } else { + commitInst = inst + } + close(commitInstReady) + }) + } + waitCommitInst := func() (*gpuInstance, error) { + <-commitInstReady + if commitInstErr != nil { + return nil, commitInstErr + } + if commitInst == nil { + return nil, errors.New("gpu instance initialization did not publish a commitment-ready instance") + } + return commitInst, nil + } + publishTraceInst := func(inst *gpuInstance, err error) { + traceInstPublishOnce.Do(func() { + if err != nil { + traceInstErr = err + } else { + traceInst = inst + gpk.inst = inst + } + close(traceInstReady) + }) + } + waitInst := func() (*gpuInstance, error) { + <-traceInstReady + if traceInstErr != nil { + return nil, traceInstErr + } + if traceInst == nil { + return nil, errors.New("gpu instance initialization did not publish a trace-ready instance") + } + return traceInst, nil + } + + p := &gpuProver{ + proof: *newProof, + fs: fiatshamir.NewTranscript(proverCfg.ChallengeHash, "gamma", "beta", "alpha", "zeta"), + commitmentInfo: commitmentInfo, + commitmentVal: make([]fr.Element, nbCommitments), + pi2Canonical: make([][]fr.Element, nbCommitments), + pi2DeviceReady: make([]bool, nbCommitments), + solverOpts: proverCfg.SolverOpts, + kzgFoldingHash: proverCfg.KZGFoldingHash, + htfFunc: proverCfg.HashToFieldFn, + logTime: logTime, + waitInst: waitInst, + waitMSMInst: waitMSMInst, + waitCommitInst: waitCommitInst, + } + + // Overlap CPU solve with blinding-polynomial init and Qk patching, then + // feed results into a sequential GPU pipeline. Hides the solve latency + // (~400 ms at n=2^18) behind unrelated work; recovers ~20-30% end-to-end. + chSolved := make(chan struct{}) + chBlinding := make(chan struct{}) + chQk := make(chan struct{}) + + g, gctx := errgroup.WithContext(context.Background()) + + waitCh := func(ch <-chan struct{}) error { + select { + case <-gctx.Done(): + return gctx.Err() + case <-ch: + return nil + } + } + safeGo := func(label string, fn func() error) { + g.Go(func() error { return proveStep(label, fn) }) + } + + safeGo("initGPUInstance", func() error { + if gpk.inst != nil && gpk.inst.dev == dev { + publishMSMInst(gpk.inst, nil) + publishCommitInst(gpk.inst, nil) + publishTraceInst(gpk.inst, nil) + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + msmPublished := false + commitPublished := false + tracePublished := false + inst, err := newGPUInstance(dev, gpk, spr, gpuInstanceReadyHooks{ + msm: func(inst *gpuInstance) { + msmPublished = true + publishMSMInst(inst, nil) + }, + commit: func(inst *gpuInstance) { + commitPublished = true + publishCommitInst(inst, nil) + }, + trace: func(inst *gpuInstance) { + tracePublished = true + publishTraceInst(inst, nil) + logTime("trace-ready GPU instance") + }, + }) + if err != nil { + err = fmt.Errorf("init GPU instance: %w", err) + if !msmPublished { + publishMSMInst(nil, err) + } + if !commitPublished { + publishCommitInst(nil, err) + } + if !tracePublished { + publishTraceInst(nil, err) + } + return err + } + if !msmPublished { + publishMSMInst(inst, nil) + } + if !commitPublished { + publishCommitInst(inst, nil) + } + if !tracePublished { + publishTraceInst(inst, nil) + } + logTime("init GPU instance") + return nil + }) + + safeGo("initBlinding", func() error { + p.initBlindingPolynomials() + close(chBlinding) + return nil + }) + + safeGo("solve", func() error { + if err := p.solve(spr, fullWitness); err != nil { + return err + } + logTime("solve") + close(chSolved) + return nil + }) + + safeGo("completeQk", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + p.completeQk() + close(chQk) + return nil + }) + + safeGo("pipeline", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + commitInst, err := waitCommitInst() + if err != nil { + return err + } + if err := p.commitToLRO( + commitInst, + func() error { return waitCh(chQk) }, + func() error { return waitCh(chBlinding) }, + ); err != nil { + return err + } + if _, err := p.ensureInst(); err != nil { + return err + } + if err := p.deriveGammaBeta(); err != nil { + return err + } + if err := p.buildZAndCommit(); err != nil { + return err + } + if err := p.computeQuotientAndCommit(); err != nil { + return err + } + return p.openAndFinalize() // inst.gpuWork persists (owned by gpuInstance) + }) + + if err := g.Wait(); err != nil { + return nil, err + } + + logTime("total") + result := p.proof + return &result, nil +} + +// proveStep converts a panic in fn to a labeled error so goroutines +// surface panics as normal errors through the errgroup. +func proveStep(label string, fn func() error) (err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("%s panic: %v", label, r) + } + }() + return fn() +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helper functions (ported from gpu/plonk/prove.go) +// ───────────────────────────────────────────────────────────────────────────── + +func buildZGPU( + inst *gpuInstance, gpuWork *FrVector, + evalL, evalR, evalO fr.Vector, beta, gamma fr.Element, +) (fr.Vector, error) { + dev := inst.dev + domain0 := inst.domain0 + + gpuR := inst.qWb.R + gpuO := inst.qWb.O + if inst.lowMemory { + var err error + gpuR, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z R buffer: %w", err) + } + defer gpuR.Free() + gpuO, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z O buffer: %w", err) + } + defer gpuO.Free() + } + + gpuWork.CopyFromHost(evalL) + gpuR.CopyFromHost(evalR) + gpuO.CopyFromHost(evalO) + + gMul := domain0.FrMultiplicativeGen + var gSq fr.Element + gSq.Mul(&gMul, &gMul) + + PlonkZComputeFactors(gpuWork, gpuR, gpuO, inst.dPerm, + beta, gamma, gMul, gSq, inst.log2n, inst.fftDom) + gpuR.BatchInvert(gpuO) + gpuWork.Mul(gpuWork, gpuR) + ZPrefixProduct(dev, gpuR, gpuWork, gpuO) + gpuR.CopyToHost(inst.hBufs.zLagrange) + return inst.hBufs.zLagrange, nil +} + +func computeNumeratorGPU( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + if inst.lowMemory { + return computeNumeratorGPULowMemory( + inst, gpuWork, + lBlinded, rBlinded, oBlinded, zBlinded, + qkCanonical, pi2Canonical, + alpha, beta, gamma, + ) + } + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + // Pre-allocated buffers from gpuInstance (avoids per-proof cudaMalloc/Free). + wb := &inst.qWb + gpuL, gpuR, gpuO, gpuZ := wb.L, wb.R, wb.O, wb.Z + gpuS1, gpuS2, gpuS3 := wb.S1, wb.S2, wb.S3 + gpuResult := wb.Result + gpuLCan, gpuRCan, gpuOCan, gpuZCan := wb.LCan, wb.RCan, wb.OCan, wb.ZCan + gpuCosetBlocks := wb.CosetBlock + + // Event IDs used for cross-stream synchronisation in the 4-coset loop. + const ( + evS123Done gpu.EventID = 0 // StreamTransfer → StreamCompute: S1/S2/S3 D2D done + evPermDone gpu.EventID = 1 // StreamCompute → StreamTransfer: safe to overwrite gate buffers + evCosetDone gpu.EventID = 3 // StreamCompute → StreamTransfer: full coset k done + ) + + // L/R/O/Z canonical heads were produced on-device by the iFFT phases and + // adjusted for blinding there. Keep them resident for the quotient loop. + for j := range pi2Canonical { + if j >= len(pi2DeviceReady) || pi2DeviceReady[j] { + continue + } + if j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil && len(pi2Canonical[j]) == n { + wb.Pi2Src[j].CopyFromHost(fr.Vector(pi2Canonical[j])) + pi2DeviceReady[j] = true + } + } + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + // Stream 1 must finish before overwriting gpuS1/S2/S3 with the next coset's + // selectors. PermBoundary (end of previous coset) still holds reads on S1/S2/S3. + if k > 0 { + dev.WaitEvent(gpu.StreamTransfer, evCosetDone) + } + + // Stream 1: D2D perm selectors concurrent with L/R/O/Z reduce+FFT on stream 0. + gpuS1.CopyFromDeviceStream(inst.dS1, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dS2, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dS3, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + // Stream 0: reduce blinded canonicals and FFT while D2D runs concurrently. + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + // L₁ denominator inverse: gpuWork[i] = 1/(cosetGen·ω^i - 1) + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) // result is temp; inverses stored in gpuWork + + // l1Scalar = (cosetGen^n - 1) / n = zhZeta / n at this coset + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + // Gate selectors: overlap transfer-stream D2D copies with compute-stream FFTs. + dev.RecordEvent(gpu.StreamCompute, evPermDone) + + dev.WaitEvent(gpu.StreamTransfer, evPermDone) + gpuS1.CopyFromDeviceStream(inst.dQr, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dQm, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dQo, gpu.StreamTransfer) + gpuWork.CopyFromDeviceStream(wb.QkSrc, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + gpuZ.CopyFromDevice(inst.dQl) + fftDom.CosetFFT(gpuZ, cosetGen) + + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + gpuZ.CopyFromDevice(inst.dQcp[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil { + gpuWork.CopyFromDevice(wb.Pi2Src[j]) + } else { + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + // Store the first three coset results on GPU. Keep the fourth in gpuResult. + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + dev.RecordEvent(gpu.StreamCompute, evCosetDone) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func newLowMemorySelectorCache(inst *gpuInstance, allocated *[]*FrVector) lowMemorySelectorCache { + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY_SELECTOR_CACHE") != "" { + return lowMemorySelectorCache{} + } + + upload := func(name string, data fr.Vector) *FrVector { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + log.Printf("plonk2: low-memory selector cache stopped at %s: %v", name, err) + return nil + } + *allocated = append(*allocated, v) + v.CopyFromHost(data) + return v + } + + cache := lowMemorySelectorCache{ + ql: upload("ql", inst.qlCanonical), + qr: upload("qr", inst.qrCanonical), + qm: upload("qm", inst.qmCanonical), + qo: upload("qo", inst.qoCanonical), + s1: upload("s1", inst.s1Canonical), + s2: upload("s2", inst.s2Canonical), + s3: upload("s3", inst.s3Canonical), + } + if len(inst.qcpCanonical) > 0 { + cache.qcp = make([]*FrVector, len(inst.qcpCanonical)) + for i := range inst.qcpCanonical { + cache.qcp[i] = upload(fmt.Sprintf("qcp[%d]", i), inst.qcpCanonical[i]) + } + } + + qcpCached := 0 + for i := range cache.qcp { + if cache.qcp[i] != nil { + qcpCached++ + } + } + log.Printf( + "plonk2: low-memory selector cache ql=%t qr=%t qm=%t qo=%t s1=%t s2=%t s3=%t qcp=%d/%d", + cache.ql != nil, cache.qr != nil, cache.qm != nil, cache.qo != nil, + cache.s1 != nil, cache.s2 != nil, cache.s3 != nil, + qcpCached, len(inst.qcpCanonical), + ) + return cache +} + +func computeNumeratorGPULowMemory( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + if len(qkCanonical) < n { + return nil, nil, nil, fmt.Errorf("low-memory quotient: qk canonical length %d < %d", len(qkCanonical), n) + } + + var allocated []*FrVector + alloc := func(name string) (*FrVector, error) { + v, err := NewFrVector(inst.dev, n) + if err != nil { + return nil, fmt.Errorf("alloc %s: %w", name, err) + } + allocated = append(allocated, v) + return v, nil + } + defer func() { + for _, v := range allocated { + v.Free() + } + }() + + gpuL, err := alloc("L") + if err != nil { + return nil, nil, nil, err + } + gpuR, err := alloc("R") + if err != nil { + return nil, nil, nil, err + } + gpuO, err := alloc("O") + if err != nil { + return nil, nil, nil, err + } + gpuZ, err := alloc("Z") + if err != nil { + return nil, nil, nil, err + } + gpuS1, err := alloc("S1") + if err != nil { + return nil, nil, nil, err + } + gpuS2, err := alloc("S2") + if err != nil { + return nil, nil, nil, err + } + gpuS3, err := alloc("S3") + if err != nil { + return nil, nil, nil, err + } + gpuResult, err := alloc("Result") + if err != nil { + return nil, nil, nil, err + } + gpuLCan, err := alloc("LCan") + if err != nil { + return nil, nil, nil, err + } + gpuRCan, err := alloc("RCan") + if err != nil { + return nil, nil, nil, err + } + gpuOCan, err := alloc("OCan") + if err != nil { + return nil, nil, nil, err + } + gpuZCan, err := alloc("ZCan") + if err != nil { + return nil, nil, nil, err + } + gpuQkSrc, err := alloc("QkSrc") + if err != nil { + return nil, nil, nil, err + } + var gpuCosetBlocks [3]*FrVector + for k := range gpuCosetBlocks { + gpuCosetBlocks[k], err = alloc(fmt.Sprintf("CosetBlock%d", k)) + if err != nil { + return nil, nil, nil, err + } + } + selectorCache := newLowMemorySelectorCache(inst, &allocated) + copySelector := func(dst, device *FrVector, host fr.Vector) { + if device != nil { + dst.CopyFromDevice(device) + return + } + dst.CopyFromHost(host) + } + + gpuLCan.CopyFromHost(fr.Vector(lBlinded[:n])) + gpuRCan.CopyFromHost(fr.Vector(rBlinded[:n])) + gpuOCan.CopyFromHost(fr.Vector(oBlinded[:n])) + gpuZCan.CopyFromHost(fr.Vector(zBlinded[:n])) + gpuQkSrc.CopyFromHost(fr.Vector(qkCanonical[:n])) + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + copySelector(gpuS1, selectorCache.s1, inst.s1Canonical) + copySelector(gpuS2, selectorCache.s2, inst.s2Canonical) + copySelector(gpuS3, selectorCache.s3, inst.s3Canonical) + + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) + + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + copySelector(gpuS1, selectorCache.qr, inst.qrCanonical) + copySelector(gpuS2, selectorCache.qm, inst.qmCanonical) + copySelector(gpuS3, selectorCache.qo, inst.qoCanonical) + gpuWork.CopyFromDevice(gpuQkSrc) + copySelector(gpuZ, selectorCache.ql, inst.qlCanonical) + + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + var qcpDevice *FrVector + if j < len(selectorCache.qcp) { + qcpDevice = selectorCache.qcp[j] + } + copySelector(gpuZ, qcpDevice, inst.qcpCanonical[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("low-memory quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func gpuCommit(msm *G1MSM, coeffs []fr.Element) (curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + var aff curve.G1Affine + aff.FromJacobian(&jacs[0]) + return aff, nil +} + +func gpuCommitN(msm *G1MSM, coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffSets...) + if err != nil { + return nil, err + } + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) commit(coeffs []fr.Element) (curve.G1Affine, error) { + commits, err := inst.commitN(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + return commits[0], nil +} + +func (inst *gpuInstance) commitN(coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + if inst.lowMemory { + if err := inst.reloadMSMPoints(); err != nil { + return nil, fmt.Errorf("reload MSM points: %w", err) + } + defer func() { + _ = inst.releaseMSMWorkBuffers() + _ = inst.offloadMSMPoints() + }() + } + var jacs []curve.G1Jac + var err error + if inst.splitMSM != nil { + jacs, err = MultiExpSplitBatchAt(inst.splitMSM.msm0, inst.splitMSM.msm1, inst.splitMSM.split, coeffSets...) + } else { + jacs, err = inst.msm.MultiExp(coeffSets...) + } + if err != nil { + return nil, err + } + inst.logMSMPhaseTimings(coeffSets...) + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) logMSMPhaseTimings(coeffSets ...[]fr.Element) { + if os.Getenv("GNARK_GPU_PLONK2_LOG_MSM_PHASES") == "" { + return + } + counts := make([]int, len(coeffSets)) + for i := range coeffSets { + counts[i] = len(coeffSets[i]) + } + if inst.splitMSM != nil { + primaryCounts := make([]int, len(coeffSets)) + secondaryCounts := make([]int, len(coeffSets)) + for i, count := range counts { + primaryCounts[i] = inst.splitMSM.split + if count < primaryCounts[i] { + primaryCounts[i] = count + } + secondaryCounts[i] = count - primaryCounts[i] + } + logMSMPhaseTimings(inst.n, "primary", inst.splitMSM.msm0.LastBatchPhaseTimings(), primaryCounts) + logMSMPhaseTimings(inst.n, "secondary", inst.splitMSM.msm1.LastBatchPhaseTimings(), secondaryCounts) + return + } + logMSMPhaseTimings(inst.n, "single", inst.msm.LastBatchPhaseTimings(), counts) +} + +func logMSMPhaseTimings(n int, device string, timings [][9]float32, scalarCounts []int) { + names := [...]string{ + "h2d", "build_pairs", "sort", "boundaries", "accum_seq", + "accum_par", "reduce_partial", "reduce_finalize", "d2h", + } + for i, phase := range timings { + total := float32(0) + for _, ms := range phase { + total += ms + } + scalars := 0 + if i < len(scalarCounts) { + scalars = scalarCounts[i] + } + log.Printf( + " [GPUProve n=%d] MSM phases device=%s set=%d scalars=%d total=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms", + n, device, i, scalars, total, + names[0], phase[0], names[1], phase[1], names[2], phase[2], + names[3], phase[3], names[4], phase[4], names[5], phase[5], + names[6], phase[6], names[7], phase[7], names[8], phase[8], + ) + } +} + +func gpuBatchOpen( + commit func([]fr.Element) (curve.G1Affine, error), + polys [][]fr.Element, + digests []curve.G1Affine, + claimedValues []fr.Element, + point fr.Element, + kzgFoldingHash hash.Hash, + dataTranscript []byte, +) (kzg.BatchOpeningProof, error) { + var res kzg.BatchOpeningProof + res.ClaimedValues = claimedValues + + fsGamma := fiatshamir.NewTranscript(kzgFoldingHash, "gamma") + if err := fsGamma.Bind("gamma", point.Marshal()); err != nil { + return res, err + } + for i := range digests { + if err := fsGamma.Bind("gamma", digests[i].Marshal()); err != nil { + return res, err + } + } + for i := range claimedValues { + if err := fsGamma.Bind("gamma", claimedValues[i].Marshal()); err != nil { + return res, err + } + } + if len(dataTranscript) > 0 { + if err := fsGamma.Bind("gamma", dataTranscript); err != nil { + return res, err + } + } + gammaByte, err := fsGamma.ComputeChallenge("gamma") + if err != nil { + return res, err + } + var gammaChallenge fr.Element + gammaChallenge.SetBytes(gammaByte) + + nbPolys := len(polys) + largestPoly := 0 + for _, p := range polys { + if len(p) > largestPoly { + largestPoly = len(p) + } + } + + gammas := make([]fr.Element, nbPolys) + gammas[0].SetOne() + for i := 1; i < nbPolys; i++ { + gammas[i].Mul(&gammas[i-1], &gammaChallenge) + } + + folded := make(fr.Vector, largestPoly) + nCPU := runtime.NumCPU() + chunkSize := (largestPoly + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < largestPoly; c += chunkSize { + start := c + end := start + chunkSize + if end > largestPoly { + end = largestPoly + } + wg.Add(1) + go func() { + defer wg.Done() + temp := make(fr.Vector, end-start) + for i := range nbPolys { + effEnd := end + if effEnd > len(polys[i]) { + effEnd = len(polys[i]) + } + if start >= effEnd { + continue + } + n := effEnd - start + t := fr.Vector(temp[:n]) + t.ScalarMul(fr.Vector(polys[i][start:effEnd]), &gammas[i]) + f := fr.Vector(folded[start:effEnd]) + f.Add(f, t) + } + }() + } + wg.Wait() + + var foldedEval fr.Element + for i := nbPolys - 1; i >= 0; i-- { + foldedEval.Mul(&foldedEval, &gammaChallenge).Add(&foldedEval, &claimedValues[i]) + } + folded[0].Sub(&folded[0], &foldedEval) + parallelHornerQuotient(folded, point) + h := folded[1:] + + res.H, err = commit(h) + if err != nil { + return res, err + } + return res, nil +} + +func computeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + h1, h2, h3 []fr.Element, +) []fr.Element { + n := inst.n + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + // Pre-allocated GPU buffers from gpuInstance (guaranteed non-nil after newGPUInstance). + gpuResult := inst.qWb.LinResult + gpuW := inst.qWb.LinW + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + PlonkLinearizeStatic( + gpuResult, inst.qWb.ZCan, inst.dS3, + inst.dQl, inst.dQr, inst.dQm, inst.dQo, inst.dQkFixed, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta, + ) + + for j := range qcpZeta { + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[j] != nil { + gpuW.CopyFromDevice(inst.qWb.Pi2Src[j]) + } else { + gpuW.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + gpuResult.AddScalarMul(gpuW, qcpZeta[j]) + } + + var negCoeff fr.Element + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Mul(&negCoeff, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h3[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h2[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Neg(&zhZeta) + gpuW.CopyFromHost(fr.Vector(h1[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + gpuResult.CopyToHost(fr.Vector(blindedZCanonical[:n])) + + for i := n; i < len(blindedZCanonical); i++ { + var t fr.Element + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + return blindedZCanonical +} + +func innerComputeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, + h1, h2, h3 []fr.Element, +) []fr.Element { + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + s3can := []fr.Element(inst.s3Canonical) + cql := []fr.Element(inst.qlCanonical) + cqr := []fr.Element(inst.qrCanonical) + cqm := []fr.Element(inst.qmCanonical) + cqo := []fr.Element(inst.qoCanonical) + cqk := []fr.Element(inst.qkFixedCanonical) + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + + total := len(blindedZCanonical) + nCPU := runtime.NumCPU() + chunkSize := (total + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < total; c += chunkSize { + start := c + end := start + chunkSize + if end > total { + end = total + } + wg.Add(1) + go func() { + defer wg.Done() + var t, t0, t1 fr.Element + for i := start; i < end; i++ { + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(s3can) { + t0.Mul(&s3can[i], &s1) + t.Add(&t, &t0) + } + if i < len(cqm) { + t1.Mul(&cqm[i], &rl) + t.Add(&t, &t1) + t0.Mul(&cql[i], &lZeta) + t.Add(&t, &t0) + t0.Mul(&cqr[i], &rZeta) + t.Add(&t, &t0) + t0.Mul(&cqo[i], &oZeta) + t.Add(&t, &t0) + t.Add(&t, &cqk[i]) + } + for j := range qcpZeta { + if i < len(pi2Canonical[j]) { + t0.Mul(&pi2Canonical[j][i], &qcpZeta[j]) + t.Add(&t, &t0) + } + } + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + }() + } + wg.Wait() + return blindedZCanonical +} + +// ─── Polynomial helpers ─────────────────────────────────────────────────────── + +func blindInto(dst []fr.Element, canonical []fr.Element, bp *iop.Polynomial) []fr.Element { + cbp := bp.Coefficients() + result := dst[:len(canonical)+len(cbp)] + copy(result, canonical) + copy(result[len(canonical):], cbp) + for i := 0; i < len(cbp); i++ { + result[i].Sub(&result[i], &cbp[i]) + } + return result +} + +func getRandomPolynomial(degree int) *iop.Polynomial { + coeffs := make([]fr.Element, degree+1) + for i := range coeffs { + coeffs[i].SetRandom() + } + return iop.NewPolynomial(&coeffs, iop.Form{Basis: iop.Canonical, Layout: iop.Regular}) +} + +func parallelHornerQuotient(poly []fr.Element, z fr.Element) { + n := len(poly) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + for i := n - 2; i >= 0; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + return + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + var wg sync.WaitGroup + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + wg.Add(1) + go func(lo, hi int) { + defer wg.Done() + for i := hi - 2; i >= lo; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + }(lo, hi) + } + wg.Wait() + zk := expElement(z, chunkSize) + carries := make([]fr.Element, numChunks) + for c := numChunks - 2; c >= 0; c-- { + nextLo := (c + 1) * chunkSize + nextLen := chunkSize + if nextLo+nextLen > n { + nextLen = n - nextLo + } + zkc := zk + if nextLen != chunkSize { + zkc = expElement(z, nextLen) + } + var tmp fr.Element + tmp.Mul(&carries[c+1], &zkc) + carries[c].Add(&poly[nextLo], &tmp) + } + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + if carries[c].IsZero() { + continue + } + wg.Add(1) + go func(lo, hi, c int) { + defer wg.Done() + var zPow fr.Element + zPow.Set(&z) + for i := hi - 1; i >= lo; i-- { + var corr fr.Element + corr.Mul(&zPow, &carries[c]) + poly[i].Add(&poly[i], &corr) + zPow.Mul(&zPow, &z) + } + }(lo, hi, c) + } + wg.Wait() +} + +func expElement(z fr.Element, exp int) fr.Element { + var base, acc fr.Element + base.Set(&z) + acc.SetOne() + for exp > 0 { + if exp&1 != 0 { + acc.Mul(&acc, &base) + } + base.Square(&base) + exp >>= 1 + } + return acc +} + +// ─── Fiat-Shamir helpers ────────────────────────────────────────────────────── + +func bindPublicData(fs *fiatshamir.Transcript, challenge string, vk *curplonk.VerifyingKey, publicInputs []fr.Element) error { + for _, f := range []func() []byte{ + func() []byte { return vk.S[0].Marshal() }, + func() []byte { return vk.S[1].Marshal() }, + func() []byte { return vk.S[2].Marshal() }, + func() []byte { return vk.Ql.Marshal() }, + func() []byte { return vk.Qr.Marshal() }, + func() []byte { return vk.Qm.Marshal() }, + func() []byte { return vk.Qo.Marshal() }, + func() []byte { return vk.Qk.Marshal() }, + } { + if err := fs.Bind(challenge, f()); err != nil { + return err + } + } + for i := range vk.Qcp { + if err := fs.Bind(challenge, vk.Qcp[i].Marshal()); err != nil { + return err + } + } + for i := range publicInputs { + if err := fs.Bind(challenge, publicInputs[i].Marshal()); err != nil { + return err + } + } + return nil +} + +func deriveRandomness(fs *fiatshamir.Transcript, challenge string, points ...*curve.G1Affine) (fr.Element, error) { + var buf [curve.SizeOfG1AffineUncompressed]byte + var r fr.Element + for _, p := range points { + buf = p.RawBytes() + if err := fs.Bind(challenge, buf[:]); err != nil { + return r, err + } + } + b, err := fs.ComputeChallenge(challenge) + if err != nil { + return r, err + } + r.SetBytes(b) + return r, nil +} + +func newHTF(domain []byte) hash.Hash { + return htf.New(domain) +} + +// ─── suppress unused imports ────────────────────────────────────────────────── +var _ = bits.TrailingZeros +var _ = unsafe.Pointer(nil) diff --git a/prover/gpu/plonk2/bn254/prove_stub.go b/prover/gpu/plonk2/bn254/prove_stub.go new file mode 100644 index 00000000000..6812d9e7dbe --- /dev/null +++ b/prover/gpu/plonk2/bn254/prove_stub.go @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bn254 + +import ( + "errors" + + curve "github.com/consensys/gnark-crypto/ecc/bn254" + "github.com/consensys/gnark/backend" + curplonk "github.com/consensys/gnark/backend/plonk/bn254" + "github.com/consensys/gnark/backend/witness" + cs "github.com/consensys/gnark/constraint/bn254" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +type GPUProvingKey struct { + Vk *curplonk.VerifyingKey +} + +func NewGPUProvingKey(_ []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + return &GPUProvingKey{Vk: vk} +} + +func (gpk *GPUProvingKey) Size() int { return 0 } +func (gpk *GPUProvingKey) Prepare(_ *gpu.Device, _ *cs.SparseR1CS) error { + return errors.New("gpu: cuda required") +} +func (gpk *GPUProvingKey) Close() {} + +func GPUProve(_ *gpu.Device, _ *GPUProvingKey, _ *cs.SparseR1CS, _ witness.Witness, _ ...backend.ProverOption) (*curplonk.Proof, error) { + return nil, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bw6761/cgo.go b/prover/gpu/plonk2/bw6761/cgo.go new file mode 100644 index 00000000000..de139b963c4 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/cgo.go @@ -0,0 +1,44 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#cgo LDFLAGS: -L${SRCDIR}/../../cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../../cuda/include + +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// curve returns the C curve identifier for bw6761, baked in at generation time. +func curveID() C.gnark_gpu_plonk2_curve_id_t { + return C.gnark_gpu_plonk2_curve_id_t(3) +} + +func devCtx(d *gpu.Device) C.gnark_gpu_context_t { + return C.gnark_gpu_context_t(d.Handle()) +} + +func toError(code C.gnark_gpu_error_t) error { + switch code { + case C.GNARK_GPU_SUCCESS: + return nil + case C.GNARK_GPU_ERROR_CUDA: + return &gpu.Error{Code: int(code), Message: "CUDA error"} + case C.GNARK_GPU_ERROR_INVALID_ARG: + return &gpu.Error{Code: int(code), Message: "invalid argument"} + case C.GNARK_GPU_ERROR_OUT_OF_MEMORY: + return &gpu.Error{Code: int(code), Message: "out of GPU memory"} + case C.GNARK_GPU_ERROR_SIZE_MISMATCH: + return &gpu.Error{Code: int(code), Message: "vector size mismatch"} + default: + return &gpu.Error{Code: int(code), Message: "unknown error"} + } +} diff --git a/prover/gpu/plonk2/bw6761/doc.go b/prover/gpu/plonk2/bw6761/doc.go new file mode 100644 index 00000000000..8d5dde7b380 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/doc.go @@ -0,0 +1,7 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +// Package bw6761 provides GPU-accelerated PlonK operations for the bw6761 curve. +// +// Generated from gpu/internal/generator. Do not edit by hand. +// Re-generate with: cd gpu/internal/generator && go run . +package bw6761 diff --git a/prover/gpu/plonk2/bw6761/fft.go b/prover/gpu/plonk2/bw6761/fft.go new file mode 100644 index 00000000000..390c87b87e8 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fft.go @@ -0,0 +1,211 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "math/big" + "runtime" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/fft" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain holds GPU-resident twiddle factors for NTT operations over the +// bw6761 scalar field. +// +// All NTT operations accept an optional StreamID. When provided, the operation +// is dispatched on that CUDA stream (non-blocking). When omitted, the default +// stream (stream 0) is used. +type GPUFFTDomain struct { + handle C.gnark_gpu_plonk2_ntt_domain_t + dev *gpu.Device + size int +} + +// NewFFTDomain creates a GPU NTT domain of the given size (must be a power of 2). +// +// Twiddle factors are computed using gnark-crypto's fft.Domain, then uploaded +// to GPU in AoS format. This is a one-time cost per domain size. +func NewFFTDomain(dev *gpu.Device, size int) (*GPUFFTDomain, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if size <= 0 || (size&(size-1)) != 0 { + return nil, &gpu.Error{Code: -1, Message: "size must be a positive power of 2"} + } + + domain := fft.NewDomain(uint64(size)) + halfN := size / 2 + + fwdTwiddles := make([]fr.Element, halfN) + invTwiddles := make([]fr.Element, halfN) + if halfN > 0 { + fwdTwiddles[0].SetOne() + invTwiddles[0].SetOne() + for i := 1; i < halfN; i++ { + fwdTwiddles[i].Mul(&fwdTwiddles[i-1], &domain.Generator) + invTwiddles[i].Mul(&invTwiddles[i-1], &domain.GeneratorInv) + } + } + + invN := domain.CardinalityInv + + var fwdPtr, invPtr *C.uint64_t + if halfN > 0 { + fwdPtr = (*C.uint64_t)(unsafe.Pointer(&fwdTwiddles[0])) + invPtr = (*C.uint64_t)(unsafe.Pointer(&invTwiddles[0])) + } + + var handle C.gnark_gpu_plonk2_ntt_domain_t + if err := toError(C.gnark_gpu_plonk2_ntt_domain_create( + devCtx(dev), + curveID(), + C.size_t(size), + fwdPtr, + invPtr, + (*C.uint64_t)(unsafe.Pointer(&invN)), + &handle, + )); err != nil { + return nil, err + } + + dom := &GPUFFTDomain{handle: handle, dev: dev, size: size} + runtime.SetFinalizer(dom, (*GPUFFTDomain).Close) + return dom, nil +} + +// Size returns the domain size. +func (f *GPUFFTDomain) Size() int { return f.size } + +// Close releases GPU resources. Safe to call multiple times. +func (f *GPUFFTDomain) Close() { + if f.handle != nil { + C.gnark_gpu_plonk2_ntt_domain_destroy(f.handle) + f.handle = nil + runtime.SetFinalizer(f, nil) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Forward / Inverse FFT +// ───────────────────────────────────────────────────────────────────────────── + +// FFT performs a forward NTT (DIF): natural-order input → bit-reversed output. +func (f *GPUFFTDomain) FFT(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFT size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_forward_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_forward(f.handle, v.handle)); err != nil { + panic("gpu: FFT failed: " + err.Error()) + } +} + +// FFTInverse performs an inverse NTT (DIT): bit-reversed input → natural-order output. +// The result is scaled by 1/n. +func (f *GPUFFTDomain) FFTInverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: FFTInverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_inverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_inverse(f.handle, v.handle)); err != nil { + panic("gpu: FFTInverse failed: " + err.Error()) + } +} + +// BitReverse applies the bit-reversal permutation. +func (f *GPUFFTDomain) BitReverse(v *FrVector, streams ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: BitReverse size mismatch") + } + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse_stream(f.handle, v.handle, C.int(streams[0]))); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_ntt_bit_reverse(f.handle, v.handle)); err != nil { + panic("gpu: BitReverse failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Coset FFT +// +// CosetFFT evaluates p(X) on coset g·H = {g·ω^i : i=0..n-1}. +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// ───────────────────────────────────────────────────────────────────────────── + +// CosetFFT evaluates a polynomial in canonical form on coset g·H. +// Input: v holds canonical coefficients in natural order. +// Output: v holds p(g·ω⁰), p(g·ω¹), …, p(g·ωⁿ⁻¹) in natural order. +// +// Implemented as: ScaleByPowers(g) → FFT → BitReverse. +func (f *GPUFFTDomain) CosetFFT(v *FrVector, g fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFT size mismatch") + } + v.ScaleByPowers(g, stream...) + f.FFT(v, stream...) + f.BitReverse(v, stream...) +} + +// CosetFFTInverse recovers canonical coefficients from coset evaluations. +// gInv must be the inverse of the coset generator g. +// +// Implemented as: BitReverse → FFTInverse → ScaleByPowers(gInv). +func (f *GPUFFTDomain) CosetFFTInverse(v *FrVector, gInv fr.Element, stream ...gpu.StreamID) { + if v.n != f.size { + panic("gpu: CosetFFTInverse size mismatch") + } + f.BitReverse(v, stream...) + f.FFTInverse(v, stream...) + v.ScaleByPowers(gInv, stream...) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Butterfly4Inverse — decomposed iFFT(4n) for quotient computation +// ───────────────────────────────────────────────────────────────────────────── + +// Butterfly4Inverse applies a size-4 inverse DFT butterfly across 4 FrVectors. +// +// omega4Inv: inverse of the primitive 4th root of unity. +// quarter: 1/4 in Montgomery form. +func Butterfly4Inverse(b0, b1, b2, b3 *FrVector, omega4Inv, quarter fr.Element) { + if b0.n != b1.n || b1.n != b2.n || b2.n != b3.n { + panic("gpu: Butterfly4Inverse size mismatch") + } + if b0.dev != b1.dev || b1.dev != b2.dev || b2.dev != b3.dev { + panic("gpu: Butterfly4Inverse device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_butterfly4_inverse( + devCtx(b0.dev), + b0.handle, b1.handle, b2.handle, b3.handle, + (*C.uint64_t)(unsafe.Pointer(&omega4Inv)), + (*C.uint64_t)(unsafe.Pointer(&quarter)), + )); err != nil { + panic("gpu: Butterfly4Inverse failed: " + err.Error()) + } +} + +// ─── suppress unused import ─────────────────────────────────────────────────── +var _ = big.NewInt diff --git a/prover/gpu/plonk2/bw6761/fft_stub.go b/prover/gpu/plonk2/bw6761/fft_stub.go new file mode 100644 index 00000000000..49d987c4028 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fft_stub.go @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bw6761 + +import ( + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// GPUFFTDomain is a stub for non-CUDA builds. +type GPUFFTDomain struct{} + +func NewFFTDomain(_ *gpu.Device, _ int) (*GPUFFTDomain, error) { + return nil, gpu.ErrDeviceClosed +} + +func (f *GPUFFTDomain) Size() int { return 0 } +func (f *GPUFFTDomain) Close() {} +func (f *GPUFFTDomain) FFT(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) FFTInverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) BitReverse(_ *FrVector, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFT(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (f *GPUFFTDomain) CosetFFTInverse(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} + +func Butterfly4Inverse(_, _, _, _ *FrVector, _, _ fr.Element) { panic("gpu: cuda required") } diff --git a/prover/gpu/plonk2/bw6761/fft_test.go b/prover/gpu/plonk2/bw6761/fft_test.go new file mode 100644 index 00000000000..efc45ddafec --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fft_test.go @@ -0,0 +1,188 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761_test + +import ( + "fmt" + "testing" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/fft" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bw6761" + "github.com/stretchr/testify/require" +) + +func newDomain(t testing.TB, dev *gpu.Device, size int) *bw6761.GPUFFTDomain { + t.Helper() + dom, err := bw6761.NewFFTDomain(dev, size) + require.NoError(t, err) + t.Cleanup(func() { dom.Close() }) + return dom +} + +// TestFFTRoundtrip verifies FFT(FFTInverse(v)) == v. +func TestFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16, 20} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + gV := newGPUVec(t, dev, orig) + + dom.FFT(gV) + dom.FFTInverse(gV) + dom.BitReverse(gV) // FFTInverse expects bit-reversed input; FFT output is bit-reversed + dev.Sync() + + // Actually test FFTInverse(FFT(v)) == v: + // FFT: natural → bit-reversed + // FFTInverse: bit-reversed → natural (scaled by 1/n) + // So we need FFTInverse after FFT directly. + gV2 := newGPUVec(t, dev, orig) + dom.FFT(gV2) + dom.FFTInverse(gV2) + dev.Sync() + + result := make(fr.Vector, n) + gV2.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "FFTInverse(FFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestCosetFFTRoundtrip verifies CosetFFT(CosetFFTInverse(v)) == v. +func TestCosetFFTRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + for _, logN := range []int{10, 16} { + n := 1 << logN + t.Run(fmt.Sprintf("n=2^%d", logN), func(t *testing.T) { + dom := newDomain(t, dev, n) + orig := randFrVec(n) + + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + var gInv fr.Element + gInv.Inverse(&g) + + gV := newGPUVec(t, dev, orig) + dom.CosetFFT(gV, g) + dom.CosetFFTInverse(gV, gInv) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + for i := range orig { + require.True(t, orig[i].Equal(&result[i]), + "CosetFFTInverse(CosetFFT(v)) mismatch at i=%d (n=%d)", i, n) + } + }) + } +} + +// TestFFTMatchesCPU verifies GPU FFT output matches gnark-crypto CPU FFT. +func TestFFTMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const logN = 14 + n := 1 << logN + + dom := newDomain(t, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + + orig := randFrVec(n) + cpuCopy := make(fr.Vector, n) + copy(cpuCopy, orig) + + // CPU FFT + cpuDom.FFT(cpuCopy, fft.DIF) + fft.BitReverse(cpuCopy) + + // GPU FFT (DIF: natural → bit-reversed, then BitReverse → natural) + gV := newGPUVec(t, dev, orig) + dom.FFT(gV) // natural → bit-reversed + dom.BitReverse(gV) // bit-reversed → natural + dev.Sync() + + gpuResult := make(fr.Vector, n) + gV.CopyToHost(gpuResult) + + for i := range cpuCopy { + require.True(t, cpuCopy[i].Equal(&gpuResult[i]), + "FFT mismatch at i=%d", i) + } +} + +// BenchmarkFFTForward benchmarks GPU forward NTT. +func BenchmarkFFTForward(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFT(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkFFTInverse benchmarks GPU inverse NTT. +func BenchmarkFFTInverse(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + dom.FFT(gV) // put into bit-reversed form first + dev.Sync() + b.ResetTimer() + for i := 0; i < b.N; i++ { + dom.FFTInverse(gV) + dev.Sync() + } + }) + } +} + +// BenchmarkCosetFFT benchmarks GPU coset FFT. +func BenchmarkCosetFFT(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + dom := newDomain(b, dev, n) + cpuDom := fft.NewDomain(uint64(n)) + g := cpuDom.FrMultiplicativeGen + src := randFrVec(n) + gV := newGPUVec(b, dev, src) + b.ResetTimer() + for i := 0; i < b.N; i++ { + // Re-upload canonical coefficients before each run + gV.CopyFromHost(src) + dom.CosetFFT(gV, g) + dev.Sync() + } + }) + } +} diff --git a/prover/gpu/plonk2/bw6761/fr.go b/prover/gpu/plonk2/bw6761/fr.go new file mode 100644 index 00000000000..56b6e808e57 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fr.go @@ -0,0 +1,270 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "runtime" + "sync" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector holds a vector of bw6761 scalar-field (Fr) elements on the GPU +// in Structure-of-Arrays (SoA) layout for coalesced memory access. +// +// All elements are in Montgomery form. GPU memory is SoA by limb; host memory +// uses gnark-crypto AoS Montgomery layout. +// +// All operations accept an optional gpu.StreamID. When omitted, the default +// stream (stream 0) is used. +type FrVector struct { + handle C.gnark_gpu_plonk2_fr_vector_t + dev *gpu.Device + n int +} + +var hostTransferMu sync.Mutex + +// NewFrVector allocates GPU memory for n Fr elements on dev. +// A finalizer is installed; call Free for deterministic VRAM release. +func NewFrVector(dev *gpu.Device, n int) (*FrVector, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if n <= 0 { + return nil, &gpu.Error{Code: -1, Message: "count must be positive"} + } + + var handle C.gnark_gpu_plonk2_fr_vector_t + if err := toError(C.gnark_gpu_plonk2_fr_vector_alloc( + devCtx(dev), curveID(), C.size_t(n), &handle, + )); err != nil { + return nil, err + } + + v := &FrVector{handle: handle, dev: dev, n: n} + runtime.SetFinalizer(v, (*FrVector).Free) + return v, nil +} + +// Free releases GPU memory. Safe to call multiple times. +func (v *FrVector) Free() { + if v.handle != nil { + v.bind() + C.gnark_gpu_plonk2_fr_vector_free(v.handle) + v.handle = nil + runtime.SetFinalizer(v, nil) + } +} + +// Len returns the number of elements. +func (v *FrVector) Len() int { return v.n } + +func (v *FrVector) bind() { + if err := v.dev.Bind(); err != nil { + panic("gpu: bind device failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Host ↔ Device transfers +// ───────────────────────────────────────────────────────────────────────────── + +// CopyFromHost copies host data (AoS) to GPU (SoA). Panics on size mismatch. +func (v *FrVector) CopyFromHost(src fr.Vector, _ ...gpu.StreamID) { + if len(src) != v.n { + panic("gpu: CopyFromHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_device( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&src[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyFromHost failed: " + err.Error()) + } +} + +// CopyToHost copies GPU data (SoA) back to host (AoS). Panics on size mismatch. +func (v *FrVector) CopyToHost(dst fr.Vector, _ ...gpu.StreamID) { + if len(dst) != v.n { + panic("gpu: CopyToHost size mismatch") + } + v.bind() + hostTransferMu.Lock() + defer hostTransferMu.Unlock() + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_to_host( + v.handle, + (*C.uint64_t)(unsafe.Pointer(&dst[0])), + C.size_t(v.n), + )); err != nil { + panic("gpu: CopyToHost failed: " + err.Error()) + } +} + +// CopyFromDevice copies src to v (GPU-to-GPU). Panics on size or device mismatch. +func (v *FrVector) CopyFromDevice(src *FrVector, _ ...gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDevice size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDevice device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d( + devCtx(v.dev), v.handle, src.handle, + )); err != nil { + panic("gpu: CopyFromDevice failed: " + err.Error()) + } +} + +// CopyFromDeviceStream copies src to v (GPU-to-GPU) on a specific stream. +// Panics on size or device mismatch. +func (v *FrVector) CopyFromDeviceStream(src *FrVector, streamID gpu.StreamID) { + if v.n != src.n { + panic("gpu: CopyFromDeviceStream size mismatch") + } + if v.dev != src.dev { + panic("gpu: CopyFromDeviceStream device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_copy_d2d_stream( + devCtx(v.dev), v.handle, src.handle, C.int(streamID), + )); err != nil { + panic("gpu: CopyFromDeviceStream failed: " + err.Error()) + } +} + +// SetZero sets all elements to zero. +func (v *FrVector) SetZero(_ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_set_zero( + devCtx(v.dev), v.handle, + )); err != nil { + panic("gpu: SetZero failed: " + err.Error()) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// Element-wise arithmetic (all async on the default stream) +// ───────────────────────────────────────────────────────────────────────────── + +func mustSameDeviceAndSize(v, a, b *FrVector) { + if v.n != a.n || a.n != b.n { + panic("gpu: vector size mismatch") + } + if v.dev != a.dev || a.dev != b.dev { + panic("gpu: vectors from different devices") + } +} + +// Mul computes v[i] = a[i] · b[i] (mod r). +func (v *FrVector) Mul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_mul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Mul failed: " + err.Error()) + } +} + +// Add computes v[i] = a[i] + b[i] (mod r). +func (v *FrVector) Add(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_add( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Add failed: " + err.Error()) + } +} + +// Sub computes v[i] = a[i] - b[i] (mod r). +func (v *FrVector) Sub(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_sub( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: Sub failed: " + err.Error()) + } +} + +// AddMul computes v[i] += a[i] · b[i] (mod r). +func (v *FrVector) AddMul(a, b *FrVector, _ ...gpu.StreamID) { + mustSameDeviceAndSize(v, a, b) + if err := toError(C.gnark_gpu_plonk2_fr_vector_addmul( + devCtx(v.dev), v.handle, a.handle, b.handle, + )); err != nil { + panic("gpu: AddMul failed: " + err.Error()) + } +} + +// AddScalarMul computes v[i] += a[i] · scalar (mod r). +func (v *FrVector) AddScalarMul(a *FrVector, scalar fr.Element, _ ...gpu.StreamID) { + if v.n != a.n { + panic("gpu: AddScalarMul size mismatch") + } + if v.dev != a.dev { + panic("gpu: AddScalarMul device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_add_scalar_mul( + devCtx(v.dev), v.handle, a.handle, + (*C.uint64_t)(unsafe.Pointer(&scalar)), + )); err != nil { + panic("gpu: AddScalarMul failed: " + err.Error()) + } +} + +// ScalarMul computes v[i] *= c (mod r) for all i. +func (v *FrVector) ScalarMul(c fr.Element, _ ...gpu.StreamID) { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scalar_mul( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&c)), + )); err != nil { + panic("gpu: ScalarMul failed: " + err.Error()) + } +} + +// ScaleByPowers computes v[i] *= g^i for i in [0, n). +// Used for coset FFT shifting. +func (v *FrVector) ScaleByPowers(g fr.Element, streams ...gpu.StreamID) { + if len(streams) > 0 { + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers_stream( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + C.int(streams[0]), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } + return + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_scale_by_powers( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&g)), + )); err != nil { + panic("gpu: ScaleByPowers failed: " + err.Error()) + } +} + +// BatchInvert computes v[i] = 1/v[i] using Montgomery batch inversion. +// temp must be a separate FrVector of the same size used as scratch space. +func (v *FrVector) BatchInvert(temp *FrVector, _ ...gpu.StreamID) { + if v.n != temp.n { + panic("gpu: BatchInvert size mismatch") + } + if v.dev != temp.dev { + panic("gpu: BatchInvert device mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_batch_invert( + devCtx(v.dev), v.handle, temp.handle, + )); err != nil { + panic("gpu: BatchInvert failed: " + err.Error()) + } +} diff --git a/prover/gpu/plonk2/bw6761/fr_stub.go b/prover/gpu/plonk2/bw6761/fr_stub.go new file mode 100644 index 00000000000..ab464abe2d0 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fr_stub.go @@ -0,0 +1,37 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bw6761 + +import ( + "errors" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// FrVector is a stub for non-CUDA builds. +type FrVector struct{} + +func NewFrVector(_ *gpu.Device, _ int) (*FrVector, error) { + return nil, errors.New("gpu: cuda required") +} + +func (v *FrVector) Free() {} +func (v *FrVector) Len() int { return 0 } +func (v *FrVector) CopyFromHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyToHost(_ fr.Vector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDevice(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) CopyFromDeviceStream(_ *FrVector, _ gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) SetZero(_ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Mul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Add(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) Sub(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddMul(_, _ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) AddScalarMul(_ *FrVector, _ fr.Element, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func (v *FrVector) ScalarMul(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) ScaleByPowers(_ fr.Element, _ ...gpu.StreamID) { panic("gpu: cuda required") } +func (v *FrVector) BatchInvert(_ *FrVector, _ ...gpu.StreamID) { panic("gpu: cuda required") } diff --git a/prover/gpu/plonk2/bw6761/fr_test.go b/prover/gpu/plonk2/bw6761/fr_test.go new file mode 100644 index 00000000000..454e3ff2707 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fr_test.go @@ -0,0 +1,275 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761_test + +import ( + "fmt" + "testing" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bw6761" + "github.com/leanovate/gopter" + "github.com/leanovate/gopter/prop" + "github.com/stretchr/testify/require" +) + +func requireGPUDev(t testing.TB) *gpu.Device { + t.Helper() + dev, err := gpu.New() + require.NoError(t, err) + t.Cleanup(func() { dev.Close() }) + return dev +} + +func genFrElem() gopter.Gen { + return func(_ *gopter.GenParameters) *gopter.GenResult { + var e fr.Element + e.MustSetRandom() + return gopter.NewGenResult(e, gopter.NoShrinker) + } +} + +func randFrVec(n int) fr.Vector { + v := make(fr.Vector, n) + for i := range v { + v[i].MustSetRandom() + } + return v +} + +func newGPUVec(t testing.TB, dev *gpu.Device, data fr.Vector) *bw6761.FrVector { + t.Helper() + gv, err := bw6761.NewFrVector(dev, len(data)) + require.NoError(t, err) + t.Cleanup(func() { gv.Free() }) + gv.CopyFromHost(data) + dev.Sync() + return gv +} + +// TestFrVectorRoundtrip verifies CopyFromHost → CopyToHost is identity. +func TestFrVectorRoundtrip(t *testing.T) { + dev := requireGPUDev(t) + const n = 1024 + src := randFrVec(n) + gv := newGPUVec(t, dev, src) + dst := make(fr.Vector, n) + gv.CopyToHost(dst) + for i := range src { + require.True(t, src[i].Equal(&dst[i]), "mismatch at %d", i) + } +} + +// TestFrVectorAddCommutative checks GPU Add(a,b) == GPU Add(b,a). +func TestFrVectorAddCommutative(t *testing.T) { + dev := requireGPUDev(t) + parameters := gopter.DefaultTestParameters() + parameters.MinSuccessfulTests = 50 + properties := gopter.NewProperties(parameters) + + properties.Property("Add is commutative", prop.ForAll( + func(a, b fr.Element) bool { + n := 16 + aVec := make(fr.Vector, n) + bVec := make(fr.Vector, n) + for i := range aVec { + aVec[i] = a + bVec[i] = b + } + + gA, err := bw6761.NewFrVector(dev, n) + if err != nil { + return false + } + gB, _ := bw6761.NewFrVector(dev, n) + gAB, _ := bw6761.NewFrVector(dev, n) + gBA, _ := bw6761.NewFrVector(dev, n) + defer gA.Free() + defer gB.Free() + defer gAB.Free() + defer gBA.Free() + + gA.CopyFromHost(aVec) + gB.CopyFromHost(bVec) + gAB.Add(gA, gB) + gBA.Add(gB, gA) + dev.Sync() + + ab := make(fr.Vector, n) + ba := make(fr.Vector, n) + gAB.CopyToHost(ab) + gBA.CopyToHost(ba) + for i := range ab { + if !ab[i].Equal(&ba[i]) { + return false + } + } + return true + }, + genFrElem(), genFrElem(), + )) + properties.TestingRun(t, gopter.ConsoleReporter(false)) +} + +// TestFrVectorBatchInvert verifies v[i] * inv(v[i]) == 1. +func TestFrVectorBatchInvert(t *testing.T) { + dev := requireGPUDev(t) + const n = 256 + + orig := make(fr.Vector, n) + for i := range orig { + orig[i].MustSetRandom() + if orig[i].IsZero() { + orig[i].SetOne() + } + } + + gV := newGPUVec(t, dev, orig) + gTemp, err := bw6761.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + inv := make(fr.Vector, n) + gV.CopyToHost(inv) + + var one fr.Element + one.SetOne() + for i := range orig { + var product fr.Element + product.Mul(&orig[i], &inv[i]) + require.True(t, product.Equal(&one), "BatchInvert: v[%d]*inv[%d] != 1", i, i) + } +} + +// TestFrVectorScaleByPowers checks GPU ScaleByPowers matches CPU loop. +func TestFrVectorScaleByPowers(t *testing.T) { + dev := requireGPUDev(t) + const n = 512 + + var omega fr.Element + omega.MustSetRandom() + + ones := make(fr.Vector, n) + for i := range ones { + ones[i].SetOne() + } + + gV := newGPUVec(t, dev, ones) + gV.ScaleByPowers(omega) + dev.Sync() + + result := make(fr.Vector, n) + gV.CopyToHost(result) + + expected := make(fr.Vector, n) + expected[0].SetOne() + for i := 1; i < n; i++ { + expected[i].Mul(&expected[i-1], &omega) + } + + for i := range result { + require.True(t, result[i].Equal(&expected[i]), "ScaleByPowers mismatch at %d", i) + } +} + +// TestFrVectorBatchInvertMatchesCPU verifies BatchInvert matches scalar CPU inversion. +func TestFrVectorBatchInvertMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + const n = 128 + + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + + cpuInv := make(fr.Vector, n) + for i := range src { + cpuInv[i].Inverse(&src[i]) + } + + gV := newGPUVec(t, dev, src) + gTemp, err := bw6761.NewFrVector(dev, n) + require.NoError(t, err) + defer gTemp.Free() + + gV.BatchInvert(gTemp) + dev.Sync() + + gpuInv := make(fr.Vector, n) + gV.CopyToHost(gpuInv) + + for i := range cpuInv { + require.True(t, cpuInv[i].Equal(&gpuInv[i]), + "BatchInvert mismatch at %d", i) + } +} + +// BenchmarkFrVectorAdd benchmarks GPU element-wise addition. +func BenchmarkFrVectorAdd(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20, 1 << 22} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + gA := newGPUVec(b, dev, src) + gB := newGPUVec(b, dev, src) + gC, _ := bw6761.NewFrVector(dev, n) + defer gC.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gC.Add(gA, gB) + dev.Sync() + } + }) + } +} + +// BenchmarkFrVectorBatchInvert benchmarks GPU batch inversion. +func BenchmarkFrVectorBatchInvert(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, n := range []int{1 << 14, 1 << 18, 1 << 20} { + n := n + b.Run(fmt.Sprintf("n=%s", fmtPow2(n)), func(b *testing.B) { + src := randFrVec(n) + for i := range src { + if src[i].IsZero() { + src[i].SetOne() + } + } + gV := newGPUVec(b, dev, src) + gTemp, _ := bw6761.NewFrVector(dev, n) + defer gTemp.Free() + b.ResetTimer() + for i := 0; i < b.N; i++ { + gV.CopyFromHost(src) + gV.BatchInvert(gTemp) + dev.Sync() + } + }) + } +} + +func fmtPow2(n int) string { + switch { + case n >= 1<<20: + return fmt.Sprintf("%dM", n>>20) + case n >= 1<<10: + return fmt.Sprintf("%dK", n>>10) + default: + return fmt.Sprintf("%d", n) + } +} diff --git a/prover/gpu/plonk2/bw6761/kernels.go b/prover/gpu/plonk2/bw6761/kernels.go new file mode 100644 index 00000000000..7c18ca17e1e --- /dev/null +++ b/prover/gpu/plonk2/bw6761/kernels.go @@ -0,0 +1,316 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#include "gnark_gpu.h" +#include +*/ +import "C" + +import ( + "math/big" + "runtime" + "sync" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// ZPrefixProduct computes Z[i] = product(ratio[0..i-1]) on GPU with CPU chunk scan. +func ZPrefixProduct(dev *gpu.Device, zVec, ratioVec, tempVec *FrVector) { + if zVec.n != ratioVec.n || zVec.n != tempVec.n { + panic("gpu: ZPrefixProduct size mismatch") + } + n := ratioVec.n + maxChunks := (n + 1023) / 1024 + cpHost := make([]uint64, maxChunks*6) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase1( + devCtx(dev), zVec.handle, ratioVec.handle, + (*C.uint64_t)(unsafe.Pointer(&cpHost[0])), &numChunks, + )); err != nil { + panic("gpu: ZPrefixProduct phase1 failed: " + err.Error()) + } + + nc := int(numChunks) + spHost := make([]uint64, nc*6) + copy(spHost[:6], cpHost[:6]) + for i := 1; i < nc; i++ { + prev := *(*fr.Element)(unsafe.Pointer(&spHost[(i-1)*6])) + cur := *(*fr.Element)(unsafe.Pointer(&cpHost[i*6])) + var prod fr.Element + prod.Mul(&prev, &cur) + *(*fr.Element)(unsafe.Pointer(&spHost[i*6])) = prod + } + + if err := toError(C.gnark_gpu_plonk2_z_prefix_phase3( + devCtx(dev), zVec.handle, tempVec.handle, + (*C.uint64_t)(unsafe.Pointer(&spHost[0])), C.size_t(nc), + )); err != nil { + panic("gpu: ZPrefixProduct phase3 failed: " + err.Error()) + } +} + +// PlonkZComputeFactors computes per-element Z ratio factors on GPU. +// On exit L contains numerators, R contains denominators. +func PlonkZComputeFactors( + L, R, O *FrVector, dPerm unsafe.Pointer, + beta, gamma, gMul, gSq fr.Element, + log2n uint, domain *GPUFFTDomain, +) { + n := L.n + if R.n != n || O.n != n || domain.size != n { + panic("gpu: PlonkZComputeFactors size mismatch") + } + params := [4]fr.Element{beta, gamma, gMul, gSq} + if err := toError(C.gnark_gpu_plonk2_z_compute_factors( + devCtx(L.dev), L.handle, R.handle, O.handle, + dPerm, (*C.uint64_t)(unsafe.Pointer(¶ms[0])), + C.uint(log2n), domain.handle, + )); err != nil { + panic("gpu: PlonkZComputeFactors failed: " + err.Error()) + } +} + +// PlonkGateAccum computes the fused gate constraint accumulation. +func PlonkGateAccum(result, Ql, Qr, Qm, Qo, Qk, L, R, O *FrVector, zhKInv fr.Element) { + n := result.n + if Ql.n != n || Qr.n != n || Qm.n != n || Qo.n != n || Qk.n != n || + L.n != n || R.n != n || O.n != n { + panic("gpu: PlonkGateAccum size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_gate_accum( + devCtx(result.dev), + result.handle, Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + L.handle, R.handle, O.handle, + (*C.uint64_t)(unsafe.Pointer(&zhKInv)), + )); err != nil { + panic("gpu: PlonkGateAccum failed: " + err.Error()) + } +} + +// PlonkLinearizeStatic computes the fixed-selector part of the linearized polynomial. +func PlonkLinearizeStatic( + result, Z, S3, Ql, Qr, Qm, Qo, Qk *FrVector, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta fr.Element, +) { + n := result.n + if Z.n != n || S3.n != n || Ql.n != n || Qr.n != n || Qm.n != n || + Qo.n != n || Qk.n != n { + panic("gpu: PlonkLinearizeStatic size mismatch") + } + scalars := [6]fr.Element{combinedZCoeff, s1, lZeta, rZeta, rl, oZeta} + if err := toError(C.gnark_gpu_plonk2_linearize_static( + devCtx(result.dev), + result.handle, Z.handle, S3.handle, + Ql.handle, Qr.handle, Qm.handle, Qo.handle, Qk.handle, + (*C.uint64_t)(unsafe.Pointer(&scalars[0])), + )); err != nil { + panic("gpu: PlonkLinearizeStatic failed: " + err.Error()) + } +} + +// PlonkPermBoundary computes the fused permutation + boundary constraint. +func PlonkPermBoundary( + result, L, R, O, Z, S1, S2, S3, L1DenInv *FrVector, + alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen fr.Element, + domain *GPUFFTDomain, _ ...gpu.StreamID, +) { + n := result.n + if L.n != n || R.n != n || O.n != n || Z.n != n || + S1.n != n || S2.n != n || S3.n != n || L1DenInv.n != n || domain.size != n { + panic("gpu: PlonkPermBoundary size mismatch") + } + params := [7]fr.Element{alpha, beta, gamma, l1Scalar, cosetShift, cosetShiftSq, cosetGen} + if err := toError(C.gnark_gpu_plonk2_perm_boundary( + devCtx(result.dev), + result.handle, L.handle, R.handle, O.handle, Z.handle, + S1.handle, S2.handle, S3.handle, L1DenInv.handle, + (*C.uint64_t)(unsafe.Pointer(¶ms[0])), domain.handle, + )); err != nil { + panic("gpu: PlonkPermBoundary failed: " + err.Error()) + } +} + +// ComputeL1Den computes out[i] = cosetGen·ω^i - 1 for all i. +func ComputeL1Den(out *FrVector, cosetGen fr.Element, domain *GPUFFTDomain, _ ...gpu.StreamID) { + if domain.size != out.n { + panic("gpu: ComputeL1Den domain size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_compute_l1_den( + domain.handle, out.handle, + (*C.uint64_t)(unsafe.Pointer(&cosetGen)), + )); err != nil { + panic("gpu: ComputeL1Den failed: " + err.Error()) + } +} + +// ReduceBlindedCoset reduces a blinded polynomial for coset evaluation on GPU. +func ReduceBlindedCoset(dst, src *FrVector, tail []fr.Element, cosetPowN fr.Element) { + if dst.n != src.n { + panic("gpu: ReduceBlindedCoset size mismatch") + } + var tailPtr *C.uint64_t + if len(tail) > 0 { + tailPtr = (*C.uint64_t)(unsafe.Pointer(&tail[0])) + } + if err := toError(C.gnark_gpu_plonk2_reduce_blinded_coset( + devCtx(dst.dev), dst.handle, src.handle, + tailPtr, C.size_t(len(tail)), + (*C.uint64_t)(unsafe.Pointer(&cosetPowN)), + )); err != nil { + panic("gpu: ReduceBlindedCoset failed: " + err.Error()) + } +} + +// SubtractBlindingHead subtracts tail[i] from v[i] for the blinding tail. +func SubtractBlindingHead(v *FrVector, tail []fr.Element) { + if len(tail) == 0 { + return + } + if len(tail) > v.n { + panic("gpu: SubtractBlindingHead size mismatch") + } + if err := toError(C.gnark_gpu_plonk2_fr_vector_subtract_head( + devCtx(v.dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&tail[0])), + C.size_t(len(tail)), + )); err != nil { + panic("gpu: SubtractBlindingHead failed: " + err.Error()) + } +} + +// DeviceAllocCopyInt64 uploads an int64 slice to GPU device memory. +func DeviceAllocCopyInt64(dev *gpu.Device, data []int64) (unsafe.Pointer, error) { + var dPtr unsafe.Pointer + if err := toError(C.gnark_gpu_device_alloc_copy_int64( + devCtx(dev), + (*C.int64_t)(unsafe.Pointer(&data[0])), + C.size_t(len(data)), + &dPtr, + )); err != nil { + return nil, err + } + return dPtr, nil +} + +// DeviceFreePtr frees device memory allocated by DeviceAllocCopyInt64. +func DeviceFreePtr(ptr unsafe.Pointer) { + if ptr != nil { + C.gnark_gpu_device_free_ptr(ptr) + } +} + +// PolyEvalGPU evaluates a GPU-resident polynomial at z using chunked Horner on +// device and a small CPU combine over chunk partials. +func PolyEvalGPU(dev *gpu.Device, v *FrVector, z fr.Element) fr.Element { + n := v.n + if n == 0 { + return fr.Element{} + } + + maxChunks := (n + 1023) / 1024 + partialsHost := make([]uint64, maxChunks*6) + var numChunks C.size_t + + if err := toError(C.gnark_gpu_plonk2_poly_eval_chunks( + devCtx(dev), v.handle, + (*C.uint64_t)(unsafe.Pointer(&z)), + (*C.uint64_t)(unsafe.Pointer(&partialsHost[0])), + &numChunks, + )); err != nil { + panic("gpu: PolyEvalGPU failed: " + err.Error()) + } + + return combinePolyEvalPartials(partialsHost, int(numChunks), z) +} + +// PolyEvalFromDevice downloads a GPU FrVector and evaluates at z using CPU Horner. +func PolyEvalFromDevice(v *FrVector, z fr.Element) fr.Element { + n := v.n + coeffs := make(fr.Vector, n) + v.CopyToHost(coeffs) + return polyEvalParallel(coeffs, z) +} + +func combinePolyEvalPartials(partialsHost []uint64, numChunks int, z fr.Element) fr.Element { + if numChunks == 0 { + return fr.Element{} + } + readPartial := func(chunk int) fr.Element { + var r fr.Element + for limb := range r { + r[limb] = partialsHost[chunk*6+limb] + } + return r + } + if numChunks == 1 { + return readPartial(0) + } + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(1024)) + result := readPartial(numChunks - 1) + for j := numChunks - 2; j >= 0; j-- { + p := readPartial(j) + result.Mul(&result, &zChunk).Add(&result, &p) + } + return result +} + +// polyEvalParallel evaluates p(z) = Σ c[i]·z^i using multi-core Horner. +func polyEvalParallel(coeffs []fr.Element, z fr.Element) fr.Element { + n := len(coeffs) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + return hornerEval(coeffs, z) + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + partials := make([]fr.Element, numChunks) + var wg sync.WaitGroup + for c := range numChunks { + start := c * chunkSize + if start >= n { + break + } + end := start + chunkSize + if end > n { + end = n + } + wg.Add(1) + go func(idx, s, e int) { + defer wg.Done() + partials[idx] = hornerEval(coeffs[s:e], z) + }(c, start, end) + } + wg.Wait() + + var zChunk fr.Element + zChunk.Exp(z, big.NewInt(int64(chunkSize))) + var result, zPow fr.Element + zPow.SetOne() + for c := range numChunks { + if c*chunkSize >= n { + break + } + var t fr.Element + t.Mul(&partials[c], &zPow) + result.Add(&result, &t) + zPow.Mul(&zPow, &zChunk) + } + return result +} + +func hornerEval(coeffs []fr.Element, z fr.Element) fr.Element { + var r fr.Element + for i := len(coeffs) - 1; i >= 0; i-- { + r.Mul(&r, &z).Add(&r, &coeffs[i]) + } + return r +} diff --git a/prover/gpu/plonk2/bw6761/kernels_stub.go b/prover/gpu/plonk2/bw6761/kernels_stub.go new file mode 100644 index 00000000000..5366c4d5373 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/kernels_stub.go @@ -0,0 +1,36 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bw6761 + +import ( + "errors" + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +func ZPrefixProduct(_ *gpu.Device, _, _, _ *FrVector) { panic("gpu: cuda required") } +func PlonkZComputeFactors(_, _, _ *FrVector, _ unsafe.Pointer, _, _, _, _ fr.Element, _ uint, _ *GPUFFTDomain) { + panic("gpu: cuda required") +} +func PlonkGateAccum(_, _, _, _, _, _, _, _, _ *FrVector, _ fr.Element) { panic("gpu: cuda required") } +func PlonkPermBoundary(_, _, _, _, _, _, _, _, _ *FrVector, _, _, _, _, _, _, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ComputeL1Den(_ *FrVector, _ fr.Element, _ *GPUFFTDomain, _ ...gpu.StreamID) { + panic("gpu: cuda required") +} +func ReduceBlindedCoset(_, _ *FrVector, _ []fr.Element, _ fr.Element) { panic("gpu: cuda required") } +func DeviceAllocCopyInt64(_ *gpu.Device, _ []int64) (unsafe.Pointer, error) { + return nil, errors.New("gpu: cuda required") +} +func DeviceFreePtr(_ unsafe.Pointer) {} +func PolyEvalGPU(_ *gpu.Device, _ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} +func PolyEvalFromDevice(_ *FrVector, _ fr.Element) fr.Element { + panic("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bw6761/msm.go b/prover/gpu/plonk2/bw6761/msm.go new file mode 100644 index 00000000000..fcd61893d6b --- /dev/null +++ b/prover/gpu/plonk2/bw6761/msm.go @@ -0,0 +1,388 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "fmt" + "log" + "math/big" + "os" + "runtime" + "strconv" + "unsafe" + + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// frRInv is R^{-1} mod r where R = 2^{FrLimbs*64} (the Fr Montgomery constant). +// The GPU MSM uses Montgomery-form scalars without fr_from_mont, so the result +// is R * correct_result. Multiplying by frRInv corrects this. +var frRInv big.Int + +func init() { + var rInv fr.Element + rInv[0] = 1 // Montgomery representation of R^{-1}: stores R^{-1} mod r + rInv.BigInt(&frRInv) +} + +// msmDefaultWindowBits selects the Pippenger window size for n points. +func msmDefaultWindowBits(n int) int { + switch { + case n >= 1<<22: + return 18 + case n > 1<<18: + return 15 + case n > 1<<12: + return 13 + default: + return 11 + } +} + +// G1MSM holds a GPU MSM context with uploaded affine base points. +// +// Points are uploaded once at construction. The context supports multiple +// MultiExp calls sharing the same base points. +type G1MSM struct { + handle C.gnark_gpu_plonk2_msm_t + dev *gpu.Device + n int + windowBits int + hostPoints []curve.G1Affine + hostPointsPtr unsafe.Pointer + lastBatchPhaseTimings [][9]float32 +} + +// NewG1MSM creates a G1MSM context by uploading affine points to the GPU. +// window_bits=0 selects a default based on point count. +func NewG1MSM(dev *gpu.Device, points []curve.G1Affine, windowBits int) (*G1MSM, error) { + if dev.Handle() == nil { + return nil, gpu.ErrDeviceClosed + } + if err := dev.Bind(); err != nil { + return nil, err + } + n := len(points) + if n == 0 { + return nil, &gpu.Error{Code: -1, Message: "points must not be empty"} + } + if windowBits == 0 { + windowBits = msmDefaultWindowBits(n) + } + if override := os.Getenv("GNARK_GPU_PLONK2_MSM_WINDOW_BITS"); override != "" { + parsed, err := strconv.Atoi(override) + if err != nil { + return nil, fmt.Errorf("gpu: invalid GNARK_GPU_PLONK2_MSM_WINDOW_BITS %q: %w", override, err) + } + windowBits = parsed + } + if windowBits < 2 || windowBits > 24 { + return nil, fmt.Errorf("gpu: window bits must be in [2,24], got %d", windowBits) + } + + hostPoints := points + var hostPointsPtr unsafe.Pointer + if os.Getenv("GNARK_GPU_DISABLE_PINNED_MSM_POINTS") == "" { + nbytes := C.size_t(n) * C.size_t(unsafe.Sizeof(curve.G1Affine{})) + if err := toError(C.gnark_gpu_alloc_pinned(&hostPointsPtr, nbytes)); err == nil { + hostPoints = unsafe.Slice((*curve.G1Affine)(hostPointsPtr), n) + copy(hostPoints, points) + } else { + log.Printf("gpu: pinned MSM points unavailable (%v), using heap", err) + hostPointsPtr = nil + } + } + + var handle C.gnark_gpu_plonk2_msm_t + if err := toError(C.gnark_gpu_plonk2_msm_create( + devCtx(dev), + curveID(), + (*C.uint64_t)(unsafe.Pointer(&hostPoints[0])), + C.size_t(n), + C.int(windowBits), + &handle, + )); err != nil { + if hostPointsPtr != nil { + C.gnark_gpu_free_pinned(hostPointsPtr) + } + return nil, err + } + + m := &G1MSM{ + handle: handle, + dev: dev, + n: n, + windowBits: windowBits, + hostPoints: hostPoints, + hostPointsPtr: hostPointsPtr, + } + runtime.SetFinalizer(m, (*G1MSM).Close) + return m, nil +} + +// Close releases GPU resources. Safe to call multiple times. +func (m *G1MSM) Close() { + if m.handle != nil { + C.gnark_gpu_plonk2_msm_destroy(m.handle) + m.handle = nil + if m.hostPointsPtr != nil { + C.gnark_gpu_free_pinned(m.hostPointsPtr) + m.hostPointsPtr = nil + } + m.hostPoints = nil + runtime.SetFinalizer(m, nil) + } +} + +// Len returns the number of base points. +func (m *G1MSM) Len() int { return m.n } + +// PinWorkBuffers keeps MSM scratch buffers resident across MultiExp calls, +// amortizing cudaMalloc/Free overhead over a wave of MSMs. +func (m *G1MSM) PinWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_pin_work_buffers(m.handle)) +} + +// ReleaseWorkBuffers frees pinned scratch buffers. Subsequent MultiExp calls +// re-allocate lazily. +func (m *G1MSM) ReleaseWorkBuffers() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_release_work_buffers(m.handle)) +} + +// OffloadPoints frees the GPU-resident base points. Call ReloadPoints before +// the next MultiExp. +func (m *G1MSM) OffloadPoints() error { + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_offload_points(m.handle)) +} + +// ReloadPoints uploads the retained host base points after OffloadPoints. +func (m *G1MSM) ReloadPoints() error { + if len(m.hostPoints) < m.n { + return fmt.Errorf("gpu: MSM host points unavailable") + } + if err := m.dev.Bind(); err != nil { + return err + } + return toError(C.gnark_gpu_plonk2_msm_reload_points( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&m.hostPoints[0])), + C.size_t(m.n), + )) +} + +// MultiExp computes Q[i] = Σⱼ scalars[i][j] · P[j] for each scalar set. +// Each scalars[i] must have length ≤ m.Len(). +// Returns Jacobian results. +func (m *G1MSM) MultiExp(scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if err := m.dev.Bind(); err != nil { + return nil, err + } + k := len(scalars) + if k == 0 { + return nil, nil + } + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: MSM scalar set %d is empty", i) + } + if len(s) > m.n { + return nil, fmt.Errorf("gpu: MSM scalar set %d has %d elements, exceeds %d points", i, len(s), m.n) + } + } + + results := make([]curve.G1Jac, k) + m.lastBatchPhaseTimings = make([][9]float32, k) + for i, s := range scalars { + if err := toError(C.gnark_gpu_plonk2_msm_run( + m.handle, + (*C.uint64_t)(unsafe.Pointer(&s[0])), + C.size_t(len(s)), + (*C.uint64_t)(unsafe.Pointer(&results[i])), + )); err != nil { + return nil, fmt.Errorf("gpu: MSM set %d failed: %w", i, err) + } + m.lastBatchPhaseTimings[i] = m.LastPhaseTimings() + // Montgomery correction: GPU skips fr_from_mont on scalars, so result = R * correct. + results[i].ScalarMultiplication(&results[i], &frRInv) + } + return results, nil +} + +// LastPhaseTimings returns per-phase timings (ms) from the most recent MultiExp call. +func (m *G1MSM) LastPhaseTimings() [9]float32 { + var out [9]C.float + C.gnark_gpu_plonk2_msm_get_phase_timings(m.handle, (*C.float)(unsafe.Pointer(&out[0]))) + var result [9]float32 + for i := range result { + result[i] = float32(out[i]) + } + return result +} + +// LastBatchPhaseTimings returns per-set MSM phase timings from the most recent +// MultiExp call. +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { + if len(m.lastBatchPhaseTimings) == 0 { + return nil + } + out := make([][9]float32, len(m.lastBatchPhaseTimings)) + copy(out, m.lastBatchPhaseTimings) + return out +} + +// MultiExpSplit runs the MSM split across 2 devices for ~2x speedup. +// msm0 must hold points[:n/2] and msm1 must hold points[n/2:]. +// This is an advanced API; use MultiExp for single-GPU operation. +func MultiExpSplit(msm0, msm1 *G1MSM, scalars []fr.Element) (curve.G1Jac, error) { + return MultiExpSplitAt(msm0, msm1, len(scalars)/2, scalars) +} + +// MultiExpSplitAt runs one MSM split across 2 devices at a fixed scalar index. +// msm0 must hold points[:split], and msm1 must hold points[split:]. +func MultiExpSplitAt(msm0, msm1 *G1MSM, split int, scalars []fr.Element) (curve.G1Jac, error) { + if msm0 == nil || msm1 == nil || len(scalars) == 0 { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: nil MSM or empty scalars") + } + n := len(scalars) + if split <= 0 || split >= n { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: invalid split %d for %d scalars", split, n) + } + if split > msm0.Len() || n-split > msm1.Len() { + return curve.G1Jac{}, fmt.Errorf("gpu: MultiExpSplit: split exceeds MSM point capacity") + } + + type result struct { + jac curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(scalars[:split]) + if err != nil { + ch0 <- result{err: err} + return + } + ch0 <- result{jac: jacs[0]} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(scalars[split:]) + if err != nil { + ch1 <- result{err: err} + return + } + ch1 <- result{jac: jacs[0]} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return curve.G1Jac{}, r0.err + } + if r1.err != nil { + return curve.G1Jac{}, r1.err + } + r0.jac.AddAssign(&r1.jac) + return r0.jac, nil +} + +// MultiExpSplitBatchAt runs several MSMs split across 2 devices. Each device +// executes its half-batch sequentially on its own stream, and the host combines +// matching partials. +func MultiExpSplitBatchAt(msm0, msm1 *G1MSM, split int, scalars ...[]fr.Element) ([]curve.G1Jac, error) { + if len(scalars) == 0 { + return nil, nil + } + first := make([][]fr.Element, len(scalars)) + second := make([][]fr.Element, len(scalars)) + for i, s := range scalars { + if len(s) == 0 { + return nil, fmt.Errorf("gpu: split MSM scalar set %d is empty", i) + } + if split <= 0 || split >= len(s) { + return nil, fmt.Errorf("gpu: split MSM scalar set %d has invalid split %d for %d scalars", i, split, len(s)) + } + if split > msm0.Len() || len(s)-split > msm1.Len() { + return nil, fmt.Errorf("gpu: split MSM scalar set %d exceeds MSM point capacity", i) + } + first[i] = s[:split] + second[i] = s[split:] + } + + type result struct { + jacs []curve.G1Jac + err error + } + ch0 := make(chan result, 1) + ch1 := make(chan result, 1) + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm0.dev.Bind(); err != nil { + ch0 <- result{err: err} + return + } + jacs, err := msm0.MultiExp(first...) + ch0 <- result{jacs: jacs, err: err} + }() + + go func() { + runtime.LockOSThread() + defer runtime.UnlockOSThread() + if err := msm1.dev.Bind(); err != nil { + ch1 <- result{err: err} + return + } + jacs, err := msm1.MultiExp(second...) + ch1 <- result{jacs: jacs, err: err} + }() + + r0 := <-ch0 + r1 := <-ch1 + if r0.err != nil { + return nil, r0.err + } + if r1.err != nil { + return nil, r1.err + } + if len(r0.jacs) != len(scalars) || len(r1.jacs) != len(scalars) { + return nil, fmt.Errorf("gpu: split MSM result length mismatch") + } + for i := range r0.jacs { + r0.jacs[i].AddAssign(&r1.jacs[i]) + } + return r0.jacs, nil +} diff --git a/prover/gpu/plonk2/bw6761/msm_stub.go b/prover/gpu/plonk2/bw6761/msm_stub.go new file mode 100644 index 00000000000..ca9af467757 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/msm_stub.go @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bw6761 + +import ( + "errors" + + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// G1MSM is a stub for non-CUDA builds. +type G1MSM struct{} + +func NewG1MSM(_ *gpu.Device, _ []curve.G1Affine, _ int) (*G1MSM, error) { + return nil, errors.New("gpu: cuda required") +} + +func (m *G1MSM) Close() {} +func (m *G1MSM) Len() int { return 0 } +func (m *G1MSM) PinWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) ReleaseWorkBuffers() error { return errors.New("gpu: cuda required") } +func (m *G1MSM) MultiExp(_ ...[]fr.Element) ([]curve.G1Jac, error) { + return nil, errors.New("gpu: cuda required") +} +func (m *G1MSM) LastPhaseTimings() [9]float32 { return [9]float32{} } +func (m *G1MSM) LastBatchPhaseTimings() [][9]float32 { return nil } + +func MultiExpSplit(_, _ *G1MSM, _ []fr.Element) (curve.G1Jac, error) { + return curve.G1Jac{}, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/bw6761/msm_test.go b/prover/gpu/plonk2/bw6761/msm_test.go new file mode 100644 index 00000000000..83cf169d1c5 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/msm_test.go @@ -0,0 +1,139 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761_test + +import ( + "fmt" + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bw6761" + "github.com/stretchr/testify/require" +) + +func makeTestPoints(n int) []curve.G1Affine { + _, _, g1, _ := curve.Generators() + pts := make([]curve.G1Affine, n) + pts[0] = g1 + for i := 1; i < n; i++ { + pts[i].Add(&pts[i-1], &g1) + } + return pts +} + +// TestMSMMatchesCPU verifies GPU MSM matches gnark-crypto CPU MultiExp. +func TestMSMMatchesCPU(t *testing.T) { + dev := requireGPUDev(t) + + for _, n := range []int{1, 16, 100, 1000} { + n := n + t.Run(fmt.Sprintf("n=%d", n), func(t *testing.T) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + + // CPU reference + var cpuResult curve.G1Affine + cpuResult.MultiExp(pts, scalars, ecc.MultiExpConfig{}) + + // GPU + msm, err := bw6761.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars) + require.NoError(t, err) + require.Len(t, results, 1) + + var gpuAffine curve.G1Affine + gpuAffine.FromJacobian(&results[0]) + + require.True(t, cpuResult.Equal(&gpuAffine), + "MSM mismatch at n=%d", n) + }) + } +} + +// TestMSMBatchScalarSets tests MultiExp with multiple scalar sets. +func TestMSMBatchScalarSets(t *testing.T) { + dev := requireGPUDev(t) + const n = 100 + + pts := makeTestPoints(n) + scalars1 := randFrVec(n) + scalars2 := randFrVec(n) + + // CPU references + var cpu1, cpu2 curve.G1Affine + cpu1.MultiExp(pts, scalars1, ecc.MultiExpConfig{}) + cpu2.MultiExp(pts, scalars2, ecc.MultiExpConfig{}) + + // GPU batch + msm, err := bw6761.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + results, err := msm.MultiExp(scalars1, scalars2) + require.NoError(t, err) + require.Len(t, results, 2) + + var gpu1, gpu2 curve.G1Affine + gpu1.FromJacobian(&results[0]) + gpu2.FromJacobian(&results[1]) + + require.True(t, cpu1.Equal(&gpu1), "MSM set 0 mismatch") + require.True(t, cpu2.Equal(&gpu2), "MSM set 1 mismatch") +} + +// TestMSMWorkBuffers verifies PinWorkBuffers/ReleaseWorkBuffers are idempotent. +func TestMSMWorkBuffers(t *testing.T) { + dev := requireGPUDev(t) + const n = 64 + + pts := makeTestPoints(n) + scalars := randFrVec(n) + + msm, err := bw6761.NewG1MSM(dev, pts, 0) + require.NoError(t, err) + defer msm.Close() + + require.NoError(t, msm.PinWorkBuffers()) + r1, err := msm.MultiExp(scalars) + require.NoError(t, err) + + require.NoError(t, msm.ReleaseWorkBuffers()) + r2, err := msm.MultiExp(scalars) + require.NoError(t, err) + + var a1, a2 curve.G1Affine + a1.FromJacobian(&r1[0]) + a2.FromJacobian(&r2[0]) + require.True(t, a1.Equal(&a2), "result changed after work buffer release") +} + +// BenchmarkMSM benchmarks GPU MSM at various sizes. +func BenchmarkMSM(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + for _, logN := range []int{14, 18, 22} { + n := 1 << logN + b.Run(fmt.Sprintf("n=2^%d", logN), func(b *testing.B) { + pts := makeTestPoints(n) + scalars := randFrVec(n) + msm, err := bw6761.NewG1MSM(dev, pts, 0) + require.NoError(b, err) + defer msm.Close() + require.NoError(b, msm.PinWorkBuffers()) + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := msm.MultiExp(scalars) + require.NoError(b, err) + } + }) + } +} diff --git a/prover/gpu/plonk2/bw6761/pinned_fr.go b/prover/gpu/plonk2/bw6761/pinned_fr.go new file mode 100644 index 00000000000..43603b08236 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/pinned_fr.go @@ -0,0 +1,41 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "unsafe" + + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" +) + +type pinnedFrBuffer struct { + ptr unsafe.Pointer + data []fr.Element +} + +func newPinnedFrBuffer(n int) (pinnedFrBuffer, error) { + var ptr unsafe.Pointer + nbytes := C.size_t(n) * C.size_t(fr.Bytes) + if err := toError(C.gnark_gpu_alloc_pinned(&ptr, nbytes)); err != nil { + return pinnedFrBuffer{}, err + } + return pinnedFrBuffer{ + ptr: ptr, + data: unsafe.Slice((*fr.Element)(ptr), n), + }, nil +} + +func (b *pinnedFrBuffer) free() { + if b.ptr != nil { + C.gnark_gpu_free_pinned(b.ptr) + b.ptr = nil + b.data = nil + } +} diff --git a/prover/gpu/plonk2/bw6761/plonk_test.go b/prover/gpu/plonk2/bw6761/plonk_test.go new file mode 100644 index 00000000000..28af80418f6 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/plonk_test.go @@ -0,0 +1,169 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761_test + +import ( + "testing" + + "github.com/consensys/gnark-crypto/ecc" + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + kzg "github.com/consensys/gnark-crypto/ecc/bw6-761/kzg" + gnarkplonk "github.com/consensys/gnark/backend/plonk" + curplonk "github.com/consensys/gnark/backend/plonk/bw6-761" + cs "github.com/consensys/gnark/constraint/bw6-761" + "github.com/consensys/gnark/frontend" + "github.com/consensys/gnark/frontend/cs/scs" + "github.com/consensys/gnark/test/unsafekzg" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bw6761" + "github.com/stretchr/testify/require" +) + +// addCircuit has enough constraints for sizeSystem >= 6 (avoiding gnark's 8-coset edge case for tiny circuits). +// Circuit: a*b + c*d + e*f = out (out is public) +type addCircuit struct { + A, B, C, D, F, G frontend.Variable + Out frontend.Variable `gnark:",public"` +} + +func (c *addCircuit) Define(api frontend.API) error { + ab := api.Mul(c.A, c.B) + cd := api.Mul(c.C, c.D) + fg := api.Mul(c.F, c.G) + sum := api.Add(ab, cd) + sum2 := api.Add(sum, fg) + api.AssertIsEqual(sum2, c.Out) + return nil +} + +type commitCircuit struct { + A, B, Out frontend.Variable +} + +func (c *commitCircuit) Define(api frontend.API) error { + commitment, err := api.(frontend.Committer).Commit(c.A, c.B) + if err != nil { + return err + } + product := api.Mul(c.A, c.B) + api.AssertIsDifferent(commitment, product) + api.AssertIsEqual(api.Add(c.A, c.B), c.Out) + return nil +} + +func setupAddCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &addCircuit{}) +} + +func setupCommitCircuit(t testing.TB) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + return setupCircuit(t, &commitCircuit{}) +} + +func setupCircuit(t testing.TB, circuit frontend.Circuit) (*cs.SparseR1CS, *curplonk.VerifyingKey, []curve.G1Affine) { + t.Helper() + ccs, err := frontend.Compile(ecc.BW6_761.ScalarField(), scs.NewBuilder, circuit) + require.NoError(t, err) + + srs, srsLag, err := unsafekzg.NewSRS(ccs) + require.NoError(t, err) + + _, vkIface, err := gnarkplonk.Setup(ccs, srs, srsLag) + require.NoError(t, err) + vk := vkIface.(*curplonk.VerifyingKey) + + // Extract canonical G1 SRS points from the concrete KZG SRS type. + concreteSRS := srs.(*kzg.SRS) + srsPoints := make([]curve.G1Affine, len(concreteSRS.Pk.G1)) + copy(srsPoints, concreteSRS.Pk.G1) + + return ccs.(*cs.SparseR1CS), vk, srsPoints +} + +// TestGPUProveVerify proves a small circuit with the GPU and verifies with gnark CPU. +func TestGPUProveVerify(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := bw6761.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15 + 77 + 8} + fullW, err := frontend.NewWitness(assignment, ecc.BW6_761.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bw6761.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +// TestGPUProveMultipleProofs tests that multiple proofs can be generated from the same key. +func TestGPUProveMultipleProofs(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupAddCircuit(t) + + gpk := bw6761.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + for i := range 3 { + a := int64(i + 1) + _ = int64(i + 2) + assignment := &addCircuit{A: a, B: a + 1, C: a + 2, D: a + 3, F: a + 4, G: a + 5, Out: a*(a+1) + (a+2)*(a+3) + (a+4)*(a+5)} + fullW, err := frontend.NewWitness(assignment, ecc.BW6_761.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bw6761.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err, "proof %d failed", i) + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "proof %d verification failed", i) + } +} + +func TestGPUProveVerify_BSB22Commitment(t *testing.T) { + dev := requireGPUDev(t) + spr, vk, srsPoints := setupCommitCircuit(t) + + gpk := bw6761.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &commitCircuit{A: 3, B: 5, Out: 8} + fullW, err := frontend.NewWitness(assignment, ecc.BW6_761.ScalarField()) + require.NoError(t, err) + pubW, err := fullW.Public() + require.NoError(t, err) + + proof, err := bw6761.GPUProve(dev, gpk, spr, fullW) + require.NoError(t, err) + require.NotNil(t, proof) + + require.NoError(t, gnarkplonk.Verify(proof, vk, pubW), "GPU proof failed verification") +} + +// BenchmarkGPUProve benchmarks GPU proof generation. +func BenchmarkGPUProve(b *testing.B) { + dev, err := gpu.New() + require.NoError(b, err) + defer dev.Close() + + spr, vk, srsPoints := setupAddCircuit(b) + gpk := bw6761.NewGPUProvingKey(srsPoints, vk) + defer gpk.Close() + + assignment := &addCircuit{A: 3, B: 5, C: 7, D: 11, F: 2, G: 4, Out: 15 + 77 + 8} + fullW, err := frontend.NewWitness(assignment, ecc.BW6_761.ScalarField()) + require.NoError(b, err) + + b.ResetTimer() + for i := 0; i < b.N; i++ { + _, err := bw6761.GPUProve(dev, gpk, spr, fullW) + require.NoError(b, err) + } +} diff --git a/prover/gpu/plonk2/bw6761/prove.go b/prover/gpu/plonk2/bw6761/prove.go new file mode 100644 index 00000000000..62aa6070c45 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/prove.go @@ -0,0 +1,2618 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +/* +#include "gnark_gpu.h" +*/ +import "C" + +import ( + "context" + "errors" + "fmt" + "hash" + "log" + "math/big" + "math/bits" + "os" + "runtime" + "strconv" + "sync" + "time" + "unsafe" + + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/fft" + htf "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/hash_to_field" + iop "github.com/consensys/gnark-crypto/ecc/bw6-761/fr/iop" + kzg "github.com/consensys/gnark-crypto/ecc/bw6-761/kzg" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + + "github.com/consensys/gnark/backend" + curplonk "github.com/consensys/gnark/backend/plonk/bw6-761" + "github.com/consensys/gnark/backend/witness" + "github.com/consensys/gnark/constraint" + cs "github.com/consensys/gnark/constraint/bw6-761" + "github.com/consensys/gnark/constraint/solver" + fcs "github.com/consensys/gnark/frontend/cs" + + "github.com/consensys/linea-monorepo/prover/gpu" + "golang.org/x/sync/errgroup" +) + +const ( + id_L int = iota + id_R + id_O + id_Z + + orderBlindingL = 1 + orderBlindingR = 1 + orderBlindingO = 1 + orderBlindingZ = 2 + msmExtraPoints = 6 +) + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProvingKey — slim wrapper: VerifyingKey + lazy gpuInstance +// ───────────────────────────────────────────────────────────────────────────── + +type GPUProvingKey struct { + mu sync.Mutex + Vk *curplonk.VerifyingKey + n int + + // SRS data (consumed during instance init) + srsPoints []curve.G1Affine + pinnedN int + + inst *gpuInstance +} + +// NewGPUProvingKey creates a GPUProvingKey from affine SRS points. +func NewGPUProvingKey(srsPoints []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + n := 0 + if vk != nil { + n = int(vk.Size) + } + return &GPUProvingKey{Vk: vk, n: n, srsPoints: srsPoints} +} + +// Size returns the domain size n. +func (gpk *GPUProvingKey) Size() int { return gpk.n } + +// Prepare performs one-time GPU setup. +func (gpk *GPUProvingKey) Prepare(dev *gpu.Device, spr *cs.SparseR1CS) error { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil && gpk.inst.dev == dev { + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + inst, err := newGPUInstance(dev, gpk, spr) + if err != nil { + return err + } + gpk.inst = inst + return nil +} + +// Close releases all GPU resources. +func (gpk *GPUProvingKey) Close() { + gpk.mu.Lock() + defer gpk.mu.Unlock() + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuInstance — persistent GPU resources + circuit data +// ───────────────────────────────────────────────────────────────────────────── + +// quotientWorkBufs holds pre-allocated GPU buffers for computeNumeratorGPU and +// computeLinearizedPoly, avoiding per-proof cudaMalloc/Free overhead. +type quotientWorkBufs struct { + L, R, O, Z *FrVector // wire poly working buffers (reused per coset) + S1, S2, S3 *FrVector // perm selector buffers + Result *FrVector // coset numerator accumulator + LCan, RCan, OCan, ZCan *FrVector // canonical wire polys (uploaded once per proof) + QkSrc *FrVector // Qk canonical source (D2D per coset, avoids H2D) + Pi2Src []*FrVector // per-proof BSB22 pi2 sources (D2D per coset) + CosetBlock [3]*FrVector // GPU-resident coset results; Result keeps block 4 + LinResult, LinW *FrVector // linearized poly GPU scratch +} + +type lowMemorySelectorCache struct { + ql, qr, qm, qo *FrVector + s1, s2, s3 *FrVector + qcp []*FrVector +} + +type splitMSMBackend struct { + secondary *gpu.Device + msm0 *G1MSM + msm1 *G1MSM + split int +} + +type gpuInstance struct { + dev *gpu.Device + vk *curplonk.VerifyingKey + n int + log2n uint + lowMemory bool + canonicalReady chan struct{} + canonicalErr error + canonicalOnce sync.Once + + domain0 *fft.Domain + + msm *G1MSM + splitMSM *splitMSMBackend + fftDom *GPUFFTDomain + dPerm unsafe.Pointer + + dQl, dQr, dQm, dQo *FrVector + dS1, dS2, dS3 *FrVector + dQkFixed *FrVector + dQcp []*FrVector + + qlCanonical, qrCanonical, qmCanonical, qoCanonical fr.Vector + qkFixedCanonical fr.Vector + s1Canonical, s2Canonical, s3Canonical fr.Vector + qcpCanonical []fr.Vector + qkLagrange fr.Vector + permutation []int64 + nbPublicVariables int + commitmentInfo []uint64 + + gpuWork *FrVector // shared scratch buffer (persists for prover lifetime) + qWb quotientWorkBufs + + hBufs hostBufs +} + +type gpuInstanceReadyHooks struct { + msm func(*gpuInstance) + commit func(*gpuInstance) + trace func(*gpuInstance) +} + +type hostBufs struct { + lCanonical, rCanonical, oCanonical fr.Vector + zLagrange fr.Vector + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + hFull []fr.Element + openZBuf []fr.Element + pinned []pinnedFrBuffer +} + +func (inst *gpuInstance) initHostBufs() { + n := inst.n + var hb hostBufs + + allocPinnedHotBuffer := func(name string, n int) []fr.Element { + if os.Getenv("GNARK_GPU_DISABLE_PINNED_HOST_BUFS") == "" { + buf, err := newPinnedFrBuffer(n) + if err == nil { + hb.pinned = append(hb.pinned, buf) + return buf.data + } + log.Printf("gpu: pinned host buffer %s unavailable (%v), using heap", name, err) + } + return make([]fr.Element, n) + } + + hb = hostBufs{ + lCanonical: make(fr.Vector, n), + rCanonical: make(fr.Vector, n), + oCanonical: make(fr.Vector, n), + zLagrange: make(fr.Vector, n), + qkCoeffs: make(fr.Vector, n), + openZBuf: make([]fr.Element, n+1+orderBlindingZ), + } + hb.lBlinded = allocPinnedHotBuffer("lBlinded", n+1+orderBlindingL) + hb.rBlinded = allocPinnedHotBuffer("rBlinded", n+1+orderBlindingR) + hb.oBlinded = allocPinnedHotBuffer("oBlinded", n+1+orderBlindingO) + hb.zBlinded = allocPinnedHotBuffer("zBlinded", n+1+orderBlindingZ) + hSize := 4 * n + if needed := 3 * (n + 2); needed > hSize { + hSize = needed + } + hb.hFull = allocPinnedHotBuffer("hFull", hSize) + inst.hBufs = hb +} + +func (hb *hostBufs) free() { + for i := range hb.pinned { + hb.pinned[i].free() + } + *hb = hostBufs{} +} + +func newGPUInstance(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, hooks ...gpuInstanceReadyHooks) (*gpuInstance, error) { + inst := &gpuInstance{dev: dev, vk: gpk.Vk, n: gpk.n, canonicalReady: make(chan struct{})} + var hook gpuInstanceReadyHooks + if len(hooks) > 0 { + hook = hooks[0] + } + commitPublished := false + msmPublished := false + tracePublished := false + publishMSMReady := func() { + if hook.msm != nil && !msmPublished { + msmPublished = true + hook.msm(inst) + } + } + publishCommitReady := func() { + if hook.commit != nil && !commitPublished { + commitPublished = true + hook.commit(inst) + } + } + publishTraceReady := func() { + if hook.trace != nil && !tracePublished { + tracePublished = true + hook.trace(inst) + } + } + var traceErrCh chan error + + fail := func(msg string, err error) (*gpuInstance, error) { + wrapped := fmt.Errorf("%s: %w", msg, err) + if traceErrCh != nil { + <-traceErrCh + traceErrCh = nil + } + inst.publishCanonicalReady(wrapped) + if !msmPublished && !commitPublished && !tracePublished { + inst.close() + } + return nil, wrapped + } + + if err := inst.initCircuitShape(spr); err != nil { + return fail("init circuit shape", err) + } + inst.lowMemory = selectLowMemoryMode(dev, inst.n) + traceErrCh = make(chan error, 1) + go func() { + traceErrCh <- inst.initTraceData(spr) + }() + waitTrace := func() error { + if traceErrCh == nil { + return nil + } + err := <-traceErrCh + traceErrCh = nil + return err + } + + var err error + msmSize := inst.n + msmExtraPoints + pts := gpk.srsPoints + if msmSize > len(pts) { + msmSize = len(pts) + } + if secondaryID, ok, cfgErr := secondaryMSMDeviceID(dev.DeviceID()); cfgErr != nil { + return fail("configure secondary MSM GPU", cfgErr) + } else if ok { + split := inst.n / 2 + if split <= 0 || split >= msmSize { + return fail("configure secondary MSM GPU", fmt.Errorf("invalid split %d for MSM size %d", split, msmSize)) + } + secondary, err := gpu.New(gpu.WithDeviceID(secondaryID)) + if err != nil { + return fail("create secondary GPU device", err) + } + inst.splitMSM = &splitMSMBackend{secondary: secondary, split: split} + inst.splitMSM.msm0, err = NewG1MSM(dev, pts[:split], 0) + if err != nil { + return fail("create primary split MSM", err) + } + inst.splitMSM.msm1, err = NewG1MSM(secondary, pts[split:msmSize], 0) + if err != nil { + return fail("create secondary split MSM", err) + } + } else { + inst.msm, err = NewG1MSM(dev, pts[:msmSize], 0) + if err != nil { + return fail("create MSM", err) + } + } + gpk.srsPoints = nil // ownership transferred; free heap copy + + if !inst.lowMemory { + if perr := inst.pinMSMWorkBuffers(); perr != nil { + return fail("pin MSM work buffers", perr) + } + } + + if inst.lowMemory { + if err := inst.offloadMSMPoints(); err != nil { + return fail("offload MSM points", err) + } + } + + inst.fftDom, err = NewFFTDomain(dev, inst.n) + if err != nil { + return fail("create FFT domain", err) + } + + if inst.lowMemory { + inst.gpuWork, err = NewFrVector(dev, inst.n) + if err != nil { + return fail("alloc low-memory GPU work buffer", err) + } + if err := dev.InitMultiStream(); err != nil { + return fail("init multi-stream", err) + } + publishMSMReady() + inst.initHostBufs() + publishCommitReady() + } + + if err := waitTrace(); err != nil { + return fail("init circuit data", err) + } + + inst.dPerm, err = DeviceAllocCopyInt64(dev, inst.permutation) + if err != nil { + return fail("upload permutation", err) + } + + if inst.lowMemory { + publishTraceReady() + } + + if err := inst.initCanonicalGPU(); err != nil { + return fail("init canonical", err) + } + + if inst.lowMemory { + inst.publishCanonicalReady(nil) + return inst, nil + } + + if err := inst.uploadPolynomials(); err != nil { + return fail("upload polynomials", err) + } + + if err := inst.allocPersistentBufs(); err != nil { + return fail("alloc persistent GPU buffers", err) + } + + inst.initHostBufs() + publishMSMReady() + publishCommitReady() + publishTraceReady() + inst.publishCanonicalReady(nil) + return inst, nil +} + +func (inst *gpuInstance) publishCanonicalReady(err error) { + inst.canonicalOnce.Do(func() { + inst.canonicalErr = err + close(inst.canonicalReady) + }) +} + +func (inst *gpuInstance) waitCanonicalReady() error { + if inst.canonicalReady == nil { + return nil + } + <-inst.canonicalReady + return inst.canonicalErr +} + +func selectLowMemoryMode(dev *gpu.Device, n int) bool { + if os.Getenv("GNARK_GPU_PLONK2_FORCE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode forced for n=%d", n) + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY") != "" { + log.Printf("plonk2: low-memory GPU mode disabled for n=%d", n) + return false + } + free, total, err := dev.MemGetInfo() + if err != nil { + low := n >= 1<<25 + log.Printf("plonk2: low-memory GPU mode=%t for n=%d; mem query failed: %v", low, n, err) + return low + } + vecBytes := uint64(n) * uint64(fr.Bytes) + estimatedResident := vecBytes * 24 + low := estimatedResident > total/2 + log.Printf( + "plonk2: low-memory GPU mode=%t n=%d vecBytes=%d estimatedResident=%d freeVRAM=%d totalVRAM=%d", + low, n, vecBytes, estimatedResident, free, total, + ) + return low +} + +func secondaryMSMDeviceID(primaryID int) (int, bool, error) { + raw := os.Getenv("GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID") + if raw == "" { + return 0, false, nil + } + id, err := strconv.Atoi(raw) + if err != nil { + return 0, false, fmt.Errorf("invalid GNARK_GPU_PLONK2_SECONDARY_DEVICE_ID %q: %w", raw, err) + } + if id == primaryID { + return 0, false, fmt.Errorf("secondary device matches primary device %d", primaryID) + } + if id < 0 { + return 0, false, fmt.Errorf("secondary device id must be non-negative, got %d", id) + } + return id, true, nil +} + +func (inst *gpuInstance) pinMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.PinWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.PinWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.PinWorkBuffers() +} + +func (inst *gpuInstance) releaseMSMWorkBuffers() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReleaseWorkBuffers(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReleaseWorkBuffers(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReleaseWorkBuffers() +} + +func (inst *gpuInstance) offloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.OffloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.OffloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.OffloadPoints() +} + +func (inst *gpuInstance) reloadMSMPoints() error { + if inst.splitMSM != nil { + if err := inst.splitMSM.msm0.ReloadPoints(); err != nil { + return err + } + if err := inst.splitMSM.msm1.ReloadPoints(); err != nil { + return err + } + return nil + } + if inst.msm == nil { + return nil + } + return inst.msm.ReloadPoints() +} + +// allocPersistentBufs allocates GPU work buffers that persist across proofs. +// Avoids per-proof cudaMalloc/Free overhead (~3 ms per 64 MB alloc × 20 bufs). +func (inst *gpuInstance) allocPersistentBufs() error { + n := inst.n + alloc := func() (*FrVector, error) { + return NewFrVector(inst.dev, n) + } + wb := &inst.qWb + // Flat list mirrors the free loop in close() — keep in sync. + named := []*(*FrVector){ + &inst.gpuWork, + &wb.L, &wb.R, &wb.O, &wb.Z, + &wb.S1, &wb.S2, &wb.S3, &wb.Result, + &wb.LCan, &wb.RCan, &wb.OCan, &wb.ZCan, + &wb.QkSrc, &wb.LinResult, &wb.LinW, + } + for _, p := range named { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + *p = v + } + for k := range wb.CosetBlock { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.CosetBlock[k] = v + } + if len(inst.commitmentInfo) > 0 { + wb.Pi2Src = make([]*FrVector, len(inst.commitmentInfo)) + for i := range wb.Pi2Src { + v, err := alloc() + if err != nil { + return fmt.Errorf("alloc persistent GPU buffer: %w", err) + } + wb.Pi2Src[i] = v + } + } + // Create multi-stream upfront so the quotient pipeline can use it immediately. + return inst.dev.InitMultiStream() +} + +func (inst *gpuInstance) initCircuitShape(spr *cs.SparseR1CS) error { + nbConstraints := spr.GetNbConstraints() + sizeSystem := uint64(nbConstraints + len(spr.Public)) + inst.domain0 = fft.NewDomain(sizeSystem, fft.WithoutPrecompute()) + n := int(inst.domain0.Cardinality) + if n != inst.n { + return fmt.Errorf("domain size mismatch: spr=%d SRS=%d", n, inst.n) + } + inst.log2n = uint(bits.TrailingZeros(uint(n))) + inst.nbPublicVariables = len(spr.Public) + inst.commitmentInfo = inst.vk.CommitmentConstraintIndexes + return nil +} + +func (inst *gpuInstance) initTraceData(spr *cs.SparseR1CS) error { + trace := curplonk.NewTrace(spr, inst.domain0) + inst.qlCanonical = fr.Vector(trace.Ql.Coefficients()) + inst.qrCanonical = fr.Vector(trace.Qr.Coefficients()) + inst.qmCanonical = fr.Vector(trace.Qm.Coefficients()) + inst.qoCanonical = fr.Vector(trace.Qo.Coefficients()) + inst.s1Canonical = fr.Vector(trace.S1.Coefficients()) + inst.s2Canonical = fr.Vector(trace.S2.Coefficients()) + inst.s3Canonical = fr.Vector(trace.S3.Coefficients()) + + inst.qkLagrange = make(fr.Vector, inst.n) + copy(inst.qkLagrange, trace.Qk.Coefficients()) + inst.qkFixedCanonical = fr.Vector(trace.Qk.Coefficients()) + + inst.qcpCanonical = make([]fr.Vector, len(trace.Qcp)) + for i, p := range trace.Qcp { + inst.qcpCanonical[i] = fr.Vector(p.Coefficients()) + } + inst.permutation = trace.S + return nil +} + +func (inst *gpuInstance) initCanonicalGPU() error { + n := inst.n + gpuWork, err := NewFrVector(inst.dev, n) + if err != nil { + return fmt.Errorf("alloc work vector: %w", err) + } + defer gpuWork.Free() + + iFFTSelector := func(v fr.Vector) { + gpuWork.CopyFromHost(v) + inst.fftDom.BitReverse(gpuWork) + inst.fftDom.FFTInverse(gpuWork) + gpuWork.CopyToHost(v) + } + + for _, v := range []fr.Vector{ + inst.qlCanonical, inst.qrCanonical, inst.qmCanonical, inst.qoCanonical, + inst.qkFixedCanonical, inst.s1Canonical, inst.s2Canonical, inst.s3Canonical, + } { + iFFTSelector(v) + } + for _, v := range inst.qcpCanonical { + iFFTSelector(v) + } + + return inst.dev.Sync() +} + +func (inst *gpuInstance) uploadPolynomials() error { + upload := func(data fr.Vector) (*FrVector, error) { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, err + } + v.CopyFromHost(data) + return v, nil + } + var err error + if inst.dQl, err = upload(inst.qlCanonical); err != nil { + return fmt.Errorf("upload ql: %w", err) + } + if inst.dQr, err = upload(inst.qrCanonical); err != nil { + return fmt.Errorf("upload qr: %w", err) + } + if inst.dQm, err = upload(inst.qmCanonical); err != nil { + return fmt.Errorf("upload qm: %w", err) + } + if inst.dQo, err = upload(inst.qoCanonical); err != nil { + return fmt.Errorf("upload qo: %w", err) + } + if inst.dS1, err = upload(inst.s1Canonical); err != nil { + return fmt.Errorf("upload s1: %w", err) + } + if inst.dS2, err = upload(inst.s2Canonical); err != nil { + return fmt.Errorf("upload s2: %w", err) + } + if inst.dS3, err = upload(inst.s3Canonical); err != nil { + return fmt.Errorf("upload s3: %w", err) + } + if inst.dQkFixed, err = upload(inst.qkFixedCanonical); err != nil { + return fmt.Errorf("upload qkFixed: %w", err) + } + inst.dQcp = make([]*FrVector, len(inst.qcpCanonical)) + for i, v := range inst.qcpCanonical { + if inst.dQcp[i], err = upload(v); err != nil { + return fmt.Errorf("upload qcp[%d]: %w", i, err) + } + } + return nil +} + +func (inst *gpuInstance) close() { + if inst.msm != nil { + inst.msm.Close() + inst.msm = nil + } + if inst.splitMSM != nil { + if inst.splitMSM.msm0 != nil { + inst.splitMSM.msm0.Close() + } + if inst.splitMSM.msm1 != nil { + inst.splitMSM.msm1.Close() + } + if inst.splitMSM.secondary != nil { + _ = inst.splitMSM.secondary.Close() + } + inst.splitMSM = nil + } + if inst.fftDom != nil { + inst.fftDom.Close() + inst.fftDom = nil + } + if inst.dPerm != nil { + DeviceFreePtr(inst.dPerm) + inst.dPerm = nil + } + for _, v := range []*FrVector{inst.dQl, inst.dQr, inst.dQm, inst.dQo, + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed} { + if v != nil { + v.Free() + } + } + inst.dQl, inst.dQr, inst.dQm, inst.dQo = nil, nil, nil, nil + inst.dS1, inst.dS2, inst.dS3, inst.dQkFixed = nil, nil, nil, nil + for _, v := range inst.dQcp { + if v != nil { + v.Free() + } + } + inst.dQcp = nil + // Free persistent work buffers (mirrors the alloc list in allocPersistentBufs). + wb := &inst.qWb + for _, v := range []*FrVector{ + inst.gpuWork, + wb.L, wb.R, wb.O, wb.Z, wb.S1, wb.S2, wb.S3, wb.Result, + wb.LCan, wb.RCan, wb.OCan, wb.ZCan, wb.QkSrc, wb.LinResult, wb.LinW, + } { + if v != nil { + v.Free() + } + } + for k := range wb.CosetBlock { + if wb.CosetBlock[k] != nil { + wb.CosetBlock[k].Free() + } + } + for _, v := range wb.Pi2Src { + if v != nil { + v.Free() + } + } + inst.gpuWork = nil + inst.qWb = quotientWorkBufs{} + inst.hBufs.free() +} + +// ───────────────────────────────────────────────────────────────────────────── +// gpuProver — per-proof mutable state +// ───────────────────────────────────────────────────────────────────────────── + +type gpuProver struct { + inst *gpuInstance + instMu sync.Mutex + waitInst func() (*gpuInstance, error) + waitMSMInst func() (*gpuInstance, error) + waitCommitInst func() (*gpuInstance, error) + + proof curplonk.Proof + fs *fiatshamir.Transcript + + commitmentInfo constraint.PlonkCommitments + commitmentVal []fr.Element + pi2Canonical [][]fr.Element + pi2DeviceReady []bool + solverOpts []solver.Option + kzgFoldingHash hash.Hash + htfFunc hash.Hash + + evalL, evalR, evalO fr.Vector + wWitness fr.Vector + bpL, bpR, bpO, bpZ *iop.Polynomial + qkCoeffs fr.Vector + lBlinded, rBlinded, oBlinded []fr.Element + zBlinded []fr.Element + h1, h2, h3 []fr.Element + gamma, beta, alpha, zeta fr.Element + + logTime func(string) +} + +// ─── Prove phases ───────────────────────────────────────────────────────────── + +func (p *gpuProver) ensureInst() (*gpuInstance, error) { + p.instMu.Lock() + if p.inst != nil { + inst := p.inst + p.instMu.Unlock() + return inst, nil + } + waitInst := p.waitInst + p.instMu.Unlock() + if waitInst == nil { + return nil, errors.New("gpu instance is not initialized") + } + inst, err := waitInst() + if err != nil { + return nil, err + } + p.instMu.Lock() + if p.inst == nil { + p.inst = inst + } + inst = p.inst + p.instMu.Unlock() + return inst, nil +} + +func (p *gpuProver) initBlindingPolynomials() { + p.bpL = getRandomPolynomial(orderBlindingL) + p.bpR = getRandomPolynomial(orderBlindingR) + p.bpO = getRandomPolynomial(orderBlindingO) + p.bpZ = getRandomPolynomial(orderBlindingZ) +} + +func (p *gpuProver) solve(spr *cs.SparseR1CS, fullWitness witness.Witness) error { + solverOpts := append([]solver.Option(nil), p.solverOpts...) + if len(p.commitmentInfo) > 0 { + bsb22ID := solver.GetHintID(fcs.Bsb22CommitmentComputePlaceholder) + solverOpts = append(solverOpts, solver.OverrideHint(bsb22ID, func(_ *big.Int, ins, outs []*big.Int) error { + waitMSMInst := p.waitMSMInst + if waitMSMInst == nil { + waitMSMInst = p.waitCommitInst + } + if waitMSMInst == nil { + waitMSMInst = p.ensureInst + } + inst, err := waitMSMInst() + if err != nil { + return err + } + n := inst.n + commDepth := int(ins[0].Int64()) + ins = ins[1:] + ci := p.commitmentInfo[commDepth] + committedValues := make([]fr.Element, inst.domain0.Cardinality) + offset := inst.nbPublicVariables + for i := range ins { + committedValues[offset+ci.Committed[i]].SetBigInt(ins[i]) + } + committedValues[offset+ci.CommitmentIndex].SetRandom() + committedValues[offset+spr.GetNbConstraints()-1].SetRandom() + + inst.gpuWork.CopyFromHost(fr.Vector(committedValues[:n])) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if commDepth < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[commDepth] != nil { + inst.qWb.Pi2Src[commDepth].CopyFromDevice(inst.gpuWork) + p.pi2DeviceReady[commDepth] = true + } + canonicalBuf := make(fr.Vector, n) + inst.gpuWork.CopyToHost(canonicalBuf) + p.pi2Canonical[commDepth] = canonicalBuf + + commitment, err := inst.commit(canonicalBuf) + if err != nil { + return err + } + p.proof.Bsb22Commitments[commDepth] = commitment + + p.htfFunc.Write(p.proof.Bsb22Commitments[commDepth].Marshal()) + hashBts := p.htfFunc.Sum(nil) + p.htfFunc.Reset() + nbBuf := fr.Bytes + if p.htfFunc.Size() < fr.Bytes { + nbBuf = p.htfFunc.Size() + } + p.commitmentVal[commDepth].SetBytes(hashBts[:nbBuf]) + p.commitmentVal[commDepth].BigInt(outs[0]) + return nil + })) + } + + solution_, err := spr.Solve(fullWitness, solverOpts...) + if err != nil { + return fmt.Errorf("solve: %w", err) + } + solution := solution_.(*cs.SparseR1CSSolution) + p.evalL = fr.Vector(solution.L) + p.evalR = fr.Vector(solution.R) + p.evalO = fr.Vector(solution.O) + + var ok bool + p.wWitness, ok = fullWitness.Vector().(fr.Vector) + if !ok { + return errors.New("invalid witness type") + } + return nil +} + +func (p *gpuProver) completeQk() { + inst, err := p.ensureInst() + if err != nil { + panic(err) + } + p.qkCoeffs = inst.hBufs.qkCoeffs + copy(p.qkCoeffs, inst.qkLagrange) + copy(p.qkCoeffs, p.wWitness[:inst.nbPublicVariables]) + for i := range p.commitmentInfo { + p.qkCoeffs[inst.nbPublicVariables+p.commitmentInfo[i].CommitmentIndex] = p.commitmentVal[i] + } +} + +// commitToLRO overlaps the iFFT of L,R,O with Qk patching (via waitQk) and +// blinding-polynomial generation (via waitBlinding), both of which complete +// concurrently in sibling goroutines. +func (p *gpuProver) commitToLRO(inst *gpuInstance, waitQk, waitBlinding func() error) error { + hb := &inst.hBufs + + gpuToCanonical := func(lagrange, dst fr.Vector, dstDevice *FrVector) { + inst.gpuWork.CopyFromHost(lagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if dstDevice != nil { + dstDevice.CopyFromDevice(inst.gpuWork) + } + inst.gpuWork.CopyToHost(dst) + } + + if inst.lowMemory { + gpuToCanonical(p.evalL, hb.lCanonical, nil) + gpuToCanonical(p.evalR, hb.rCanonical, nil) + gpuToCanonical(p.evalO, hb.oCanonical, nil) + } else { + gpuToCanonical(p.evalL, hb.lCanonical, inst.qWb.LCan) + gpuToCanonical(p.evalR, hb.rCanonical, inst.qWb.RCan) + gpuToCanonical(p.evalO, hb.oCanonical, inst.qWb.OCan) + } + + if err := waitQk(); err != nil { + return err + } + inst.gpuWork.CopyFromHost(p.qkCoeffs) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + if inst.lowMemory { + inst.gpuWork.CopyToHost(p.qkCoeffs) + } else { + inst.qWb.QkSrc.CopyFromDevice(inst.gpuWork) + p.qkCoeffs = nil + } + + if err := waitBlinding(); err != nil { + return err + } + + var blindWG sync.WaitGroup + blindWG.Add(3) + go func() { defer blindWG.Done(); p.lBlinded = blindInto(hb.lBlinded, hb.lCanonical, p.bpL) }() + go func() { defer blindWG.Done(); p.rBlinded = blindInto(hb.rBlinded, hb.rCanonical, p.bpR) }() + go func() { defer blindWG.Done(); p.oBlinded = blindInto(hb.oBlinded, hb.oCanonical, p.bpO) }() + blindWG.Wait() + if !inst.lowMemory { + SubtractBlindingHead(inst.qWb.LCan, p.bpL.Coefficients()) + SubtractBlindingHead(inst.qWb.RCan, p.bpR.Coefficients()) + SubtractBlindingHead(inst.qWb.OCan, p.bpO.Coefficients()) + } + + p.logTime("iFFT L,R,O,Qk + blind") + + lroCommits, err := inst.commitN(p.lBlinded, p.rBlinded, p.oBlinded) + if err != nil { + return err + } + p.proof.LRO[0] = lroCommits[0] + p.proof.LRO[1] = lroCommits[1] + p.proof.LRO[2] = lroCommits[2] + + p.logTime("MSM commit L,R,O") + return nil +} + +func (p *gpuProver) deriveGammaBeta() error { + inst := p.inst + if err := bindPublicData(p.fs, "gamma", inst.vk, p.wWitness[:inst.nbPublicVariables]); err != nil { + return err + } + var err error + p.gamma, err = deriveRandomness(p.fs, "gamma", &p.proof.LRO[0], &p.proof.LRO[1], &p.proof.LRO[2]) + if err != nil { + return err + } + p.beta, err = deriveRandomness(p.fs, "beta") + if err != nil { + return err + } + p.wWitness = nil + p.logTime("derive gamma,beta") + return nil +} + +func (p *gpuProver) buildZAndCommit() error { + inst := p.inst + + zLagrange, err := buildZGPU(inst, inst.gpuWork, p.evalL, p.evalR, p.evalO, p.beta, p.gamma) + if err != nil { + return fmt.Errorf("build Z: %w", err) + } + p.evalL, p.evalR, p.evalO = nil, nil, nil + p.logTime("build Z") + + hb := &inst.hBufs + inst.gpuWork.CopyFromHost(zLagrange) + inst.fftDom.BitReverse(inst.gpuWork) + inst.fftDom.FFTInverse(inst.gpuWork) + inst.gpuWork.CopyToHost(hb.zLagrange) + p.zBlinded = blindInto(hb.zBlinded, hb.zLagrange, p.bpZ) + if !inst.lowMemory { + inst.qWb.ZCan.CopyFromDevice(inst.gpuWork) + SubtractBlindingHead(inst.qWb.ZCan, p.bpZ.Coefficients()) + } + + zCommit, err := inst.commit(p.zBlinded) + if err != nil { + return err + } + p.proof.Z = zCommit + p.logTime("iFFT+commit Z") + + alphaDeps := make([]*curve.G1Affine, len(p.proof.Bsb22Commitments)+1) + for i := range p.proof.Bsb22Commitments { + alphaDeps[i] = &p.proof.Bsb22Commitments[i] + } + alphaDeps[len(alphaDeps)-1] = &p.proof.Z + var aerr error + p.alpha, aerr = deriveRandomness(p.fs, "alpha", alphaDeps...) + if aerr != nil { + return aerr + } + p.logTime("derive alpha") + return nil +} + +func (p *gpuProver) computeQuotientAndCommit() error { + inst := p.inst + if err := inst.waitCanonicalReady(); err != nil { + return fmt.Errorf("initialize canonical selector data: %w", err) + } + + pointsOffloaded := false + if inst.shouldOffloadMSMForQuotient() { + if err := inst.offloadMSMPoints(); err != nil { + return fmt.Errorf("offload MSM points: %w", err) + } + pointsOffloaded = true + if err := inst.releaseMSMWorkBuffers(); err != nil { + return fmt.Errorf("release MSM work buffers: %w", err) + } + } + defer func() { + if pointsOffloaded { + _ = inst.reloadMSMPoints() + if !inst.lowMemory { + _ = inst.pinMSMWorkBuffers() + } + } + }() + + var qErr error + p.h1, p.h2, p.h3, qErr = computeNumeratorGPU( + inst, inst.gpuWork, + p.lBlinded, p.rBlinded, p.oBlinded, p.zBlinded, + p.qkCoeffs, p.pi2Canonical, p.pi2DeviceReady, + p.alpha, p.beta, p.gamma, + ) + if qErr != nil { + return fmt.Errorf("compute quotient: %w", qErr) + } + + p.logTime("quotient GPU") + + if pointsOffloaded { + if err := inst.reloadMSMPoints(); err != nil { + return fmt.Errorf("reload MSM points: %w", err) + } + if !inst.lowMemory { + if err := inst.pinMSMWorkBuffers(); err != nil { + return fmt.Errorf("re-pin MSM work buffers: %w", err) + } + } + pointsOffloaded = false + } + hCommits, err := inst.commitN(p.h1, p.h2, p.h3) + if err != nil { + return err + } + p.proof.H[0] = hCommits[0] + p.proof.H[1] = hCommits[1] + p.proof.H[2] = hCommits[2] + p.logTime("MSM commit h1,h2,h3") + + var zetaErr error + p.zeta, zetaErr = deriveRandomness(p.fs, "zeta", &p.proof.H[0], &p.proof.H[1], &p.proof.H[2]) + if zetaErr != nil { + return zetaErr + } + return nil +} + +func (inst *gpuInstance) shouldOffloadMSMForQuotient() bool { + if inst.lowMemory { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_FORCE_MSM_OFFLOAD") != "" { + return true + } + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_MSM_OFFLOAD") != "" { + return false + } + free, _, err := inst.dev.MemGetInfo() + if err != nil { + return true + } + reserve := uint64(inst.n) * uint64(fr.Bytes) * 8 + const minReserve = 2 << 30 + if reserve < minReserve { + reserve = minReserve + } + return free < reserve +} + +func (p *gpuProver) openAndFinalize() error { + inst := p.inst + + var zetaShifted fr.Element + zetaShifted.Mul(&p.zeta, &inst.domain0.Generator) + + openZPoly := inst.hBufs.openZBuf[:len(p.zBlinded)] + copy(openZPoly, p.zBlinded) + bzuzetaCh := make(chan fr.Element, 1) + go func() { + parallelHornerQuotient(openZPoly, zetaShifted) + bzuzetaCh <- openZPoly[0] + }() + + // Evaluate host-only blinded polys on CPU while GPU-resident selector polys + // are evaluated on device. + var blzeta, brzeta, bozeta, s1Zeta, s2Zeta fr.Element + var evalWG sync.WaitGroup + evalWG.Add(3) + go func() { defer evalWG.Done(); blzeta = polyEvalParallel(p.lBlinded, p.zeta) }() + go func() { defer evalWG.Done(); brzeta = polyEvalParallel(p.rBlinded, p.zeta) }() + go func() { defer evalWG.Done(); bozeta = polyEvalParallel(p.oBlinded, p.zeta) }() + + if inst.lowMemory { + s1Zeta = polyEvalParallel(inst.s1Canonical, p.zeta) + s2Zeta = polyEvalParallel(inst.s2Canonical, p.zeta) + } else { + s1Zeta = PolyEvalGPU(inst.dev, inst.dS1, p.zeta) + s2Zeta = PolyEvalGPU(inst.dev, inst.dS2, p.zeta) + } + + qcpzeta := make([]fr.Element, len(p.commitmentInfo)) + for i := range p.commitmentInfo { + if inst.lowMemory { + qcpzeta[i] = polyEvalParallel(inst.qcpCanonical[i], p.zeta) + } else { + qcpzeta[i] = PolyEvalGPU(inst.dev, inst.dQcp[i], p.zeta) + } + } + evalWG.Wait() + + bzuzeta := <-bzuzetaCh + p.proof.ZShiftedOpening.ClaimedValue.Set(&bzuzeta) + + var linPol []fr.Element + if inst.lowMemory { + linPol = innerComputeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.h1, p.h2, p.h3, + ) + } else { + linPol = computeLinearizedPoly( + inst, + blzeta, brzeta, bozeta, p.alpha, p.beta, p.gamma, p.zeta, bzuzeta, + s1Zeta, s2Zeta, qcpzeta, p.zBlinded, p.pi2Canonical, p.pi2DeviceReady, p.h1, p.h2, p.h3, + ) + } + p.h1, p.h2, p.h3, p.pi2Canonical, p.pi2DeviceReady = nil, nil, nil, nil, nil + + zOpenCommit, err := inst.commit(openZPoly[1:]) + if err != nil { + return err + } + p.proof.ZShiftedOpening.H = zOpenCommit + p.logTime("eval+linearize+open Z") + + linPolZetaCh := make(chan fr.Element, 1) + go func() { + linPolZetaCh <- polyEvalParallel(linPol, p.zeta) + }() + + linPolDigest, err := inst.commit(linPol) + if err != nil { + return err + } + p.logTime("MSM commit linPol") + + nPolysToOpen := 6 + len(inst.qcpCanonical) + claimedValues := make([]fr.Element, nPolysToOpen) + claimedValues[0] = <-linPolZetaCh + claimedValues[1] = blzeta + claimedValues[2] = brzeta + claimedValues[3] = bozeta + claimedValues[4] = s1Zeta + claimedValues[5] = s2Zeta + for i := range inst.qcpCanonical { + claimedValues[6+i] = qcpzeta[i] + } + + polysToOpen := make([][]fr.Element, nPolysToOpen) + polysToOpen[0] = linPol + polysToOpen[1] = p.lBlinded + polysToOpen[2] = p.rBlinded + polysToOpen[3] = p.oBlinded + polysToOpen[4] = inst.s1Canonical + polysToOpen[5] = inst.s2Canonical + for i := range inst.qcpCanonical { + polysToOpen[6+i] = inst.qcpCanonical[i] + } + + digestsToOpen := make([]curve.G1Affine, nPolysToOpen) + digestsToOpen[0] = linPolDigest + digestsToOpen[1] = p.proof.LRO[0] + digestsToOpen[2] = p.proof.LRO[1] + digestsToOpen[3] = p.proof.LRO[2] + digestsToOpen[4] = inst.vk.S[0] + digestsToOpen[5] = inst.vk.S[1] + copy(digestsToOpen[6:], inst.vk.Qcp) + + p.proof.BatchedProof, err = gpuBatchOpen( + inst.commit, + polysToOpen, digestsToOpen, claimedValues, + p.zeta, + p.kzgFoldingHash, + p.proof.ZShiftedOpening.ClaimedValue.Marshal(), + ) + if err != nil { + return fmt.Errorf("batch opening: %w", err) + } + p.logTime("batch opening") + return nil +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPUProve — top-level prove API +// ───────────────────────────────────────────────────────────────────────────── + +func GPUProve(dev *gpu.Device, gpk *GPUProvingKey, spr *cs.SparseR1CS, fullWitness witness.Witness, opts ...backend.ProverOption) (*curplonk.Proof, error) { + proverCfg, err := backend.NewProverConfig(opts...) + if err != nil { + return nil, fmt.Errorf("create prover config: %w", err) + } + if proverCfg.HashToFieldFn == nil { + proverCfg.HashToFieldFn = newHTF([]byte("BSB22-Plonk")) + } + + gpk.mu.Lock() + defer gpk.mu.Unlock() + + if gpk.Vk == nil { + return nil, errors.New("gpu: proving key missing verifying key") + } + + proveStart := time.Now() + logTime := func(label string) { + log.Printf(" [GPUProve n=%d] %s: %v", gpk.n, label, time.Since(proveStart)) + } + + var commitmentInfo constraint.PlonkCommitments + if spr.CommitmentInfo != nil { + commitmentInfo = spr.CommitmentInfo.(constraint.PlonkCommitments) + } + + nbCommitments := len(commitmentInfo) + newProof := &curplonk.Proof{ + Bsb22Commitments: make([]curve.G1Affine, nbCommitments), + } + + msmInstReady := make(chan struct{}) + commitInstReady := make(chan struct{}) + traceInstReady := make(chan struct{}) + var ( + msmInstPublishOnce sync.Once + commitInstPublishOnce sync.Once + traceInstPublishOnce sync.Once + msmInst *gpuInstance + commitInst *gpuInstance + traceInst *gpuInstance + msmInstErr error + commitInstErr error + traceInstErr error + ) + publishMSMInst := func(inst *gpuInstance, err error) { + msmInstPublishOnce.Do(func() { + if err != nil { + msmInstErr = err + } else { + msmInst = inst + } + close(msmInstReady) + }) + } + waitMSMInst := func() (*gpuInstance, error) { + <-msmInstReady + if msmInstErr != nil { + return nil, msmInstErr + } + if msmInst == nil { + return nil, errors.New("gpu instance initialization did not publish an MSM-ready instance") + } + return msmInst, nil + } + publishCommitInst := func(inst *gpuInstance, err error) { + commitInstPublishOnce.Do(func() { + if err != nil { + commitInstErr = err + } else { + commitInst = inst + } + close(commitInstReady) + }) + } + waitCommitInst := func() (*gpuInstance, error) { + <-commitInstReady + if commitInstErr != nil { + return nil, commitInstErr + } + if commitInst == nil { + return nil, errors.New("gpu instance initialization did not publish a commitment-ready instance") + } + return commitInst, nil + } + publishTraceInst := func(inst *gpuInstance, err error) { + traceInstPublishOnce.Do(func() { + if err != nil { + traceInstErr = err + } else { + traceInst = inst + gpk.inst = inst + } + close(traceInstReady) + }) + } + waitInst := func() (*gpuInstance, error) { + <-traceInstReady + if traceInstErr != nil { + return nil, traceInstErr + } + if traceInst == nil { + return nil, errors.New("gpu instance initialization did not publish a trace-ready instance") + } + return traceInst, nil + } + + p := &gpuProver{ + proof: *newProof, + fs: fiatshamir.NewTranscript(proverCfg.ChallengeHash, "gamma", "beta", "alpha", "zeta"), + commitmentInfo: commitmentInfo, + commitmentVal: make([]fr.Element, nbCommitments), + pi2Canonical: make([][]fr.Element, nbCommitments), + pi2DeviceReady: make([]bool, nbCommitments), + solverOpts: proverCfg.SolverOpts, + kzgFoldingHash: proverCfg.KZGFoldingHash, + htfFunc: proverCfg.HashToFieldFn, + logTime: logTime, + waitInst: waitInst, + waitMSMInst: waitMSMInst, + waitCommitInst: waitCommitInst, + } + + // Overlap CPU solve with blinding-polynomial init and Qk patching, then + // feed results into a sequential GPU pipeline. Hides the solve latency + // (~400 ms at n=2^18) behind unrelated work; recovers ~20-30% end-to-end. + chSolved := make(chan struct{}) + chBlinding := make(chan struct{}) + chQk := make(chan struct{}) + + g, gctx := errgroup.WithContext(context.Background()) + + waitCh := func(ch <-chan struct{}) error { + select { + case <-gctx.Done(): + return gctx.Err() + case <-ch: + return nil + } + } + safeGo := func(label string, fn func() error) { + g.Go(func() error { return proveStep(label, fn) }) + } + + safeGo("initGPUInstance", func() error { + if gpk.inst != nil && gpk.inst.dev == dev { + publishMSMInst(gpk.inst, nil) + publishCommitInst(gpk.inst, nil) + publishTraceInst(gpk.inst, nil) + return nil + } + if gpk.inst != nil { + gpk.inst.close() + gpk.inst = nil + } + msmPublished := false + commitPublished := false + tracePublished := false + inst, err := newGPUInstance(dev, gpk, spr, gpuInstanceReadyHooks{ + msm: func(inst *gpuInstance) { + msmPublished = true + publishMSMInst(inst, nil) + }, + commit: func(inst *gpuInstance) { + commitPublished = true + publishCommitInst(inst, nil) + }, + trace: func(inst *gpuInstance) { + tracePublished = true + publishTraceInst(inst, nil) + logTime("trace-ready GPU instance") + }, + }) + if err != nil { + err = fmt.Errorf("init GPU instance: %w", err) + if !msmPublished { + publishMSMInst(nil, err) + } + if !commitPublished { + publishCommitInst(nil, err) + } + if !tracePublished { + publishTraceInst(nil, err) + } + return err + } + if !msmPublished { + publishMSMInst(inst, nil) + } + if !commitPublished { + publishCommitInst(inst, nil) + } + if !tracePublished { + publishTraceInst(inst, nil) + } + logTime("init GPU instance") + return nil + }) + + safeGo("initBlinding", func() error { + p.initBlindingPolynomials() + close(chBlinding) + return nil + }) + + safeGo("solve", func() error { + if err := p.solve(spr, fullWitness); err != nil { + return err + } + logTime("solve") + close(chSolved) + return nil + }) + + safeGo("completeQk", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + p.completeQk() + close(chQk) + return nil + }) + + safeGo("pipeline", func() error { + if err := waitCh(chSolved); err != nil { + return err + } + commitInst, err := waitCommitInst() + if err != nil { + return err + } + if err := p.commitToLRO( + commitInst, + func() error { return waitCh(chQk) }, + func() error { return waitCh(chBlinding) }, + ); err != nil { + return err + } + if _, err := p.ensureInst(); err != nil { + return err + } + if err := p.deriveGammaBeta(); err != nil { + return err + } + if err := p.buildZAndCommit(); err != nil { + return err + } + if err := p.computeQuotientAndCommit(); err != nil { + return err + } + return p.openAndFinalize() // inst.gpuWork persists (owned by gpuInstance) + }) + + if err := g.Wait(); err != nil { + return nil, err + } + + logTime("total") + result := p.proof + return &result, nil +} + +// proveStep converts a panic in fn to a labeled error so goroutines +// surface panics as normal errors through the errgroup. +func proveStep(label string, fn func() error) (err error) { + defer func() { + if r := recover(); r != nil { + err = fmt.Errorf("%s panic: %v", label, r) + } + }() + return fn() +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helper functions (ported from gpu/plonk/prove.go) +// ───────────────────────────────────────────────────────────────────────────── + +func buildZGPU( + inst *gpuInstance, gpuWork *FrVector, + evalL, evalR, evalO fr.Vector, beta, gamma fr.Element, +) (fr.Vector, error) { + dev := inst.dev + domain0 := inst.domain0 + + gpuR := inst.qWb.R + gpuO := inst.qWb.O + if inst.lowMemory { + var err error + gpuR, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z R buffer: %w", err) + } + defer gpuR.Free() + gpuO, err = NewFrVector(inst.dev, inst.n) + if err != nil { + return nil, fmt.Errorf("alloc Z O buffer: %w", err) + } + defer gpuO.Free() + } + + gpuWork.CopyFromHost(evalL) + gpuR.CopyFromHost(evalR) + gpuO.CopyFromHost(evalO) + + gMul := domain0.FrMultiplicativeGen + var gSq fr.Element + gSq.Mul(&gMul, &gMul) + + PlonkZComputeFactors(gpuWork, gpuR, gpuO, inst.dPerm, + beta, gamma, gMul, gSq, inst.log2n, inst.fftDom) + gpuR.BatchInvert(gpuO) + gpuWork.Mul(gpuWork, gpuR) + ZPrefixProduct(dev, gpuR, gpuWork, gpuO) + gpuR.CopyToHost(inst.hBufs.zLagrange) + return inst.hBufs.zLagrange, nil +} + +func computeNumeratorGPU( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + if inst.lowMemory { + return computeNumeratorGPULowMemory( + inst, gpuWork, + lBlinded, rBlinded, oBlinded, zBlinded, + qkCanonical, pi2Canonical, + alpha, beta, gamma, + ) + } + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + // Pre-allocated buffers from gpuInstance (avoids per-proof cudaMalloc/Free). + wb := &inst.qWb + gpuL, gpuR, gpuO, gpuZ := wb.L, wb.R, wb.O, wb.Z + gpuS1, gpuS2, gpuS3 := wb.S1, wb.S2, wb.S3 + gpuResult := wb.Result + gpuLCan, gpuRCan, gpuOCan, gpuZCan := wb.LCan, wb.RCan, wb.OCan, wb.ZCan + gpuCosetBlocks := wb.CosetBlock + + // Event IDs used for cross-stream synchronisation in the 4-coset loop. + const ( + evS123Done gpu.EventID = 0 // StreamTransfer → StreamCompute: S1/S2/S3 D2D done + evPermDone gpu.EventID = 1 // StreamCompute → StreamTransfer: safe to overwrite gate buffers + evCosetDone gpu.EventID = 3 // StreamCompute → StreamTransfer: full coset k done + ) + + // L/R/O/Z canonical heads were produced on-device by the iFFT phases and + // adjusted for blinding there. Keep them resident for the quotient loop. + for j := range pi2Canonical { + if j >= len(pi2DeviceReady) || pi2DeviceReady[j] { + continue + } + if j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil && len(pi2Canonical[j]) == n { + wb.Pi2Src[j].CopyFromHost(fr.Vector(pi2Canonical[j])) + pi2DeviceReady[j] = true + } + } + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + // Stream 1 must finish before overwriting gpuS1/S2/S3 with the next coset's + // selectors. PermBoundary (end of previous coset) still holds reads on S1/S2/S3. + if k > 0 { + dev.WaitEvent(gpu.StreamTransfer, evCosetDone) + } + + // Stream 1: D2D perm selectors concurrent with L/R/O/Z reduce+FFT on stream 0. + gpuS1.CopyFromDeviceStream(inst.dS1, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dS2, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dS3, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + // Stream 0: reduce blinded canonicals and FFT while D2D runs concurrently. + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + // L₁ denominator inverse: gpuWork[i] = 1/(cosetGen·ω^i - 1) + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) // result is temp; inverses stored in gpuWork + + // l1Scalar = (cosetGen^n - 1) / n = zhZeta / n at this coset + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + // Gate selectors: overlap transfer-stream D2D copies with compute-stream FFTs. + dev.RecordEvent(gpu.StreamCompute, evPermDone) + + dev.WaitEvent(gpu.StreamTransfer, evPermDone) + gpuS1.CopyFromDeviceStream(inst.dQr, gpu.StreamTransfer) + gpuS2.CopyFromDeviceStream(inst.dQm, gpu.StreamTransfer) + gpuS3.CopyFromDeviceStream(inst.dQo, gpu.StreamTransfer) + gpuWork.CopyFromDeviceStream(wb.QkSrc, gpu.StreamTransfer) + dev.RecordEvent(gpu.StreamTransfer, evS123Done) + + gpuZ.CopyFromDevice(inst.dQl) + fftDom.CosetFFT(gpuZ, cosetGen) + + dev.WaitEvent(gpu.StreamCompute, evS123Done) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + gpuZ.CopyFromDevice(inst.dQcp[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(wb.Pi2Src) && wb.Pi2Src[j] != nil { + gpuWork.CopyFromDevice(wb.Pi2Src[j]) + } else { + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + // Store the first three coset results on GPU. Keep the fourth in gpuResult. + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + dev.RecordEvent(gpu.StreamCompute, evCosetDone) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func newLowMemorySelectorCache(inst *gpuInstance, allocated *[]*FrVector) lowMemorySelectorCache { + if os.Getenv("GNARK_GPU_PLONK2_DISABLE_LOW_MEMORY_SELECTOR_CACHE") != "" { + return lowMemorySelectorCache{} + } + + upload := func(name string, data fr.Vector) *FrVector { + v, err := NewFrVector(inst.dev, inst.n) + if err != nil { + log.Printf("plonk2: low-memory selector cache stopped at %s: %v", name, err) + return nil + } + *allocated = append(*allocated, v) + v.CopyFromHost(data) + return v + } + + cache := lowMemorySelectorCache{ + ql: upload("ql", inst.qlCanonical), + qr: upload("qr", inst.qrCanonical), + qm: upload("qm", inst.qmCanonical), + qo: upload("qo", inst.qoCanonical), + s1: upload("s1", inst.s1Canonical), + s2: upload("s2", inst.s2Canonical), + s3: upload("s3", inst.s3Canonical), + } + if len(inst.qcpCanonical) > 0 { + cache.qcp = make([]*FrVector, len(inst.qcpCanonical)) + for i := range inst.qcpCanonical { + cache.qcp[i] = upload(fmt.Sprintf("qcp[%d]", i), inst.qcpCanonical[i]) + } + } + + qcpCached := 0 + for i := range cache.qcp { + if cache.qcp[i] != nil { + qcpCached++ + } + } + log.Printf( + "plonk2: low-memory selector cache ql=%t qr=%t qm=%t qo=%t s1=%t s2=%t s3=%t qcp=%d/%d", + cache.ql != nil, cache.qr != nil, cache.qm != nil, cache.qo != nil, + cache.s1 != nil, cache.s2 != nil, cache.s3 != nil, + qcpCached, len(inst.qcpCanonical), + ) + return cache +} + +func computeNumeratorGPULowMemory( + inst *gpuInstance, gpuWork *FrVector, + lBlinded, rBlinded, oBlinded, zBlinded []fr.Element, + qkCanonical []fr.Element, pi2Canonical [][]fr.Element, + alpha, beta, gamma fr.Element, +) (h1, h2, h3 []fr.Element, retErr error) { + n := inst.n + dev := inst.dev + fftDom := inst.fftDom + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + if len(qkCanonical) < n { + return nil, nil, nil, fmt.Errorf("low-memory quotient: qk canonical length %d < %d", len(qkCanonical), n) + } + + var allocated []*FrVector + alloc := func(name string) (*FrVector, error) { + v, err := NewFrVector(inst.dev, n) + if err != nil { + return nil, fmt.Errorf("alloc %s: %w", name, err) + } + allocated = append(allocated, v) + return v, nil + } + defer func() { + for _, v := range allocated { + v.Free() + } + }() + + gpuL, err := alloc("L") + if err != nil { + return nil, nil, nil, err + } + gpuR, err := alloc("R") + if err != nil { + return nil, nil, nil, err + } + gpuO, err := alloc("O") + if err != nil { + return nil, nil, nil, err + } + gpuZ, err := alloc("Z") + if err != nil { + return nil, nil, nil, err + } + gpuS1, err := alloc("S1") + if err != nil { + return nil, nil, nil, err + } + gpuS2, err := alloc("S2") + if err != nil { + return nil, nil, nil, err + } + gpuS3, err := alloc("S3") + if err != nil { + return nil, nil, nil, err + } + gpuResult, err := alloc("Result") + if err != nil { + return nil, nil, nil, err + } + gpuLCan, err := alloc("LCan") + if err != nil { + return nil, nil, nil, err + } + gpuRCan, err := alloc("RCan") + if err != nil { + return nil, nil, nil, err + } + gpuOCan, err := alloc("OCan") + if err != nil { + return nil, nil, nil, err + } + gpuZCan, err := alloc("ZCan") + if err != nil { + return nil, nil, nil, err + } + gpuQkSrc, err := alloc("QkSrc") + if err != nil { + return nil, nil, nil, err + } + var gpuCosetBlocks [3]*FrVector + for k := range gpuCosetBlocks { + gpuCosetBlocks[k], err = alloc(fmt.Sprintf("CosetBlock%d", k)) + if err != nil { + return nil, nil, nil, err + } + } + selectorCache := newLowMemorySelectorCache(inst, &allocated) + copySelector := func(dst, device *FrVector, host fr.Vector) { + if device != nil { + dst.CopyFromDevice(device) + return + } + dst.CopyFromHost(host) + } + + gpuLCan.CopyFromHost(fr.Vector(lBlinded[:n])) + gpuRCan.CopyFromHost(fr.Vector(rBlinded[:n])) + gpuOCan.CopyFromHost(fr.Vector(oBlinded[:n])) + gpuZCan.CopyFromHost(fr.Vector(zBlinded[:n])) + gpuQkSrc.CopyFromHost(fr.Vector(qkCanonical[:n])) + + domain1 := fft.NewDomain(4*uint64(n), fft.WithoutPrecompute()) + u := domain1.FrMultiplicativeGen + g1 := domain1.Generator + var cosetShiftSq fr.Element + cosetShiftSq.Square(&cosetShift) + bn := big.NewInt(int64(n)) + var one fr.Element + one.SetOne() + + hFull := inst.hBufs.hFull + + var cosetGen fr.Element + for k := 0; k < 4; k++ { + if k == 0 { + cosetGen.Set(&u) + } else { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetPowN fr.Element + cosetPowN.Exp(cosetGen, bn) + + copySelector(gpuS1, selectorCache.s1, inst.s1Canonical) + copySelector(gpuS2, selectorCache.s2, inst.s2Canonical) + copySelector(gpuS3, selectorCache.s3, inst.s3Canonical) + + ReduceBlindedCoset(gpuL, gpuLCan, lBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuR, gpuRCan, rBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuO, gpuOCan, oBlinded[n:], cosetPowN) + ReduceBlindedCoset(gpuZ, gpuZCan, zBlinded[n:], cosetPowN) + fftDom.CosetFFT(gpuL, cosetGen) + fftDom.CosetFFT(gpuR, cosetGen) + fftDom.CosetFFT(gpuO, cosetGen) + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + + ComputeL1Den(gpuWork, cosetGen, fftDom) + gpuWork.BatchInvert(gpuResult) + + var l1Scalar fr.Element + l1Scalar.Sub(&cosetPowN, &one) + l1Scalar.Mul(&l1Scalar, &domain0.CardinalityInv) + + PlonkPermBoundary( + gpuResult, gpuL, gpuR, gpuO, gpuZ, + gpuS1, gpuS2, gpuS3, gpuWork, + alpha, beta, gamma, l1Scalar, + cosetShift, cosetShiftSq, cosetGen, + fftDom, + ) + + copySelector(gpuS1, selectorCache.qr, inst.qrCanonical) + copySelector(gpuS2, selectorCache.qm, inst.qmCanonical) + copySelector(gpuS3, selectorCache.qo, inst.qoCanonical) + gpuWork.CopyFromDevice(gpuQkSrc) + copySelector(gpuZ, selectorCache.ql, inst.qlCanonical) + + fftDom.CosetFFT(gpuZ, cosetGen) + fftDom.CosetFFT(gpuS1, cosetGen) + fftDom.CosetFFT(gpuS2, cosetGen) + fftDom.CosetFFT(gpuS3, cosetGen) + fftDom.CosetFFT(gpuWork, cosetGen) + + var zhKInv fr.Element + zhKInv.Sub(&cosetPowN, &one) + zhKInv.Inverse(&zhKInv) + + PlonkGateAccum(gpuResult, gpuZ, gpuS1, gpuS2, gpuS3, gpuWork, gpuL, gpuR, gpuO, zhKInv) + + for j := range pi2Canonical { + var qcpDevice *FrVector + if j < len(selectorCache.qcp) { + qcpDevice = selectorCache.qcp[j] + } + copySelector(gpuZ, qcpDevice, inst.qcpCanonical[j]) + fftDom.CosetFFT(gpuZ, cosetGen) + gpuWork.CopyFromHost(fr.Vector(pi2Canonical[j])) + fftDom.CosetFFT(gpuWork, cosetGen) + gpuZ.Mul(gpuZ, gpuWork) + gpuResult.AddScalarMul(gpuZ, zhKInv) + } + + if k < len(gpuCosetBlocks) { + gpuCosetBlocks[k].CopyFromDevice(gpuResult) + } + } + + blocks := [4]*FrVector{gpuCosetBlocks[0], gpuCosetBlocks[1], gpuCosetBlocks[2], gpuResult} + cosetGen.Set(&u) + for k := 0; k < 4; k++ { + if k > 0 { + cosetGen.Mul(&cosetGen, &g1) + } + var cosetGenInv fr.Element + cosetGenInv.Inverse(&cosetGen) + fftDom.CosetFFTInverse(blocks[k], cosetGenInv) + } + + var omega4Inv, quarter fr.Element + { + var omega4 fr.Element + omega4.Exp(g1, bn) + omega4Inv.Inverse(&omega4) + } + quarter.SetUint64(4) + quarter.Inverse(&quarter) + Butterfly4Inverse(blocks[0], blocks[1], blocks[2], blocks[3], omega4Inv, quarter) + + var uInvN fr.Element + { + var uN fr.Element + uN.Exp(u, bn) + uInvN.Inverse(&uN) + } + blocks[1].ScalarMul(uInvN) + var uInv2N, uInv3N fr.Element + uInv2N.Mul(&uInvN, &uInvN) + blocks[2].ScalarMul(uInv2N) + uInv3N.Mul(&uInv2N, &uInvN) + blocks[3].ScalarMul(uInv3N) + + if err := dev.Sync(); err != nil { + return nil, nil, nil, fmt.Errorf("low-memory quotient GPU sync: %w", err) + } + + for k := 0; k < 4; k++ { + blocks[k].CopyToHost(fr.Vector(hFull[k*n : (k+1)*n])) + } + + np2 := n + 2 + h1 = hFull[:np2] + h2 = hFull[np2 : 2*np2] + h3 = hFull[2*np2 : 3*np2] + return h1, h2, h3, nil +} + +func gpuCommit(msm *G1MSM, coeffs []fr.Element) (curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + var aff curve.G1Affine + aff.FromJacobian(&jacs[0]) + return aff, nil +} + +func gpuCommitN(msm *G1MSM, coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + jacs, err := msm.MultiExp(coeffSets...) + if err != nil { + return nil, err + } + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) commit(coeffs []fr.Element) (curve.G1Affine, error) { + commits, err := inst.commitN(coeffs) + if err != nil { + return curve.G1Affine{}, err + } + return commits[0], nil +} + +func (inst *gpuInstance) commitN(coeffSets ...[]fr.Element) ([]curve.G1Affine, error) { + if inst.lowMemory { + if err := inst.reloadMSMPoints(); err != nil { + return nil, fmt.Errorf("reload MSM points: %w", err) + } + defer func() { + _ = inst.releaseMSMWorkBuffers() + _ = inst.offloadMSMPoints() + }() + } + var jacs []curve.G1Jac + var err error + if inst.splitMSM != nil { + jacs, err = MultiExpSplitBatchAt(inst.splitMSM.msm0, inst.splitMSM.msm1, inst.splitMSM.split, coeffSets...) + } else { + jacs, err = inst.msm.MultiExp(coeffSets...) + } + if err != nil { + return nil, err + } + inst.logMSMPhaseTimings(coeffSets...) + affs := make([]curve.G1Affine, len(jacs)) + for i := range jacs { + affs[i].FromJacobian(&jacs[i]) + } + return affs, nil +} + +func (inst *gpuInstance) logMSMPhaseTimings(coeffSets ...[]fr.Element) { + if os.Getenv("GNARK_GPU_PLONK2_LOG_MSM_PHASES") == "" { + return + } + counts := make([]int, len(coeffSets)) + for i := range coeffSets { + counts[i] = len(coeffSets[i]) + } + if inst.splitMSM != nil { + primaryCounts := make([]int, len(coeffSets)) + secondaryCounts := make([]int, len(coeffSets)) + for i, count := range counts { + primaryCounts[i] = inst.splitMSM.split + if count < primaryCounts[i] { + primaryCounts[i] = count + } + secondaryCounts[i] = count - primaryCounts[i] + } + logMSMPhaseTimings(inst.n, "primary", inst.splitMSM.msm0.LastBatchPhaseTimings(), primaryCounts) + logMSMPhaseTimings(inst.n, "secondary", inst.splitMSM.msm1.LastBatchPhaseTimings(), secondaryCounts) + return + } + logMSMPhaseTimings(inst.n, "single", inst.msm.LastBatchPhaseTimings(), counts) +} + +func logMSMPhaseTimings(n int, device string, timings [][9]float32, scalarCounts []int) { + names := [...]string{ + "h2d", "build_pairs", "sort", "boundaries", "accum_seq", + "accum_par", "reduce_partial", "reduce_finalize", "d2h", + } + for i, phase := range timings { + total := float32(0) + for _, ms := range phase { + total += ms + } + scalars := 0 + if i < len(scalarCounts) { + scalars = scalarCounts[i] + } + log.Printf( + " [GPUProve n=%d] MSM phases device=%s set=%d scalars=%d total=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms %s=%.3fms", + n, device, i, scalars, total, + names[0], phase[0], names[1], phase[1], names[2], phase[2], + names[3], phase[3], names[4], phase[4], names[5], phase[5], + names[6], phase[6], names[7], phase[7], names[8], phase[8], + ) + } +} + +func gpuBatchOpen( + commit func([]fr.Element) (curve.G1Affine, error), + polys [][]fr.Element, + digests []curve.G1Affine, + claimedValues []fr.Element, + point fr.Element, + kzgFoldingHash hash.Hash, + dataTranscript []byte, +) (kzg.BatchOpeningProof, error) { + var res kzg.BatchOpeningProof + res.ClaimedValues = claimedValues + + fsGamma := fiatshamir.NewTranscript(kzgFoldingHash, "gamma") + if err := fsGamma.Bind("gamma", point.Marshal()); err != nil { + return res, err + } + for i := range digests { + if err := fsGamma.Bind("gamma", digests[i].Marshal()); err != nil { + return res, err + } + } + for i := range claimedValues { + if err := fsGamma.Bind("gamma", claimedValues[i].Marshal()); err != nil { + return res, err + } + } + if len(dataTranscript) > 0 { + if err := fsGamma.Bind("gamma", dataTranscript); err != nil { + return res, err + } + } + gammaByte, err := fsGamma.ComputeChallenge("gamma") + if err != nil { + return res, err + } + var gammaChallenge fr.Element + gammaChallenge.SetBytes(gammaByte) + + nbPolys := len(polys) + largestPoly := 0 + for _, p := range polys { + if len(p) > largestPoly { + largestPoly = len(p) + } + } + + gammas := make([]fr.Element, nbPolys) + gammas[0].SetOne() + for i := 1; i < nbPolys; i++ { + gammas[i].Mul(&gammas[i-1], &gammaChallenge) + } + + folded := make(fr.Vector, largestPoly) + nCPU := runtime.NumCPU() + chunkSize := (largestPoly + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < largestPoly; c += chunkSize { + start := c + end := start + chunkSize + if end > largestPoly { + end = largestPoly + } + wg.Add(1) + go func() { + defer wg.Done() + temp := make(fr.Vector, end-start) + for i := range nbPolys { + effEnd := end + if effEnd > len(polys[i]) { + effEnd = len(polys[i]) + } + if start >= effEnd { + continue + } + n := effEnd - start + t := fr.Vector(temp[:n]) + t.ScalarMul(fr.Vector(polys[i][start:effEnd]), &gammas[i]) + f := fr.Vector(folded[start:effEnd]) + f.Add(f, t) + } + }() + } + wg.Wait() + + var foldedEval fr.Element + for i := nbPolys - 1; i >= 0; i-- { + foldedEval.Mul(&foldedEval, &gammaChallenge).Add(&foldedEval, &claimedValues[i]) + } + folded[0].Sub(&folded[0], &foldedEval) + parallelHornerQuotient(folded, point) + h := folded[1:] + + res.H, err = commit(h) + if err != nil { + return res, err + } + return res, nil +} + +func computeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, pi2DeviceReady []bool, + h1, h2, h3 []fr.Element, +) []fr.Element { + n := inst.n + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + // Pre-allocated GPU buffers from gpuInstance (guaranteed non-nil after newGPUInstance). + gpuResult := inst.qWb.LinResult + gpuW := inst.qWb.LinW + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + PlonkLinearizeStatic( + gpuResult, inst.qWb.ZCan, inst.dS3, + inst.dQl, inst.dQr, inst.dQm, inst.dQo, inst.dQkFixed, + combinedZCoeff, s1, lZeta, rZeta, rl, oZeta, + ) + + for j := range qcpZeta { + if j < len(pi2DeviceReady) && pi2DeviceReady[j] && j < len(inst.qWb.Pi2Src) && inst.qWb.Pi2Src[j] != nil { + gpuW.CopyFromDevice(inst.qWb.Pi2Src[j]) + } else { + gpuW.CopyFromHost(fr.Vector(pi2Canonical[j])) + } + gpuResult.AddScalarMul(gpuW, qcpZeta[j]) + } + + var negCoeff fr.Element + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Mul(&negCoeff, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h3[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Mul(&zhZeta, &zetaNPlusTwo).Neg(&negCoeff) + gpuW.CopyFromHost(fr.Vector(h2[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + negCoeff.Neg(&zhZeta) + gpuW.CopyFromHost(fr.Vector(h1[:n])) + gpuResult.AddScalarMul(gpuW, negCoeff) + + gpuResult.CopyToHost(fr.Vector(blindedZCanonical[:n])) + + for i := n; i < len(blindedZCanonical); i++ { + var t fr.Element + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + return blindedZCanonical +} + +func innerComputeLinearizedPoly( + inst *gpuInstance, + lZeta, rZeta, oZeta, alpha, beta, gamma, zeta, zu fr.Element, + s1Zeta, s2Zeta fr.Element, + qcpZeta []fr.Element, blindedZCanonical []fr.Element, pi2Canonical [][]fr.Element, + h1, h2, h3 []fr.Element, +) []fr.Element { + domain0 := inst.domain0 + cosetShift := inst.vk.CosetShift + var rl fr.Element + rl.Mul(&rZeta, &lZeta) + var s1, tmp fr.Element + s1.Mul(&s1Zeta, &beta).Add(&s1, &lZeta).Add(&s1, &gamma) + tmp.Mul(&s2Zeta, &beta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s1.Mul(&s1, &tmp).Mul(&s1, &zu).Mul(&s1, &beta).Mul(&s1, &alpha) + var s2 fr.Element + var uzeta, uuzeta fr.Element + uzeta.Mul(&zeta, &cosetShift) + uuzeta.Mul(&uzeta, &cosetShift) + s2.Mul(&beta, &zeta).Add(&s2, &lZeta).Add(&s2, &gamma) + tmp.Mul(&beta, &uzeta).Add(&tmp, &rZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp) + tmp.Mul(&beta, &uuzeta).Add(&tmp, &oZeta).Add(&tmp, &gamma) + s2.Mul(&s2, &tmp).Neg(&s2).Mul(&s2, &alpha) + var zhZeta, zetaNPlusTwo, alphaSquareLagrangeZero, den fr.Element + nbElmt := int64(domain0.Cardinality) + alphaSquareLagrangeZero.Set(&zeta).Exp(alphaSquareLagrangeZero, big.NewInt(nbElmt)) + zetaNPlusTwo.Mul(&alphaSquareLagrangeZero, &zeta).Mul(&zetaNPlusTwo, &zeta) + one := fr.One() + alphaSquareLagrangeZero.Sub(&alphaSquareLagrangeZero, &one) + zhZeta.Set(&alphaSquareLagrangeZero) + den.Sub(&zeta, &one).Inverse(&den) + alphaSquareLagrangeZero.Mul(&alphaSquareLagrangeZero, &den). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &alpha). + Mul(&alphaSquareLagrangeZero, &domain0.CardinalityInv) + + s3can := []fr.Element(inst.s3Canonical) + cql := []fr.Element(inst.qlCanonical) + cqr := []fr.Element(inst.qrCanonical) + cqm := []fr.Element(inst.qmCanonical) + cqo := []fr.Element(inst.qoCanonical) + cqk := []fr.Element(inst.qkFixedCanonical) + + var combinedZCoeff fr.Element + combinedZCoeff.Add(&s2, &alphaSquareLagrangeZero) + + total := len(blindedZCanonical) + nCPU := runtime.NumCPU() + chunkSize := (total + nCPU - 1) / nCPU + var wg sync.WaitGroup + for c := 0; c < total; c += chunkSize { + start := c + end := start + chunkSize + if end > total { + end = total + } + wg.Add(1) + go func() { + defer wg.Done() + var t, t0, t1 fr.Element + for i := start; i < end; i++ { + t.Mul(&blindedZCanonical[i], &combinedZCoeff) + if i < len(s3can) { + t0.Mul(&s3can[i], &s1) + t.Add(&t, &t0) + } + if i < len(cqm) { + t1.Mul(&cqm[i], &rl) + t.Add(&t, &t1) + t0.Mul(&cql[i], &lZeta) + t.Add(&t, &t0) + t0.Mul(&cqr[i], &rZeta) + t.Add(&t, &t0) + t0.Mul(&cqo[i], &oZeta) + t.Add(&t, &t0) + t.Add(&t, &cqk[i]) + } + for j := range qcpZeta { + if i < len(pi2Canonical[j]) { + t0.Mul(&pi2Canonical[j][i], &qcpZeta[j]) + t.Add(&t, &t0) + } + } + if i < len(h3) { + var hv fr.Element + hv.Mul(&h3[i], &zetaNPlusTwo). + Add(&hv, &h2[i]). + Mul(&hv, &zetaNPlusTwo). + Add(&hv, &h1[i]). + Mul(&hv, &zhZeta) + t.Sub(&t, &hv) + } + blindedZCanonical[i] = t + } + }() + } + wg.Wait() + return blindedZCanonical +} + +// ─── Polynomial helpers ─────────────────────────────────────────────────────── + +func blindInto(dst []fr.Element, canonical []fr.Element, bp *iop.Polynomial) []fr.Element { + cbp := bp.Coefficients() + result := dst[:len(canonical)+len(cbp)] + copy(result, canonical) + copy(result[len(canonical):], cbp) + for i := 0; i < len(cbp); i++ { + result[i].Sub(&result[i], &cbp[i]) + } + return result +} + +func getRandomPolynomial(degree int) *iop.Polynomial { + coeffs := make([]fr.Element, degree+1) + for i := range coeffs { + coeffs[i].SetRandom() + } + return iop.NewPolynomial(&coeffs, iop.Form{Basis: iop.Canonical, Layout: iop.Regular}) +} + +func parallelHornerQuotient(poly []fr.Element, z fr.Element) { + n := len(poly) + nCPU := runtime.NumCPU() + if n < 4096 || nCPU < 2 { + for i := n - 2; i >= 0; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + return + } + chunkSize := (n + nCPU - 1) / nCPU + numChunks := (n + chunkSize - 1) / chunkSize + var wg sync.WaitGroup + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + wg.Add(1) + go func(lo, hi int) { + defer wg.Done() + for i := hi - 2; i >= lo; i-- { + var tmp fr.Element + tmp.Mul(&poly[i+1], &z) + poly[i].Add(&poly[i], &tmp) + } + }(lo, hi) + } + wg.Wait() + zk := expElement(z, chunkSize) + carries := make([]fr.Element, numChunks) + for c := numChunks - 2; c >= 0; c-- { + nextLo := (c + 1) * chunkSize + nextLen := chunkSize + if nextLo+nextLen > n { + nextLen = n - nextLo + } + zkc := zk + if nextLen != chunkSize { + zkc = expElement(z, nextLen) + } + var tmp fr.Element + tmp.Mul(&carries[c+1], &zkc) + carries[c].Add(&poly[nextLo], &tmp) + } + for c := range numChunks { + lo := c * chunkSize + hi := lo + chunkSize + if hi > n { + hi = n + } + if carries[c].IsZero() { + continue + } + wg.Add(1) + go func(lo, hi, c int) { + defer wg.Done() + var zPow fr.Element + zPow.Set(&z) + for i := hi - 1; i >= lo; i-- { + var corr fr.Element + corr.Mul(&zPow, &carries[c]) + poly[i].Add(&poly[i], &corr) + zPow.Mul(&zPow, &z) + } + }(lo, hi, c) + } + wg.Wait() +} + +func expElement(z fr.Element, exp int) fr.Element { + var base, acc fr.Element + base.Set(&z) + acc.SetOne() + for exp > 0 { + if exp&1 != 0 { + acc.Mul(&acc, &base) + } + base.Square(&base) + exp >>= 1 + } + return acc +} + +// ─── Fiat-Shamir helpers ────────────────────────────────────────────────────── + +func bindPublicData(fs *fiatshamir.Transcript, challenge string, vk *curplonk.VerifyingKey, publicInputs []fr.Element) error { + for _, f := range []func() []byte{ + func() []byte { return vk.S[0].Marshal() }, + func() []byte { return vk.S[1].Marshal() }, + func() []byte { return vk.S[2].Marshal() }, + func() []byte { return vk.Ql.Marshal() }, + func() []byte { return vk.Qr.Marshal() }, + func() []byte { return vk.Qm.Marshal() }, + func() []byte { return vk.Qo.Marshal() }, + func() []byte { return vk.Qk.Marshal() }, + } { + if err := fs.Bind(challenge, f()); err != nil { + return err + } + } + for i := range vk.Qcp { + if err := fs.Bind(challenge, vk.Qcp[i].Marshal()); err != nil { + return err + } + } + for i := range publicInputs { + if err := fs.Bind(challenge, publicInputs[i].Marshal()); err != nil { + return err + } + } + return nil +} + +func deriveRandomness(fs *fiatshamir.Transcript, challenge string, points ...*curve.G1Affine) (fr.Element, error) { + var buf [curve.SizeOfG1AffineUncompressed]byte + var r fr.Element + for _, p := range points { + buf = p.RawBytes() + if err := fs.Bind(challenge, buf[:]); err != nil { + return r, err + } + } + b, err := fs.ComputeChallenge(challenge) + if err != nil { + return r, err + } + r.SetBytes(b) + return r, nil +} + +func newHTF(domain []byte) hash.Hash { + return htf.New(domain) +} + +// ─── suppress unused imports ────────────────────────────────────────────────── +var _ = bits.TrailingZeros +var _ = unsafe.Pointer(nil) diff --git a/prover/gpu/plonk2/bw6761/prove_stub.go b/prover/gpu/plonk2/bw6761/prove_stub.go new file mode 100644 index 00000000000..ac491a02a18 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/prove_stub.go @@ -0,0 +1,34 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build !cuda + +package bw6761 + +import ( + "errors" + + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + "github.com/consensys/gnark/backend" + curplonk "github.com/consensys/gnark/backend/plonk/bw6-761" + "github.com/consensys/gnark/backend/witness" + cs "github.com/consensys/gnark/constraint/bw6-761" + "github.com/consensys/linea-monorepo/prover/gpu" +) + +type GPUProvingKey struct { + Vk *curplonk.VerifyingKey +} + +func NewGPUProvingKey(_ []curve.G1Affine, vk *curplonk.VerifyingKey) *GPUProvingKey { + return &GPUProvingKey{Vk: vk} +} + +func (gpk *GPUProvingKey) Size() int { return 0 } +func (gpk *GPUProvingKey) Prepare(_ *gpu.Device, _ *cs.SparseR1CS) error { + return errors.New("gpu: cuda required") +} +func (gpk *GPUProvingKey) Close() {} + +func GPUProve(_ *gpu.Device, _ *GPUProvingKey, _ *cs.SparseR1CS, _ witness.Witness, _ ...backend.ProverOption) (*curplonk.Proof, error) { + return nil, errors.New("gpu: cuda required") +} diff --git a/prover/gpu/plonk2/doc.go b/prover/gpu/plonk2/doc.go new file mode 100644 index 00000000000..552069c428f --- /dev/null +++ b/prover/gpu/plonk2/doc.go @@ -0,0 +1,40 @@ +// Package plonk2 is the curve-generic GPU PlonK prover used by the +// linea-monorepo prover binary. +// +// Layout: +// +// plonk2/ — multi-curve dispatcher (this package) +// plonk2/bls12377/ — BLS12-377 prover, used for compression +// plonk2/bn254/ — BN254 prover, used for aggregation BN254 emulation +// plonk2/bw6761/ — BW6-761 prover, used for aggregation +// +// The three per-curve packages are produced by gpu/internal/generator/plonk +// from a shared template. Re-emit them with `go run ./gpu/internal/generator` +// after editing the templates; the curve files are otherwise identical. +// +// Build tags: +// +// cuda — links against gpu/cuda/build/libgnark_gpu.a; full GPU acceleration +// !cuda — stub types; the dispatcher falls back to gnark's CPU prover +// +// Design constraints: +// +// - SoA layout for GPU field vectors (coalesced limb access in CUDA). +// - AoS Montgomery layout for host buffers (matches gnark-crypto). +// - One CUDA context per Device; the top-level gpu package owns lifecycle. +// - Pinned host staging buffers reused across rounds (see pinned_fr.go and +// prove.go's persistent work-buffer scope). +// - All multi-stream work (FFT/MSM/permutation) drains via the device's +// compute stream before any cross-stream sync. +// +// Activation in the linea-monorepo prover: +// +// - Compression auto-enables this prover whenever a GPU is reachable, via +// circuits.WithGPU(true) plumbed from backend/dataavailability/prove.go. +// - Aggregation only uses it when the operator opts in via the master flag +// LINEA_PROVER_GPU_AGGREGATION=1 (see backend/aggregation/prove.go). +// +// See gpu/plonk2/bls12377/prove.go for the per-curve top-level prover entry +// point — that is the right starting place when reviewing the GPU PlonK +// pipeline. +package plonk2 diff --git a/prover/gpu/plonk2/options.go b/prover/gpu/plonk2/options.go new file mode 100644 index 00000000000..8406c040aeb --- /dev/null +++ b/prover/gpu/plonk2/options.go @@ -0,0 +1,42 @@ +package plonk2 + +// Option configures a plonk2 Prover. +type Option func(*proverConfig) + +type proverConfig struct { + // enabled is the master kill-switch for the GPU path. When false, Prove + // returns gnark's CPU prover output without ever touching the device. + enabled bool + // cpuFallback controls whether a GPU-side error falls back to the CPU + // prover. Default true. Disabled by WithStrictMode for tests that must + // fail loudly when the GPU disagrees with the reference. + cpuFallback bool +} + +func defaultProverConfig() proverConfig { + return proverConfig{ + cpuFallback: true, + } +} + +// WithEnabled controls whether the GPU path is attempted. +func WithEnabled(enabled bool) Option { + return func(c *proverConfig) { c.enabled = enabled } +} + +// WithCPUFallback controls whether Prove falls back to gnark's CPU prover +// when the GPU path is disabled or returns an error. Default: true. +func WithCPUFallback(enabled bool) Option { + return func(c *proverConfig) { c.cpuFallback = enabled } +} + +// WithStrictMode disables the CPU fallback and returns errors instead. +// Used by tests that must fail when the GPU path errors, rather than +// silently falling through to the CPU and masking the bug. +func WithStrictMode(strict bool) Option { + return func(c *proverConfig) { + if strict { + c.cpuFallback = false + } + } +} diff --git a/prover/gpu/plonk2/prove.go b/prover/gpu/plonk2/prove.go new file mode 100644 index 00000000000..f37c080121c --- /dev/null +++ b/prover/gpu/plonk2/prove.go @@ -0,0 +1,197 @@ +package plonk2 + +import ( + "errors" + "fmt" + "log" + "reflect" + + "github.com/consensys/gnark/backend" + gnarkplonk "github.com/consensys/gnark/backend/plonk" + plonk_bls12377 "github.com/consensys/gnark/backend/plonk/bls12-377" + plonk_bn254 "github.com/consensys/gnark/backend/plonk/bn254" + plonk_bw6761 "github.com/consensys/gnark/backend/plonk/bw6-761" + "github.com/consensys/gnark/backend/witness" + "github.com/consensys/gnark/constraint" + cs_bls12377 "github.com/consensys/gnark/constraint/bls12-377" + cs_bn254 "github.com/consensys/gnark/constraint/bn254" + cs_bw6761 "github.com/consensys/gnark/constraint/bw6-761" + + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bls12377" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bn254" + "github.com/consensys/linea-monorepo/prover/gpu/plonk2/bw6761" +) + +// Prover is a multi-curve GPU PlonK prover dispatcher. +// +// CPU fallback is applied when: +// - The GPU is disabled (WithEnabled(false)) +// - The GPU path returns an error with WithCPUFallback(true) (default) +// - The constraint system curve is not supported +type Prover struct { + dev *gpu.Device + cfg proverConfig + ccs constraint.ConstraintSystem + pk gnarkplonk.ProvingKey + vk gnarkplonk.VerifyingKey + + // per-curve GPU proving keys — at most one is non-nil + bn254PK *bn254.GPUProvingKey + bls12377PK *bls12377.GPUProvingKey + bw6761PK *bw6761.GPUProvingKey + + closed bool +} + +// NewProver creates a GPU prover for the given constraint system and proving key. +// The prover inspects the curve ID and instantiates the appropriate per-curve GPU PK. +func NewProver(dev *gpu.Device, ccs constraint.ConstraintSystem, pk gnarkplonk.ProvingKey, vk gnarkplonk.VerifyingKey, opts ...Option) (*Prover, error) { + cfg := defaultProverConfig() + for _, o := range opts { + o(&cfg) + } + p := &Prover{dev: dev, cfg: cfg, ccs: ccs, pk: pk, vk: vk} + if cfg.enabled { + if err := p.initGPU(); err != nil { + if !cfg.cpuFallback { + return nil, fmt.Errorf("plonk2: init GPU: %w", err) + } + // GPU init failed but CPU fallback is enabled — continue with CPU only. + } + } + return p, nil +} + +func (p *Prover) initGPU() error { + switch gpk := p.pk.(type) { + case *plonk_bn254.ProvingKey: + p.bn254PK = bn254.NewGPUProvingKey(gpk.Kzg.G1, p.vk.(*plonk_bn254.VerifyingKey)) + case *plonk_bls12377.ProvingKey: + p.bls12377PK = bls12377.NewGPUProvingKey(gpk.Kzg.G1, p.vk.(*plonk_bls12377.VerifyingKey)) + case *plonk_bw6761.ProvingKey: + p.bw6761PK = bw6761.NewGPUProvingKey(gpk.Kzg.G1, p.vk.(*plonk_bw6761.VerifyingKey)) + default: + return fmt.Errorf("plonk2: unsupported proving key type %T", p.pk) + } + return nil +} + +// Prove generates a PlonK proof for the given witness. When the GPU path +// is enabled and the curve is supported, dispatches to the per-curve +// gpu/plonk2/.GPUProve. On error, falls back to gnark's CPU prover +// unless WithStrictMode/WithCPUFallback(false) was set. +func (p *Prover) Prove(w witness.Witness, opts ...backend.ProverOption) (gnarkplonk.Proof, error) { + if p.closed { + return nil, errors.New("plonk2: prover is closed") + } + if p.cfg.enabled && (p.bn254PK != nil || p.bls12377PK != nil || p.bw6761PK != nil) { + proof, err := p.proveGPU(w, opts...) + if err == nil { + return proof, nil + } + if !p.cfg.cpuFallback { + return nil, err + } + log.Printf("plonk2: GPU prove failed, falling back to CPU: %v", err) + } + if !p.cfg.cpuFallback { + return nil, errors.New("plonk2: GPU disabled and CPU fallback disabled") + } + return gnarkplonk.Prove(p.ccs, p.pk, w, opts...) +} + +func (p *Prover) proveGPU(w witness.Witness, opts ...backend.ProverOption) (gnarkplonk.Proof, error) { + switch { + case p.bn254PK != nil: + spr, ok := p.ccs.(*cs_bn254.SparseR1CS) + if !ok { + return nil, fmt.Errorf("plonk2: BN254 CCS type mismatch: got %T", p.ccs) + } + normalizeGkrScheduleLevels(spr.Blueprints) + return bn254.GPUProve(p.dev, p.bn254PK, spr, w, opts...) + case p.bls12377PK != nil: + spr, ok := p.ccs.(*cs_bls12377.SparseR1CS) + if !ok { + return nil, fmt.Errorf("plonk2: BLS12-377 CCS type mismatch: got %T", p.ccs) + } + normalizeGkrScheduleLevels(spr.Blueprints) + return bls12377.GPUProve(p.dev, p.bls12377PK, spr, w, opts...) + case p.bw6761PK != nil: + spr, ok := p.ccs.(*cs_bw6761.SparseR1CS) + if !ok { + return nil, fmt.Errorf("plonk2: BW6-761 CCS type mismatch: got %T", p.ccs) + } + normalizeGkrScheduleLevels(spr.Blueprints) + return bw6761.GPUProve(p.dev, p.bw6761PK, spr, w, opts...) + default: + return nil, errors.New("plonk2: no GPU proving key initialized") + } +} + +// normalizeGkrScheduleLevels rewrites pointer-typed GKR schedule levels into +// their value-typed equivalents in-place. gnark's solver hands us blueprints +// where the GKR schedule is a slice of interface values; some of those values +// are *constraint.GkrSkipLevel etc., others are constraint.GkrSkipLevel. The +// per-curve GPU prover assumes the value form (we never need to mutate them +// after the solver hands them over), so we deref each pointer once at the +// start of Prove. This keeps the per-curve switch in proveGPU stable. + +func normalizeGkrScheduleLevels(blueprints []constraint.Blueprint) { + scheduleType := reflect.TypeFor[constraint.GkrProvingSchedule]() + + for _, blueprint := range blueprints { + value := reflect.ValueOf(blueprint) + if value.Kind() != reflect.Pointer || value.IsNil() { + continue + } + + value = value.Elem() + if value.Kind() != reflect.Struct { + continue + } + + schedule := value.FieldByName("Schedule") + if !schedule.IsValid() || !schedule.CanSet() || schedule.Type() != scheduleType { + continue + } + + for i := range schedule.Len() { + level := schedule.Index(i) + switch typed := level.Interface().(type) { + case *constraint.GkrSkipLevel: + if typed != nil { + level.Set(reflect.ValueOf(*typed)) + } + case *constraint.GkrSingleSourceZeroCheckLevel: + if typed != nil { + level.Set(reflect.ValueOf(*typed)) + } + case *constraint.GkrSumcheckLevel: + if typed != nil { + level.Set(reflect.ValueOf(*typed)) + } + } + } + } +} + +// Close releases all GPU resources. +func (p *Prover) Close() { + if p.closed { + return + } + p.closed = true + if p.bn254PK != nil { + p.bn254PK.Close() + p.bn254PK = nil + } + if p.bls12377PK != nil { + p.bls12377PK.Close() + p.bls12377PK = nil + } + if p.bw6761PK != nil { + p.bw6761PK.Close() + p.bw6761PK = nil + } +} diff --git a/prover/gpu/plonk2/prove_test.go b/prover/gpu/plonk2/prove_test.go new file mode 100644 index 00000000000..152121b6791 --- /dev/null +++ b/prover/gpu/plonk2/prove_test.go @@ -0,0 +1,53 @@ +package plonk2 + +import ( + "testing" + + "github.com/consensys/gnark/constraint" + "github.com/stretchr/testify/require" +) + +type scheduleBlueprint struct { + Schedule constraint.GkrProvingSchedule +} + +func (scheduleBlueprint) CalldataSize() int { return 0 } + +func (scheduleBlueprint) NbConstraints() int { return 0 } + +func (scheduleBlueprint) NbOutputs(constraint.Instruction) int { return 0 } + +func (scheduleBlueprint) UpdateInstructionTree( + constraint.Instruction, + constraint.InstructionTree, +) constraint.Level { + return 0 +} + +func TestNormalizeGkrScheduleLevels_DecodedPointerLevels(t *testing.T) { + skip := constraint.GkrSkipLevel{ + Wires: []int{1}, + ClaimSources: []constraint.GkrClaimSource{{Level: 2}}, + } + single := constraint.GkrSingleSourceZeroCheckLevel{ + Wires: []int{3}, + ClaimSources: []constraint.GkrClaimSource{{Level: 4}}, + } + sumcheck := constraint.GkrSumcheckLevel{{ + Wires: []int{5}, + ClaimSources: []constraint.GkrClaimSource{{Level: 6}}, + }} + + blueprint := &scheduleBlueprint{ + Schedule: constraint.GkrProvingSchedule{&skip, &single, &sumcheck}, + } + + normalizeGkrScheduleLevels([]constraint.Blueprint{blueprint}) + + require.IsType(t, constraint.GkrSkipLevel{}, blueprint.Schedule[0]) + require.IsType(t, constraint.GkrSingleSourceZeroCheckLevel{}, blueprint.Schedule[1]) + require.IsType(t, constraint.GkrSumcheckLevel{}, blueprint.Schedule[2]) + require.Equal(t, skip, blueprint.Schedule[0]) + require.Equal(t, single, blueprint.Schedule[1]) + require.Equal(t, sumcheck, blueprint.Schedule[2]) +} diff --git a/prover/gpu/plonk2/stub.go b/prover/gpu/plonk2/stub.go new file mode 100644 index 00000000000..1ad530d1a2f --- /dev/null +++ b/prover/gpu/plonk2/stub.go @@ -0,0 +1,3 @@ +//go:build !cuda + +package plonk2 diff --git a/prover/gpu/quotient/quotient.go b/prover/gpu/quotient/quotient.go new file mode 100644 index 00000000000..b507d573a01 --- /dev/null +++ b/prover/gpu/quotient/quotient.go @@ -0,0 +1,626 @@ +// GPU-accelerated quotient computation with batch NTT + pinned H2D. +// +//go:build cuda + +package quotient + +import ( + "fmt" + "math/big" + "reflect" + "runtime" + "sync/atomic" + "time" + "unsafe" + + "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/consensys/gnark-crypto/field/koalabear/fft" + "github.com/consensys/linea-monorepo/prover/gpu" + gpusym "github.com/consensys/linea-monorepo/prover/gpu/symbolic" + gpuvortex "github.com/consensys/linea-monorepo/prover/gpu/vortex" + "github.com/consensys/linea-monorepo/prover/maths/common/fastpoly" + "github.com/consensys/linea-monorepo/prover/maths/common/fastpolyext" + "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" + sv "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/maths/field" + "github.com/consensys/linea-monorepo/prover/maths/field/fext" + "github.com/consensys/linea-monorepo/prover/protocol/coin" + "github.com/consensys/linea-monorepo/prover/protocol/column" + "github.com/consensys/linea-monorepo/prover/protocol/ifaces" + "github.com/consensys/linea-monorepo/prover/protocol/variables" + "github.com/consensys/linea-monorepo/prover/protocol/wizard" + "github.com/consensys/linea-monorepo/prover/symbolic" + "github.com/consensys/linea-monorepo/prover/utils" + "github.com/consensys/linea-monorepo/prover/utils/parallel" + "github.com/consensys/linea-monorepo/prover/utils/profiling" +) + +const maxGPUSlots = 8192 + +func RunGPU( + dev *gpu.Device, run *wizard.ProverRuntime, + domainSize int, ratios []int, + boards []symbolic.ExpressionBoard, + rootsForRatio [][]ifaces.Column, + shiftedForRatio [][]ifaces.Column, + quotientShares [][]ifaces.Column, + constraintsByRatio map[int][]int, +) error { + stopTimer := profiling.LogTimer("GPU quotient (domain size %d)", domainSize) + defer stopTimer() + + maxRatio := 0 + for _, r := range ratios { + if r > maxRatio { + maxRatio = r + } + } + + // ── Compile boards ─────────────────────────────────────────────────── + t0 := time.Now() + type cb struct { + pgm *gpusym.GPUSymProgram + meta []symbolic.Metadata + } + compiled := make([]cb, len(boards)) + var ( + gpuBoardCount int + cpuBoardCount int + maxBoardSlots int + fallbackBySlots int + ) + for k := range boards { + ops := gpusym.BoardToNodeOps(&boards[k]) + if len(ops) == 0 { + cpuBoardCount++ + continue + } + p := gpusym.CompileGPU(ops) + if p.NumSlots > maxBoardSlots { + maxBoardSlots = p.NumSlots + } + if len(p.Bytecode) == 0 || p.NumSlots > maxGPUSlots { + cpuBoardCount++ + if p.NumSlots > maxGPUSlots { + fallbackBySlots++ + } + continue + } + dp, err := gpusym.CompileSymGPU(dev, p) + if err != nil { + panic(fmt.Sprintf("gpu/quotient: compile[%d]: %v", k, err)) + } + defer dp.Free() + compiled[k] = cb{pgm: dp, meta: boards[k].ListVariableMetadata()} + gpuBoardCount++ + } + tCompile := time.Since(t0) + + // ── GPU NTT domain ─────────────────────────────────────────────────── + nttDomain, err := gpuvortex.NewGPUFFTDomain(dev, domainSize) + if err != nil { + return fmt.Errorf("gpu/quotient: NTT domain init (size %d): %w", domainSize, err) + } + defer nttDomain.Free() + var nInv field.Element + nInv.SetUint64(uint64(domainSize)) + nInv.Inverse(&nInv) + + cpuDomain0 := fft.NewDomain(uint64(domainSize), fft.WithCache()) + + var annBase []field.Element + var annExt []fext.Element + annBaseDone, annExtDone := false, false + + // ── Collect ALL roots, split base / ext ─────────────────────────────── + t0 = time.Now() + allRoots := make(map[ifaces.ColID]ifaces.Column) + for _, roots := range rootsForRatio { + for _, r := range roots { + allRoots[r.GetColID()] = r + } + } + + isExtRoot := make(map[ifaces.ColID]bool) + var baseRootIDs []ifaces.ColID + var extRootIDs []ifaces.ColID + for id, root := range allRoots { + w, ok := run.Columns.TryGet(id) + if !ok { + w = root.GetColAssignment(run) + } + if !smartvectors.IsBase(w) { + isExtRoot[id] = true + extRootIDs = append(extRootIDs, id) + } else { + baseRootIDs = append(baseRootIDs, id) + } + } + nBaseRoots := len(baseRootIDs) + nExtRoots := len(extRootIDs) + tRootSplit := time.Since(t0) + + ratioRootStats := make(map[int][2]int, len(constraintsByRatio)) + for ratio, constraintsIndices := range constraintsByRatio { + seen := make(map[ifaces.ColID]struct{}) + var baseCount, extCount int + for _, j := range constraintsIndices { + for _, root := range rootsForRatio[j] { + id := root.GetColID() + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + if isExtRoot[id] { + extCount++ + } else { + baseCount++ + } + } + } + ratioRootStats[ratio] = [2]int{baseCount, extCount} + } + + t0 = time.Now() + + // ── Ext root witness data (read once, cache coefficients in CPU memory) ── + extRootIdx := make(map[ifaces.ColID]int, nExtRoots) + for i, id := range extRootIDs { + extRootIdx[id] = i + } + extCoeffs := make([][]fext.Element, nExtRoots) + t0 = time.Now() + if nExtRoots > 0 { + parallel.Execute(nExtRoots, func(start, stop int) { + for k := start; k < stop; k++ { + id := extRootIDs[k] + w, ok := run.Columns.TryGet(id) + if !ok { + w = allRoots[id].GetColAssignment(run) + } + r := make([]fext.Element, domainSize) + w.WriteInSliceExt(r) + cpuDomain0.FFTInverseExt(r, fft.DIF, fft.WithNbTasks(1)) + extCoeffs[k] = r + } + }) + } + tExtIFFT := time.Since(t0) + + type ratioData struct { + baseIDs []ifaces.ColID + baseIdx map[ifaces.ColID]int + dPacked *gpuvortex.KBVector + dEvals *gpuvortex.KBVector + extIDs []ifaces.ColID + extIdx map[ifaces.ColID]int + dExtCoeffs *gpuvortex.KBVector // SoA, natural-order coefficients + dExtEvals *gpuvortex.KBVector // SoA, per-coset evaluations + extEvalPtrs []unsafe.Pointer // ptr to one root block in dExtEvals + } + + ratioPrepared := make(map[int]*ratioData, len(constraintsByRatio)) + var tPack, tH2D, tIFFT time.Duration + var tExtPrepGPU time.Duration + + for ratio, constraintsIndices := range constraintsByRatio { + rd := &ratioData{ + baseIdx: make(map[ifaces.ColID]int), + extIdx: make(map[ifaces.ColID]int), + } + seen := make(map[ifaces.ColID]struct{}) + for _, j := range constraintsIndices { + for _, root := range rootsForRatio[j] { + id := root.GetColID() + if _, ok := seen[id]; ok { + continue + } + seen[id] = struct{}{} + if isExtRoot[id] { + rd.extIdx[id] = len(rd.extIDs) + rd.extIDs = append(rd.extIDs, id) + } else { + rd.baseIdx[id] = len(rd.baseIDs) + rd.baseIDs = append(rd.baseIDs, id) + } + } + } + ratioRootStats[ratio] = [2]int{len(rd.baseIDs), len(rd.extIDs)} + + if len(rd.baseIDs) > 0 { + t0 = time.Now() + // Cached pinned buffer keyed on (deviceID, capacity). The + // first call on a given (device, ratio-shape) pays the + // cudaMallocHost; subsequent calls reuse it. This is the + // single biggest pre-optimization improvement for the + // quotient hot path — see gpu/vortex/pinned_cache.go. + deviceID := 0 + if dev != nil { + deviceID = dev.DeviceID() + } + capacity := len(rd.baseIDs) * domainSize + pinnedBuf := gpuvortex.GetPinned(deviceID, capacity)[:capacity] + parallel.Execute(len(rd.baseIDs), func(start, stop int) { + for k := start; k < stop; k++ { + id := rd.baseIDs[k] + w, ok := run.Columns.TryGet(id) + if !ok { + w = allRoots[id].GetColAssignment(run) + } + dst := pinnedBuf[k*domainSize : (k+1)*domainSize] + if reg, ok := w.(*sv.Regular); ok { + copy(dst, *reg) + } else { + w.WriteInSlice(dst) + } + } + }) + tPack += time.Since(t0) + + t0 = time.Now() + rd.dPacked, err = gpuvortex.NewKBVector(dev, len(rd.baseIDs)*domainSize) + if err != nil { + return fmt.Errorf("gpu/quotient: alloc dPacked (ratio %d, %d elems): %w", ratio, len(rd.baseIDs)*domainSize, err) + } + defer rd.dPacked.Free() + rd.dPacked.CopyFromHostPinned(pinnedBuf) + tH2D += time.Since(t0) + + t0 = time.Now() + nttDomain.BatchIFFTScale(rd.dPacked, len(rd.baseIDs), nInv) + gpuvortex.Sync(dev) + tIFFT += time.Since(t0) + // Pinned buffer stays in the cache; reused on next call. + + rd.dEvals, err = gpuvortex.NewKBVector(dev, len(rd.baseIDs)*domainSize) + if err != nil { + return fmt.Errorf("gpu/quotient: alloc dEvals (ratio %d, %d elems): %w", ratio, len(rd.baseIDs)*domainSize, err) + } + defer rd.dEvals.Free() + } + + if len(rd.extIDs) > 0 { + t0 = time.Now() + rd.dExtCoeffs, err = gpuvortex.NewKBVector(dev, len(rd.extIDs)*domainSize*4) + if err != nil { + return fmt.Errorf("gpu/quotient: alloc dExtCoeffs (ratio %d, %d elems): %w", ratio, len(rd.extIDs)*domainSize*4, err) + } + defer rd.dExtCoeffs.Free() + rd.dExtEvals, err = gpuvortex.NewKBVector(dev, len(rd.extIDs)*domainSize*4) + if err != nil { + return fmt.Errorf("gpu/quotient: alloc dExtEvals (ratio %d, %d elems): %w", ratio, len(rd.extIDs)*domainSize*4, err) + } + defer rd.dExtEvals.Free() + rd.extEvalPtrs = make([]unsafe.Pointer, len(rd.extIDs)) + for k := range rd.extIDs { + rd.extEvalPtrs[k] = unsafe.Add(rd.dExtEvals.DevicePtr(), k*domainSize*4*4) + } + + extSOA := make([]field.Element, len(rd.extIDs)*domainSize*4) + for k, id := range rd.extIDs { + globalIdx := extRootIdx[id] + coeffs := extCoeffs[globalIdx] + base := k * domainSize * 4 + dst0 := extSOA[base : base+domainSize] + dst1 := extSOA[base+domainSize : base+2*domainSize] + dst2 := extSOA[base+2*domainSize : base+3*domainSize] + dst3 := extSOA[base+3*domainSize : base+4*domainSize] + for j := range coeffs { + dst0[j] = coeffs[j].B0.A0 + dst1[j] = coeffs[j].B0.A1 + dst2[j] = coeffs[j].B1.A0 + dst3[j] = coeffs[j].B1.A1 + } + } + rd.dExtCoeffs.CopyFromHost(extSOA) + for vec := 0; vec < len(rd.extIDs)*4; vec++ { + ptr := unsafe.Add(rd.dExtCoeffs.DevicePtr(), vec*domainSize*4) + gpuvortex.BitRevRaw(dev, ptr, domainSize) + } + gpuvortex.Sync(dev) + tExtPrepGPU += time.Since(t0) + } + + ratioPrepared[ratio] = rd + } + + var tCosetPrep, tCosetNTT, tExtFFT, tSymEval time.Duration + var tSymInputs, tSymKernel, tSymPost, tSymAssign, tSymAuxVec, tSymAuxFree time.Duration + var symAuxVecCount int + + for i := 0; i < maxRatio; i++ { + for ratio, constraintsIndices := range constraintsByRatio { + if i%(maxRatio/ratio) != 0 { + continue + } + share := i * ratio / maxRatio + shift := computeShift(uint64(domainSize), ratio, share) + rd := ratioPrepared[ratio] + + t0 = time.Now() + cosetDomain := fft.NewDomain(uint64(domainSize), fft.WithShift(shift), fft.WithCache()) + tCosetPrep += time.Since(t0) + + // ── Batch coset FFT for ratio-specific base roots ─ + t0 = time.Now() + if rd.dPacked != nil { + rd.dEvals.CopyFromDevice(rd.dPacked) + nttDomain.BatchCosetFFTBitRev(rd.dEvals, len(rd.baseIDs), shift) + gpuvortex.Sync(dev) + } + tCosetNTT += time.Since(t0) + + evalPtr := func(id ifaces.ColID) unsafe.Pointer { + return unsafe.Add(rd.dEvals.DevicePtr(), rd.baseIdx[id]*domainSize*4) + } + + // ── Ratio-specific extension roots: GPU batch coset FFT on SoA blocks ─ + t0 = time.Now() + if rd.dExtCoeffs != nil { + rd.dExtEvals.CopyFromDevice(rd.dExtCoeffs) + nttDomain.BatchCosetFFTBitRev(rd.dExtEvals, len(rd.extIDs)*4, shift) + gpuvortex.Sync(dev) + } + tExtFFT += time.Since(t0) + + miscVecs := make(map[string]*gpuvortex.KBVector) + + // ── GPU symbolic eval ──────────────────────────────────── + t0 = time.Now() + for _, j := range constraintsIndices { + c := &compiled[j] + if c.pgm == nil { + cpuFallback(run, &boards[j], rootsForRatio[j], shiftedForRatio[j], + cpuDomain0, cosetDomain, quotientShares[j][share].GetColID(), + domainSize, i, maxRatio, &annBase, &annExt, &annBaseDone, &annExtDone) + continue + } + + tIn := time.Now() + inputs := make([]gpusym.SymInput, len(c.meta)) + for k, mi := range c.meta { + switch m := mi.(type) { + case ifaces.Column: + root := column.RootParents(m) + rid := root.GetColID() + isExt := isExtRoot[rid] + if shifted, isShifted := m.(column.Shifted); isShifted { + if isExt { + inputs[k] = gpusym.SymInput{Tag: gpusym.SymInputRotE4SOA, DPtr: rd.extEvalPtrs[rd.extIdx[rid]], Offset: shifted.Offset} + } else { + inputs[k] = gpusym.SymInput{Tag: gpusym.SymInputRotKB, DPtr: evalPtr(rid), Offset: shifted.Offset} + } + } else { + if isExt { + inputs[k] = gpusym.SymInput{Tag: gpusym.SymInputE4VecSOA, DPtr: rd.extEvalPtrs[rd.extIdx[rid]]} + } else { + inputs[k] = gpusym.SymInput{Tag: gpusym.SymInputKB, DPtr: evalPtr(rid)} + } + } + case coin.Info: + inputs[k] = gpusym.SymInputFromConst(run.GetRandomCoinFieldExt(m.Name)) + case variables.X: + key := fmt.Sprintf("X_%d", i) + if dV, ok := miscVecs[key]; ok { + inputs[k] = gpusym.SymInputFromVec(dV) + } else { + tAux := time.Now() + h := make([]field.Element, domainSize) + m.EvalCoset(domainSize, i, maxRatio, true).WriteInSlice(h) + dV, auxErr := gpuvortex.NewKBVector(dev, domainSize) + if auxErr != nil { + return fmt.Errorf("gpu/quotient: alloc X vec: %w", auxErr) + } + dV.CopyFromHost(h) + miscVecs[key] = dV + tSymAuxVec += time.Since(tAux) + symAuxVecCount++ + inputs[k] = gpusym.SymInputFromVec(dV) + } + case variables.PeriodicSample: + key := fmt.Sprintf("%s_%d", m.String(), i) + if dV, ok := miscVecs[key]; ok { + inputs[k] = gpusym.SymInputFromVec(dV) + } else { + tAux := time.Now() + h := make([]field.Element, domainSize) + m.EvalCoset(domainSize, i, maxRatio, true).WriteInSlice(h) + dV, auxErr := gpuvortex.NewKBVector(dev, domainSize) + if auxErr != nil { + return fmt.Errorf("gpu/quotient: alloc PeriodicSample vec: %w", auxErr) + } + dV.CopyFromHost(h) + miscVecs[key] = dV + tSymAuxVec += time.Since(tAux) + symAuxVecCount++ + inputs[k] = gpusym.SymInputFromVec(dV) + } + case ifaces.Accessor: + if m.IsBase() { + v := m.GetVal(run) + var e4 fext.Element + fext.SetFromBase(&e4, &v) + inputs[k] = gpusym.SymInputFromConst(e4) + } else { + inputs[k] = gpusym.SymInputFromConst(m.GetValExt(run)) + } + default: + utils.Panic("unknown metadata type %v", reflect.TypeOf(mi)) + } + } + tSymInputs += time.Since(tIn) + + tK := time.Now() + result := gpusym.EvalSymGPU(dev, c.pgm, inputs, domainSize) + tSymKernel += time.Since(tK) + + tP := time.Now() + var assigned sv.SmartVector + // Three host-side passes used to be sequential: + // 1. allBaseField detection (n iters of fext.IsBase) + // 2. extract B0.A0 into []field.Element (n iters) + // 3. parallel ScalarMul by annBase[i] + // Total ~3.3 ms at n=1M. + // + // Now: parallel detect + fused parallel extract+scalarMul. + // In the all-base common case the standalone ScalarMul is + // folded into the extract pass. + // + // Use parallel.Execute (NOT ExecuteChunky — the latter + // dispatches a goroutine per single iteration, fatal at + // n=1M). + var anyNonBase atomic.Bool + parallel.Execute(len(result), func(lo, hi int) { + for k := lo; k < hi; k++ { + if !fext.IsBase(&result[k]) { + anyNonBase.Store(true) + return + } + } + }) + allBaseField := !anyNonBase.Load() + + if allBaseField { + if !annBaseDone { + annBase = fastpoly.EvalXnMinusOneOnACoset(domainSize, domainSize*maxRatio) + annBase = field.ParBatchInvert(annBase, runtime.GOMAXPROCS(0)) + annBaseDone = true + } + // Fused parallel extract + ScalarMul: br[k] = result[k].B0.A0 * ann. + // Eliminates the standalone extract loop AND the standalone + // vq.ScalarMul pass. + br := make([]field.Element, domainSize) + ann := annBase[i] + parallel.Execute(len(result), func(lo, hi int) { + for k := lo; k < hi; k++ { + br[k].Mul(&result[k].B0.A0, &ann) + } + }) + assigned = sv.NewRegular(br) + } else { + if !annExtDone { + annExt = fastpolyext.EvalXnMinusOneOnACoset(domainSize, domainSize*maxRatio) + annExt = fext.ParBatchInvert(annExt, runtime.GOMAXPROCS(0)) + annExtDone = true + } + extensions.Vector(result).ScalarMul(extensions.Vector(result), &annExt[i]) + assigned = sv.NewRegularExt(result) + } + tSymPost += time.Since(tP) + + tA := time.Now() + run.AssignColumn(quotientShares[j][share].GetColID(), assigned) + tSymAssign += time.Since(tA) + } + tSymEval += time.Since(t0) + + tF := time.Now() + for _, v := range miscVecs { + v.Free() + } + tSymAuxFree += time.Since(tF) + } + } + + fmt.Printf("gpu/quotient TIMING: compile=%v rootSplit=%v extIFFT=%v pack=%v h2d=%v ifft=%v cosetPrep=%v cosetNTT=%v extFFT=%v symEval=%v (inputs=%v kernel=%v post=%v assign=%v auxBuild=%v auxFree=%v auxVecs=%d) | %d base, %d ext roots | boards gpu=%d cpu=%d maxSlots=%d slotFallback=%d\n", + tCompile, tRootSplit, tExtIFFT, tPack, tH2D, tIFFT, tCosetPrep, tCosetNTT, tExtFFT, tSymEval, + tSymInputs, tSymKernel, tSymPost, tSymAssign, tSymAuxVec, tSymAuxFree, symAuxVecCount, + nBaseRoots, nExtRoots, gpuBoardCount, cpuBoardCount, maxBoardSlots, fallbackBySlots) + fmt.Printf("gpu/quotient ROOTS per ratio: %v\n", ratioRootStats) + return nil +} + +func computeShift(n uint64, cosetRatio int, cosetID int) field.Element { + var s field.Element + g := fft.GeneratorFullMultiplicativeGroup() + omega, _ := fft.Generator(n * uint64(cosetRatio)) + omega.Exp(omega, new(big.Int).SetInt64(int64(cosetID))) + s.Mul(&g, &omega) + return s +} + +func cpuFallback( + run *wizard.ProverRuntime, board *symbolic.ExpressionBoard, + roots, shifted []ifaces.Column, domain0, cosetDomain *fft.Domain, + colID ifaces.ColID, domainSize, cosetIdx, maxRatio int, + annBase *[]field.Element, annExt *[]fext.Element, annBaseDone, annExtDone *bool, +) { + reeval := make(map[ifaces.ColID]sv.SmartVector) + for _, root := range roots { + id := root.GetColID() + if _, ok := reeval[id]; ok { + continue + } + w, ok := run.Columns.TryGet(id) + if !ok { + w = root.GetColAssignment(run) + } + if smartvectors.IsBase(w) { + r := make([]field.Element, domainSize) + w.WriteInSlice(r) + domain0.FFTInverse(r, fft.DIF, fft.WithNbTasks(2)) + cosetDomain.FFT(r, fft.DIT, fft.OnCoset(), fft.WithNbTasks(2)) + reeval[id] = sv.NewRegular(r) + } else { + r := make([]fext.Element, domainSize) + w.WriteInSliceExt(r) + domain0.FFTInverseExt(r, fft.DIF, fft.WithNbTasks(2)) + cosetDomain.FFTExt(r, fft.DIT, fft.OnCoset(), fft.WithNbTasks(2)) + reeval[id] = sv.NewRegularExt(r) + } + } + for _, pol := range shifted { + pid := pol.GetColID() + if _, ok := reeval[pid]; ok { + continue + } + rt := column.RootParents(pol) + if s, ok := pol.(column.Shifted); ok { + switch v := reeval[rt.GetColID()].(type) { + case *sv.Regular: + reeval[pid] = sv.SoftRotate(v, s.Offset) + case *sv.RegularExt: + reeval[pid] = sv.SoftRotateExt(v, s.Offset) + } + } + } + metas := board.ListVariableMetadata() + ins := make([]sv.SmartVector, len(metas)) + for k, mi := range metas { + switch m := mi.(type) { + case ifaces.Column: + ins[k] = reeval[m.GetColID()] + case coin.Info: + ins[k] = sv.NewConstantExt(run.GetRandomCoinFieldExt(m.Name), domainSize) + case variables.X: + ins[k] = m.EvalCoset(domainSize, cosetIdx, maxRatio, true) + case variables.PeriodicSample: + ins[k] = m.EvalCoset(domainSize, cosetIdx, maxRatio, true) + case ifaces.Accessor: + if m.IsBase() { + ins[k] = sv.NewConstant(m.GetVal(run), domainSize) + } else { + ins[k] = sv.NewConstantExt(m.GetValExt(run), domainSize) + } + } + } + qs := board.Evaluate(ins) + switch q := qs.(type) { + case *sv.Regular: + if !*annBaseDone { + *annBase = fastpoly.EvalXnMinusOneOnACoset(domainSize, domainSize*maxRatio) + *annBase = field.ParBatchInvert(*annBase, runtime.GOMAXPROCS(0)) + *annBaseDone = true + } + vq := field.Vector(*q) + vq.ScalarMul(vq, &(*annBase)[cosetIdx]) + case *sv.RegularExt: + if !*annExtDone { + *annExt = fastpolyext.EvalXnMinusOneOnACoset(domainSize, domainSize*maxRatio) + *annExt = fext.ParBatchInvert(*annExt, runtime.GOMAXPROCS(0)) + *annExtDone = true + } + extensions.Vector(*q).ScalarMul(extensions.Vector(*q), &(*annExt)[cosetIdx]) + } + run.AssignColumn(colID, qs) +} diff --git a/prover/gpu/quotient/quotient_test.go b/prover/gpu/quotient/quotient_test.go new file mode 100644 index 00000000000..b189d168212 --- /dev/null +++ b/prover/gpu/quotient/quotient_test.go @@ -0,0 +1,81 @@ +//go:build cuda + +package quotient + +import ( + "testing" + + "github.com/consensys/gnark-crypto/field/koalabear/fft" + "github.com/consensys/linea-monorepo/prover/gpu" + gpuvortex "github.com/consensys/linea-monorepo/prover/gpu/vortex" + "github.com/consensys/linea-monorepo/prover/maths/field" +) + +// TestGPUNTTCosetEval verifies the GPU NTT pipeline matches the CPU +// for the full IFFT → coset FFT sequence used in quotient computation. +// +// GPU convention: +// BitReverse → FFTInverse → Scale(1/n) → coefficients +// CopyFromDevice → CosetFFT(shift) → BitReverse → evaluations +// +// Note: GPU FFTInverse does NOT include 1/n normalization (unlike gnark-crypto). +func TestGPUNTTCosetEval(t *testing.T) { + dev := gpu.GetDevice() + if dev == nil { + t.Skip("no GPU") + } + + const n = 1024 + domain0 := fft.NewDomain(uint64(n), fft.WithCache()) + nttDom, _ := gpuvortex.NewGPUFFTDomain(dev, n) + defer nttDom.Free() + + witness := make([]field.Element, n) + for i := range witness { + witness[i].SetUint64(uint64(i + 1)) + } + + shift := computeShift(uint64(n), 2, 0) + cosetDomain := fft.NewDomain(uint64(n), fft.WithShift(shift), fft.WithCache()) + + // CPU reference: IFFT(DIF) → FFT(DIT, OnCoset) + cpuResult := make([]field.Element, n) + copy(cpuResult, witness) + domain0.FFTInverse(cpuResult, fft.DIF, fft.WithNbTasks(1)) + cosetDomain.FFT(cpuResult, fft.DIT, fft.OnCoset(), fft.WithNbTasks(1)) + + // GPU: BitRev → IFFT → Scale(1/n) → D2D → CosetFFT → BitRev + dVec, _ := gpuvortex.NewKBVector(dev, n) + defer dVec.Free() + dVec.CopyFromHost(witness) + dVec.BitReverse() + nttDom.FFTInverse(dVec) + var nInv field.Element + nInv.SetUint64(uint64(n)) + nInv.Inverse(&nInv) + dVec.Scale(nInv) + + dEval, _ := gpuvortex.NewKBVector(dev, n) + defer dEval.Free() + dEval.CopyFromDevice(dVec) + nttDom.CosetFFT(dEval, shift) + dEval.BitReverse() + + gpuResult := make([]field.Element, n) + dEval.CopyToHost(gpuResult) + + mismatches := 0 + for i := range cpuResult { + if cpuResult[i] != gpuResult[i] { + mismatches++ + if mismatches <= 3 { + t.Errorf("[%d] cpu=%v gpu=%v", i, cpuResult[i], gpuResult[i]) + } + } + } + if mismatches > 0 { + t.Errorf("total mismatches: %d/%d", mismatches, n) + } else { + t.Log("GPU coset evaluation matches CPU: PASS") + } +} diff --git a/prover/gpu/quotient/stub.go b/prover/gpu/quotient/stub.go new file mode 100644 index 00000000000..ae3860dae6f --- /dev/null +++ b/prover/gpu/quotient/stub.go @@ -0,0 +1,25 @@ +//go:build !cuda + +// Stub for non-CUDA builds. Guard calls with gpu.Enabled. +package quotient + +import ( + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/protocol/ifaces" + "github.com/consensys/linea-monorepo/prover/protocol/wizard" + "github.com/consensys/linea-monorepo/prover/symbolic" +) + +func RunGPU( + _ *gpu.Device, + _ *wizard.ProverRuntime, + _ int, + _ []int, + _ []symbolic.ExpressionBoard, + _ [][]ifaces.Column, + _ [][]ifaces.Column, + _ [][]ifaces.Column, + _ map[int][]int, +) error { + panic("gpu: cuda required") +} diff --git a/prover/gpu/singleton.go b/prover/gpu/singleton.go new file mode 100644 index 00000000000..c609d0a5d8d --- /dev/null +++ b/prover/gpu/singleton.go @@ -0,0 +1,212 @@ +package gpu + +import ( + "fmt" + "os" + "runtime" + "runtime/debug" + "strconv" + "sync" +) + +const ( + EnvDeviceID = "LINEA_PROVER_GPU_DEVICE_ID" + + // EnvAggregation is the master feature flag that opts the aggregation + // pipeline (public-input wizard, BW6 aggregation, BN254 emulation) into + // the GPU prover. Compression always uses the GPU when one is available; + // aggregation only does so when this is set, since the aggregation GPU + // path also depends on optional PI Vortex/quotient kernels that we want + // to keep behind a flag for now. + EnvAggregation = "LINEA_PROVER_GPU_AGGREGATION" +) + +// HasDevice reports whether the binary was built with the cuda tag and a GPU +// device is reachable on this host. It is the canonical check for "should we +// route compute to the GPU?". Cheap (memoised behind sync.Once for device 0). +func HasDevice() bool { + if !Enabled { + return false + } + return GetDevice() != nil +} + +// IsAggregationEnabled reports whether GPU dispatch should be used for the +// aggregation pipeline. Returns true only when both a GPU is available AND +// the operator has opted in via $LINEA_PROVER_GPU_AGGREGATION=1. +func IsAggregationEnabled() bool { + if !HasDevice() { + return false + } + switch os.Getenv(EnvAggregation) { + case "1", "true", "TRUE", "True", "yes", "YES", "on", "ON": + return true + } + return false +} + +var ( + defaultDevOnce sync.Once + defaultDev *Device + + devicesMu sync.Mutex + devices = map[int]*Device{} + + deviceCountOnce sync.Once + deviceCount int +) + +// GetDevice returns a lazily-initialized default GPU device (device 0). +// Returns nil when GPU is not available (no CUDA build tag or init failure). +// Thread-safe; the device is created at most once per process. +// +// Side effect: raises GOGC to 1000 (if currently lower) when a device is +// successfully created. CUDA's pinned-memory allocations interact poorly +// with Go's GC at the default GOGC=100, adding seconds of overhead per +// GC cycle on large heaps. +func GetDevice() *Device { + return GetDeviceN(0) +} + +// GetDeviceN returns a lazily-initialized GPU device by ID, creating one if +// needed. Each id is initialized at most once. Returns nil when GPU is not +// available or device init fails. +// +// On multi-GPU hosts, callers route work to a specific device (e.g. by +// segment-index modulo DeviceCount()) and must keep all activity for a +// given segment on a single device — buffers and contexts do not migrate. +// +// The first successful device-init also raises GOGC to 1000; see GetDevice. +func GetDeviceN(id int) *Device { + if !Enabled || id < 0 { + return nil + } + + if id == 0 { + defaultDevOnce.Do(func() { + dev, err := New(WithDeviceID(0)) + if err != nil { + return + } + defaultDev = dev + raiseGOGC() + }) + return defaultDev + } + + devicesMu.Lock() + defer devicesMu.Unlock() + if d, ok := devices[id]; ok { + return d + } + dev, err := New(WithDeviceID(id)) + if err != nil { + return nil + } + devices[id] = dev + raiseGOGC() + return dev +} + +// DeviceCount returns the number of GPUs the prover is configured to use, +// read from $LIMITLESS_GPU_COUNT (default 1). Returns 0 when GPU is disabled. +// Read once per process. +func DeviceCount() int { + if !Enabled { + return 0 + } + deviceCountOnce.Do(func() { + deviceCount = 1 + if v := os.Getenv("LIMITLESS_GPU_COUNT"); v != "" { + if n, err := strconv.Atoi(v); err == nil && n > 0 { + deviceCount = n + } + } + }) + return deviceCount +} + +// ConfiguredDeviceID parses LINEA_PROVER_GPU_DEVICE_ID. When unset, callers +// should use the default device routing. +func ConfiguredDeviceID() (id int, configured bool, err error) { + raw := os.Getenv(EnvDeviceID) + if raw == "" { + return 0, false, nil + } + id, err = strconv.Atoi(raw) + if err != nil { + return 0, true, fmt.Errorf("invalid %s %q: %w", EnvDeviceID, raw, err) + } + if id < 0 { + return 0, true, fmt.Errorf("%s must be non-negative, got %d", EnvDeviceID, id) + } + return id, true, nil +} + +// DeviceFromEnvOrCurrent returns the explicitly configured GPU, when +// LINEA_PROVER_GPU_DEVICE_ID is set, or the GPU currently pinned to this OS +// thread. If neither is configured it falls back to device 0. +func DeviceFromEnvOrCurrent() (*Device, int, error) { + id, configured, err := ConfiguredDeviceID() + if err != nil { + return nil, 0, err + } + if !configured { + dev := CurrentDevice() + if dev == nil { + return nil, CurrentDeviceID(), nil + } + return dev, dev.DeviceID(), nil + } + if !Enabled { + return nil, id, fmt.Errorf("%s=%d requires a binary built with the cuda tag", EnvDeviceID, id) + } + dev := GetDeviceN(id) + if dev == nil { + return nil, id, fmt.Errorf("GPU device %d is not available", id) + } + if err := dev.Bind(); err != nil { + return nil, id, fmt.Errorf("bind GPU device %d: %w", id, err) + } + SetCurrentDevice(dev) + SetCurrentDeviceID(id) + return dev, id, nil +} + +// PinConfiguredDevice locks the current goroutine to its OS thread and binds +// LINEA_PROVER_GPU_DEVICE_ID for process-level GPU work. The returned cleanup +// function must be called by the same goroutine. +func PinConfiguredDevice() (id int, configured bool, cleanup func(), err error) { + cleanup = func() {} + id, configured, err = ConfiguredDeviceID() + if err != nil || !configured { + return id, configured, cleanup, err + } + if !Enabled { + return id, configured, cleanup, + fmt.Errorf("%s=%d requires a binary built with the cuda tag", EnvDeviceID, id) + } + dev := GetDeviceN(id) + if dev == nil { + return id, configured, cleanup, fmt.Errorf("GPU device %d is not available", id) + } + runtime.LockOSThread() + if err := dev.Bind(); err != nil { + runtime.UnlockOSThread() + return id, configured, cleanup, fmt.Errorf("bind GPU device %d: %w", id, err) + } + SetCurrentDevice(dev) + SetCurrentDeviceID(id) + return id, configured, func() { + SetCurrentDevice(nil) + SetCurrentDeviceID(0) + runtime.UnlockOSThread() + }, nil +} + +func raiseGOGC() { + const minGOGC = 1000 + if cur := debug.SetGCPercent(minGOGC); cur > minGOGC { + debug.SetGCPercent(cur) // restore if already higher + } +} diff --git a/prover/gpu/singleton_cuda_test.go b/prover/gpu/singleton_cuda_test.go new file mode 100644 index 00000000000..21eb407b355 --- /dev/null +++ b/prover/gpu/singleton_cuda_test.go @@ -0,0 +1,33 @@ +//go:build cuda + +package gpu + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestDeviceFromEnvOrCurrent_CUDA(t *testing.T) { + t.Setenv(EnvDeviceID, "0") + + dev, id, err := DeviceFromEnvOrCurrent() + if err != nil { + t.Skipf("CUDA device 0 unavailable: %v", err) + } + require.NotNil(t, dev, "configured CUDA device should be available") + require.Equal(t, 0, id, "configured device id should be returned") + require.Equal(t, 0, dev.DeviceID(), "selected device should match the env var") +} + +func TestDeviceFromEnvOrCurrent_CUDADevice1(t *testing.T) { + t.Setenv(EnvDeviceID, "1") + + dev, id, err := DeviceFromEnvOrCurrent() + if err != nil { + t.Skipf("CUDA device 1 unavailable: %v", err) + } + require.NotNil(t, dev, "configured CUDA device should be available") + require.Equal(t, 1, id, "configured device id should be returned") + require.Equal(t, 1, dev.DeviceID(), "selected device should match the env var") +} diff --git a/prover/gpu/singleton_test.go b/prover/gpu/singleton_test.go new file mode 100644 index 00000000000..314e18c2eab --- /dev/null +++ b/prover/gpu/singleton_test.go @@ -0,0 +1,43 @@ +package gpu + +import ( + "testing" + + "github.com/stretchr/testify/require" +) + +func TestConfiguredDeviceID(t *testing.T) { + t.Run("unset", func(t *testing.T) { + t.Setenv(EnvDeviceID, "") + + id, configured, err := ConfiguredDeviceID() + require.NoError(t, err) + require.False(t, configured, "unset device env should preserve default routing") + require.Zero(t, id, "unset device env should report the default id") + }) + + t.Run("valid", func(t *testing.T) { + t.Setenv(EnvDeviceID, "1") + + id, configured, err := ConfiguredDeviceID() + require.NoError(t, err) + require.True(t, configured, "set device env should enable explicit routing") + require.Equal(t, 1, id, "configured device id should match the env var") + }) + + t.Run("invalid", func(t *testing.T) { + t.Setenv(EnvDeviceID, "gpu1") + + _, configured, err := ConfiguredDeviceID() + require.Error(t, err, "non-integer device id should be rejected") + require.True(t, configured, "invalid env still counts as explicit configuration") + }) + + t.Run("negative", func(t *testing.T) { + t.Setenv(EnvDeviceID, "-1") + + _, configured, err := ConfiguredDeviceID() + require.Error(t, err, "negative device id should be rejected") + require.True(t, configured, "negative env still counts as explicit configuration") + }) +} diff --git a/prover/gpu/symbolic/adapter.go b/prover/gpu/symbolic/adapter.go new file mode 100644 index 00000000000..204af7c3a64 --- /dev/null +++ b/prover/gpu/symbolic/adapter.go @@ -0,0 +1,71 @@ +// Adapter: converts symbolic.ExpressionBoard → []NodeOp for GPU compilation. +// +// The ExpressionBoard uses Go interface types (Variable, Constant, LinComb, +// Product, PolyEval) while the GPU compiler needs flat []NodeOp. This thin +// glue converts between the two representations. +package symbolic + +import ( + "github.com/consensys/linea-monorepo/prover/symbolic" +) + +// BoardToNodeOps converts an ExpressionBoard's topologically-sorted nodes +// into the GPU-portable NodeOp representation. +// +// board.Nodes[i].Operator → NodeOp.Kind +// Variable → OpInput (leaf, references next input variable) +// Constant → OpConst (leaf, E4 value in Montgomery form) +// LinComb{Coeffs} → OpLinComb{Children, Coeffs} +// Product{Exponents} → OpProduct{Children, Coeffs=exponents} +// PolyEval{} → OpPolyEval{Children} +func BoardToNodeOps(board *symbolic.ExpressionBoard) []NodeOp { + nodes := board.Nodes + ops := make([]NodeOp, len(nodes)) + + for i, node := range nodes { + children := make([]int, len(node.Children)) + for j, c := range node.Children { + children[j] = int(c) + } + + switch op := node.Operator.(type) { + case symbolic.Variable: + ops[i] = NodeOp{Kind: OpInput} + + case symbolic.Constant: + // Extract E4 value (always available via GetExt, in Montgomery form) + val := op.Val.GetExt() + ops[i] = NodeOp{ + Kind: OpConst, + ConstVal: [4]uint32{ + uint32(val.B0.A0[0]), uint32(val.B0.A1[0]), + uint32(val.B1.A0[0]), uint32(val.B1.A1[0]), + }, + } + + case symbolic.LinComb: + ops[i] = NodeOp{ + Kind: OpLinComb, + Children: children, + Coeffs: append([]int(nil), op.Coeffs...), + } + + case symbolic.Product: + ops[i] = NodeOp{ + Kind: OpProduct, + Children: children, + Coeffs: append([]int(nil), op.Exponents...), + } + + case symbolic.PolyEval: + ops[i] = NodeOp{ + Kind: OpPolyEval, + Children: children, + } + + default: + panic("gpu/symbolic: BoardToNodeOps: unknown operator type") + } + } + return ops +} diff --git a/prover/gpu/symbolic/compile.go b/prover/gpu/symbolic/compile.go new file mode 100644 index 00000000000..291429c8c17 --- /dev/null +++ b/prover/gpu/symbolic/compile.go @@ -0,0 +1,167 @@ +// GPU symbolic expression compiler. +// +// Compiles a DAG of arithmetic operations over E4 (KoalaBear degree-4 extension) +// into bytecode for parallel GPU evaluation. One GPU thread per vector element, +// zero warp divergence — every thread executes the identical instruction stream. +// +// ┌─────────────────────────┐ ┌──────────────────────────┐ +// │ NodeOp[] (topo-sorted) │ │ kern_symbolic_eval │ +// │ │ │ │ +// │ liveness analysis │ H2D │ thread i: │ +// │ register allocation │ ──────▶ │ E4 slots[S] │ +// │ bytecode emission │ │ for pc in program: │ +// │ │ │ execute(i) │ +// │ → GPUProgram │ │ out[i] = slots[R] │ +// └─────────────────────────┘ └──────────────────────────┘ +// +// The NodeOp representation is decoupled from linea-monorepo's symbolic package. +// A thin adapter in the monorepo converts ExpressionBoard.Nodes[] → []NodeOp. +// +// Bytecode format (uint32 words): +// +// OP_CONST (0): [0, dst, const_idx] 3 words +// OP_INPUT (1): [1, dst, input_id] 3 words +// OP_MUL (2): [2, dst, n, s₀, e₀, ..., sₙ, eₙ] 3 + 2n words +// OP_LINCOMB (3): [3, dst, n, s₀, c₀, ..., sₙ, cₙ] 3 + 2n words +// OP_POLYEVAL(4): [4, dst, n, s₀, s₁, ..., sₙ] 3 + n words +package symbolic + +// Opcodes — match CUDA kernel's switch cases exactly. +const ( + OpConst = 0 + OpInput = 1 + OpProduct = 2 + OpLinComb = 3 + OpPolyEval = 4 +) + +// NodeOp describes one node in a topologically-sorted expression DAG. +// +// Kind=OpConst: leaf, ConstVal holds the E4 value (Montgomery form) +// Kind=OpInput: leaf, references the next input variable +// Kind=OpLinComb: Σ Coeffs[k] · Children[k], Coeffs = small integers +// Kind=OpProduct: Π Children[k]^Coeffs[k], Coeffs = exponents ≥ 0 +// Kind=OpPolyEval: Horner(Children[0]=x, Children[1..]=coefficients) +type NodeOp struct { + Kind int + Children []int // indices into nodes array (child < self) + Coeffs []int // LinComb: coefficients, Product: exponents + ConstVal [4]uint32 // E4 constant, [b0.a0, b0.a1, b1.a0, b1.a1] +} + +// GPUProgram holds compiled bytecode ready for GPU evaluation. +type GPUProgram struct { + Bytecode []uint32 // packed GPU instructions + Constants []uint32 // E4 constants (4 uint32 each) + NumSlots int + ResultSlot int + NumInputs int +} + +// CompileGPU compiles a topologically-sorted DAG into GPU bytecode. +// +// Algorithm (identical to CPU compiler in symbolic/compiler.go): +// 1. Liveness analysis — determine last use of each node +// 2. Register allocation — assign slots, reuse dead slots +// 3. Instruction emission — emit uint32 bytecode per node +func CompileGPU(nodes []NodeOp) *GPUProgram { + n := len(nodes) + if n == 0 { + return &GPUProgram{} + } + + // ── 1. Liveness: lastUse[i] = latest node index that reads node i ──── + lastUse := make([]int, n) + for i := range lastUse { + lastUse[i] = -1 + } + lastUse[n-1] = n // output node is implicitly live + for i, node := range nodes { + for _, c := range node.Children { + if i > lastUse[c] { + lastUse[c] = i + } + } + } + + // ── 2. Register allocation + 3. Instruction emission ───────────────── + pgm := &GPUProgram{ + Bytecode: make([]uint32, 0, n*4), + Constants: make([]uint32, 0), + } + + slots := make([]int, n) + freeSlots := make([]int, 0, 32) + nextSlot := 0 + + alloc := func() int { + if k := len(freeSlots); k > 0 { + s := freeSlots[k-1] + freeSlots = freeSlots[:k-1] + return s + } + s := nextSlot + nextSlot++ + return s + } + + inputCursor := 0 + + for i, node := range nodes { + dst := alloc() + slots[i] = dst + + switch node.Kind { + case OpConst: + ci := len(pgm.Constants) / 4 + pgm.Constants = append(pgm.Constants, + node.ConstVal[0], node.ConstVal[1], + node.ConstVal[2], node.ConstVal[3]) + pgm.Bytecode = append(pgm.Bytecode, OpConst, uint32(dst), uint32(ci)) + + case OpInput: + pgm.Bytecode = append(pgm.Bytecode, OpInput, uint32(dst), uint32(inputCursor)) + inputCursor++ + + case OpLinComb: + nc := len(node.Children) + pgm.Bytecode = append(pgm.Bytecode, OpLinComb, uint32(dst), uint32(nc)) + for k, c := range node.Children { + pgm.Bytecode = append(pgm.Bytecode, + uint32(slots[c]), uint32(int32(node.Coeffs[k]))) + } + + case OpProduct: + nc := len(node.Children) + pgm.Bytecode = append(pgm.Bytecode, OpProduct, uint32(dst), uint32(nc)) + for k, c := range node.Children { + pgm.Bytecode = append(pgm.Bytecode, + uint32(slots[c]), uint32(node.Coeffs[k])) + } + + case OpPolyEval: + nc := len(node.Children) + pgm.Bytecode = append(pgm.Bytecode, OpPolyEval, uint32(dst), uint32(nc)) + for _, c := range node.Children { + pgm.Bytecode = append(pgm.Bytecode, uint32(slots[c])) + } + + default: + panic("vortex: CompileGPU: unknown node kind") + } + + // Free dead children + freed := make(map[int]bool) + for _, c := range node.Children { + if lastUse[c] == i && !freed[slots[c]] { + freeSlots = append(freeSlots, slots[c]) + freed[slots[c]] = true + } + } + } + + pgm.NumSlots = nextSlot + pgm.ResultSlot = slots[n-1] + pgm.NumInputs = inputCursor + return pgm +} diff --git a/prover/gpu/symbolic/gpu.go b/prover/gpu/symbolic/gpu.go new file mode 100644 index 00000000000..718d3a8d609 --- /dev/null +++ b/prover/gpu/symbolic/gpu.go @@ -0,0 +1,206 @@ +// GPU evaluation of compiled symbolic programs via CUDA. +// +// Build constraint: requires CGO + CUDA (same as gpu.go). + +//go:build cuda + +package symbolic + +/* +#cgo LDFLAGS: -L${SRCDIR}/../cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../cuda/include + +#include "gnark_gpu_kb.h" +*/ +import "C" +import ( + "fmt" + "unsafe" + + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/vortex" + + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" +) + +// devCtx casts the common gpu.Device handle back to the C type for CGO calls. +func devCtx(d *gpu.Device) C.gnark_gpu_context_t { + return C.gnark_gpu_context_t(d.Handle()) +} + +func kbError(code C.kb_error_t) error { + switch code { + case C.KB_SUCCESS: + return nil + case C.KB_ERROR_CUDA: + return fmt.Errorf("symbolic: CUDA error") + case C.KB_ERROR_INVALID: + return fmt.Errorf("symbolic: invalid argument") + case C.KB_ERROR_OOM: + return fmt.Errorf("symbolic: out of GPU memory") + case C.KB_ERROR_SIZE: + return fmt.Errorf("symbolic: size mismatch") + default: + return fmt.Errorf("symbolic: unknown error %d", int(code)) + } +} + +func must(code C.kb_error_t) { + if err := kbError(code); err != nil { + panic(err) + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPUSymProgram — compiled program handle (device-resident bytecode + consts) +// ───────────────────────────────────────────────────────────────────────────── + +type GPUSymProgram struct { + dev *gpu.Device + handle C.kb_sym_program_t +} + +// CompileSymGPU uploads a compiled GPUProgram to the GPU. +func CompileSymGPU(dev *gpu.Device, pgm *GPUProgram) (*GPUSymProgram, error) { + var h C.kb_sym_program_t + + var bcPtr, cPtr *C.uint32_t + if len(pgm.Bytecode) > 0 { + bcPtr = (*C.uint32_t)(unsafe.Pointer(&pgm.Bytecode[0])) + } + if len(pgm.Constants) > 0 { + cPtr = (*C.uint32_t)(unsafe.Pointer(&pgm.Constants[0])) + } + + if err := kbError(C.kb_sym_compile( + devCtx(dev), + bcPtr, C.uint32_t(len(pgm.Bytecode)), + cPtr, C.uint32_t(len(pgm.Constants)/4), + C.uint32_t(pgm.NumSlots), + C.uint32_t(pgm.ResultSlot), + &h, + )); err != nil { + return nil, err + } + + return &GPUSymProgram{dev: dev, handle: h}, nil +} + +func (p *GPUSymProgram) Free() { + if p.handle != nil { + C.kb_sym_free(p.handle) + p.handle = nil + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// SymInput — describes how the GPU reads one variable +// ───────────────────────────────────────────────────────────────────────────── + +const ( + SymInputKB = 0 // base field vector, embed into E4 as (val, 0, 0, 0) + SymInputConstE4 = 1 // broadcast E4 constant to all threads + SymInputRotKB = 2 // rotated base field vector: d_ptr[(i+offset)%n] + SymInputE4Vec = 3 // E4 AoS vector: d_ptr[i*4..i*4+3] + SymInputE4VecSOA = 4 // E4 SoA vector: d_ptr[c*n+i], c in [0..3] + SymInputRotE4SOA = 5 // rotated E4 SoA: d_ptr[c*n+((i+off)%n)] + SymInputRotE4AOS = 6 // rotated E4 AoS: d_ptr[((i+off)%n)*4..+3] +) + +type SymInput struct { + Tag int // SymInputKB, SymInputConstE4, SymInputRotKB, SymInputE4Vec + DPtr unsafe.Pointer // device pointer (nil for ConstE4) + Offset int // rotation offset + Val [4]uint32 // E4 constant value (Montgomery form) +} + +// SymInputFromVec creates a base-field input from a device-resident KBVector. +func SymInputFromVec(v *vortex.KBVector) SymInput { + return SymInput{ + Tag: SymInputKB, + DPtr: v.DevicePtr(), + } +} + +// SymInputFromRotatedVec creates a rotated base-field input. +func SymInputFromRotatedVec(v *vortex.KBVector, offset int) SymInput { + return SymInput{ + Tag: SymInputRotKB, + DPtr: v.DevicePtr(), + Offset: offset, + } +} + +// SymInputFromE4Vec creates an E4 vector input from a device buffer of 4n uint32. +// The buffer layout is [b0.a0, b0.a1, b1.a0, b1.a1] × n elements. +func SymInputFromE4Vec(v *vortex.KBVector) SymInput { + return SymInput{ + Tag: SymInputE4Vec, + DPtr: v.DevicePtr(), + } +} + +// SymInputFromE4SOA creates an E4 vector input in SoA layout. +// Layout per root is 4 contiguous vectors of size n: +// [b0.a0(0..n), b0.a1(0..n), b1.a0(0..n), b1.a1(0..n)]. +func SymInputFromE4SOA(ptr unsafe.Pointer) SymInput { + return SymInput{ + Tag: SymInputE4VecSOA, + DPtr: ptr, + } +} + +// SymInputFromRotE4SOA creates a rotated E4 vector input in SoA layout. +func SymInputFromRotE4SOA(ptr unsafe.Pointer, offset int) SymInput { + return SymInput{ + Tag: SymInputRotE4SOA, + DPtr: ptr, + Offset: offset, + } +} + +// SymInputFromConst creates a constant E4 input (broadcast). +func SymInputFromConst(val fext.E4) SymInput { + return SymInput{ + Tag: SymInputConstE4, + Val: [4]uint32{ + uint32(val.B0.A0[0]), uint32(val.B0.A1[0]), + uint32(val.B1.A0[0]), uint32(val.B1.A1[0]), + }, + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// EvalSymGPU — evaluate compiled program over n elements → host E4 slice +// ───────────────────────────────────────────────────────────────────────────── + +func EvalSymGPU(dev *gpu.Device, pgm *GPUSymProgram, inputs []SymInput, n int) []fext.E4 { + // Build C input descriptors + descs := make([]C.SymInputDesc, len(inputs)) + for i, inp := range inputs { + descs[i].tag = C.uint32_t(inp.Tag) + descs[i].offset = C.uint32_t(inp.Offset) + descs[i].d_ptr = (*C.uint32_t)(inp.DPtr) + descs[i].val[0] = C.uint32_t(inp.Val[0]) + descs[i].val[1] = C.uint32_t(inp.Val[1]) + descs[i].val[2] = C.uint32_t(inp.Val[2]) + descs[i].val[3] = C.uint32_t(inp.Val[3]) + } + + result := make([]fext.E4, n) + + var descPtr *C.SymInputDesc + if len(descs) > 0 { + descPtr = &descs[0] + } + + must(C.kb_sym_eval( + devCtx(dev), + pgm.handle, + descPtr, C.uint32_t(len(inputs)), + C.uint32_t(n), + (*C.uint32_t)(unsafe.Pointer(&result[0])), + )) + + return result +} diff --git a/prover/gpu/symbolic/stub.go b/prover/gpu/symbolic/stub.go new file mode 100644 index 00000000000..fd6ec655976 --- /dev/null +++ b/prover/gpu/symbolic/stub.go @@ -0,0 +1,44 @@ +//go:build !cuda + +// Stub types for non-CUDA builds. Guard calls with gpu.Enabled. +package symbolic + +import ( + "unsafe" + + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/vortex" +) + +type GPUSymProgram struct{} + +func CompileSymGPU(_ *gpu.Device, _ *GPUProgram) (*GPUSymProgram, error) { + panic("gpu: cuda required") +} +func (p *GPUSymProgram) Free() {} + +// SymInput input descriptor tags. +const ( + SymInputKB = 0 + SymInputConstE4 = 1 + SymInputRotKB = 2 + SymInputE4Vec = 3 +) + +type SymInput struct { + Tag int + DPtr unsafe.Pointer + Offset int + Val [4]uint32 +} + +func SymInputFromVec(_ *vortex.KBVector) SymInput { panic("gpu: cuda required") } +func SymInputFromRotatedVec(_ *vortex.KBVector, _ int) SymInput { panic("gpu: cuda required") } +func SymInputFromE4Vec(_ *vortex.KBVector) SymInput { panic("gpu: cuda required") } +func SymInputFromConst(_ fext.E4) SymInput { panic("gpu: cuda required") } + +func EvalSymGPU(_ *gpu.Device, _ *GPUSymProgram, _ []SymInput, _ int) []fext.E4 { + panic("gpu: cuda required") +} diff --git a/prover/gpu/symbolic/symbolic_test.go b/prover/gpu/symbolic/symbolic_test.go new file mode 100644 index 00000000000..739b02514c6 --- /dev/null +++ b/prover/gpu/symbolic/symbolic_test.go @@ -0,0 +1,347 @@ +//go:build cuda + +package symbolic_test + +import ( + "testing" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/stretchr/testify/require" + + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/gpu/symbolic" + "github.com/consensys/linea-monorepo/prover/gpu/vortex" +) + +// ─── Compiler unit tests (pure Go) ────────────────────────────────────────── + +func TestCompileGPU_Simple(t *testing.T) { + // Expression: a + 2*b (LinComb with 2 children) + // + // DAG: + // node 0: Input(a) + // node 1: Input(b) + // node 2: LinComb([0, 1], [1, 2]) + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpLinComb, Children: []int{0, 1}, Coeffs: []int{1, 2}}, + } + pgm := symbolic.CompileGPU(nodes) + + require.Equal(t, 2, pgm.NumInputs) + require.Equal(t, 0, len(pgm.Constants)) + require.True(t, pgm.NumSlots >= 2 && pgm.NumSlots <= 3) + require.True(t, len(pgm.Bytecode) > 0) +} + +func TestCompileGPU_Constant(t *testing.T) { + // Expression: 42 (constant) + var c koalabear.Element + c.SetUint64(42) + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(c[0]), 0, 0, 0}}, + } + pgm := symbolic.CompileGPU(nodes) + + require.Equal(t, 0, pgm.NumInputs) + require.Equal(t, 4, len(pgm.Constants)) + require.Equal(t, 1, pgm.NumSlots) +} + +func TestCompileGPU_Product(t *testing.T) { + // Expression: a * b^2 + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpProduct, Children: []int{0, 1}, Coeffs: []int{1, 2}}, + } + pgm := symbolic.CompileGPU(nodes) + require.Equal(t, 2, pgm.NumInputs) +} + +func TestCompileGPU_PolyEval(t *testing.T) { + // P(x) = c₀ + c₁·x where x=const(2), c₀=input(a), c₁=input(b) + var two koalabear.Element + two.SetUint64(2) + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(two[0]), 0, 0, 0}}, // x=2 + {Kind: symbolic.OpInput}, // c₀ = a + {Kind: symbolic.OpInput}, // c₁ = b + {Kind: symbolic.OpPolyEval, Children: []int{0, 1, 2}}, + } + pgm := symbolic.CompileGPU(nodes) + require.Equal(t, 2, pgm.NumInputs) + require.Equal(t, 4, len(pgm.Constants)) // one E4 constant +} + +// ─── GPU evaluation tests ──────────────────────────────────────────────────── + +func TestGPUSymEval_LinComb(t *testing.T) { + // f(a, b) = a + 2·b, evaluate at a=3, b=5 → expect 13 + dev, err := gpu.New() + require.NoError(t, err) + defer dev.Close() + + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpLinComb, Children: []int{0, 1}, Coeffs: []int{1, 2}}, + } + pgm := symbolic.CompileGPU(nodes) + + gpuPgm, err := symbolic.CompileSymGPU(dev, pgm) + require.NoError(t, err) + defer gpuPgm.Free() + + n := 1024 + aVec, _ := vortex.NewKBVector(dev, n) + bVec, _ := vortex.NewKBVector(dev, n) + defer aVec.Free() + defer bVec.Free() + + // Fill a=3, b=5 (Montgomery form) + var three, five koalabear.Element + three.SetUint64(3) + five.SetUint64(5) + aHost := make([]koalabear.Element, n) + bHost := make([]koalabear.Element, n) + for i := range aHost { + aHost[i] = three + bHost[i] = five + } + aVec.CopyFromHost(aHost) + bVec.CopyFromHost(bHost) + + inputs := []symbolic.SymInput{ + symbolic.SymInputFromVec(aVec), + symbolic.SymInputFromVec(bVec), + } + + result := symbolic.EvalSymGPU(dev, gpuPgm, inputs, n) + + // Expected: 3 + 2*5 = 13 + var expected fext.E4 + var thirteen koalabear.Element + thirteen.SetUint64(13) + expected.B0.A0 = thirteen + for i := 0; i < n; i++ { + require.Equal(t, expected, result[i], "mismatch at i=%d", i) + } +} + +func TestGPUSymEval_Product(t *testing.T) { + // f(a, b) = a · b², evaluate at a=3, b=5 → expect 75 + dev, err := gpu.New() + require.NoError(t, err) + defer dev.Close() + + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpProduct, Children: []int{0, 1}, Coeffs: []int{1, 2}}, + } + pgm := symbolic.CompileGPU(nodes) + + gpuPgm, err := symbolic.CompileSymGPU(dev, pgm) + require.NoError(t, err) + defer gpuPgm.Free() + + n := 512 + aVec, _ := vortex.NewKBVector(dev, n) + bVec, _ := vortex.NewKBVector(dev, n) + defer aVec.Free() + defer bVec.Free() + + var three, five koalabear.Element + three.SetUint64(3) + five.SetUint64(5) + aHost := make([]koalabear.Element, n) + bHost := make([]koalabear.Element, n) + for i := range aHost { + aHost[i] = three + bHost[i] = five + } + aVec.CopyFromHost(aHost) + bVec.CopyFromHost(bHost) + + result := symbolic.EvalSymGPU(dev, gpuPgm, inputs(aVec, bVec), n) + + // 3 * 5² = 75 + var expected fext.E4 + var seventyfive koalabear.Element + seventyfive.SetUint64(75) + expected.B0.A0 = seventyfive + for i := 0; i < n; i++ { + require.Equal(t, expected, result[i], "mismatch at i=%d", i) + } +} + +func TestGPUSymEval_ConstantExpr(t *testing.T) { + // Expression: const(7) + const(3) = 10 + dev, err := gpu.New() + require.NoError(t, err) + defer dev.Close() + + var seven, three koalabear.Element + seven.SetUint64(7) + three.SetUint64(3) + + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(seven[0]), 0, 0, 0}}, + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(three[0]), 0, 0, 0}}, + {Kind: symbolic.OpLinComb, Children: []int{0, 1}, Coeffs: []int{1, 1}}, + } + pgm := symbolic.CompileGPU(nodes) + + gpuPgm, err := symbolic.CompileSymGPU(dev, pgm) + require.NoError(t, err) + defer gpuPgm.Free() + + n := 256 + result := symbolic.EvalSymGPU(dev, gpuPgm, nil, n) + + var expected fext.E4 + var ten koalabear.Element + ten.SetUint64(10) + expected.B0.A0 = ten + for i := 0; i < n; i++ { + require.Equal(t, expected, result[i]) + } +} + +func TestGPUSymEval_PolyEval(t *testing.T) { + // P(x) = c₀ + c₁·x, with x=2, c₀=3, c₁=5 → P(2) = 3 + 5*2 = 13 + dev, err := gpu.New() + require.NoError(t, err) + defer dev.Close() + + var two, three_, five koalabear.Element + two.SetUint64(2) + three_.SetUint64(3) + five.SetUint64(5) + + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(two[0]), 0, 0, 0}}, // x=2 + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(three_[0]), 0, 0, 0}}, // c₀=3 + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(five[0]), 0, 0, 0}}, // c₁=5 + {Kind: symbolic.OpPolyEval, Children: []int{0, 1, 2}}, // P(x) = c₀ + c₁·x + } + pgm := symbolic.CompileGPU(nodes) + + gpuPgm, err := symbolic.CompileSymGPU(dev, pgm) + require.NoError(t, err) + defer gpuPgm.Free() + + n := 256 + result := symbolic.EvalSymGPU(dev, gpuPgm, nil, n) + + var expected fext.E4 + var thirteen koalabear.Element + thirteen.SetUint64(13) + expected.B0.A0 = thirteen + for i := 0; i < n; i++ { + require.Equal(t, expected, result[i], "mismatch at i=%d", i) + } +} + +func TestGPUSymEval_RotatedInput(t *testing.T) { + // f(a) = a + rot(a, 1), where rot shifts by +1 cyclically + dev, err := gpu.New() + require.NoError(t, err) + defer dev.Close() + + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpInput}, // will bind to rotated version + {Kind: symbolic.OpLinComb, Children: []int{0, 1}, Coeffs: []int{1, 1}}, + } + pgm := symbolic.CompileGPU(nodes) + + gpuPgm, err := symbolic.CompileSymGPU(dev, pgm) + require.NoError(t, err) + defer gpuPgm.Free() + + n := 8 + aVec, _ := vortex.NewKBVector(dev, n) + defer aVec.Free() + + // a = [0, 1, 2, 3, 4, 5, 6, 7] (in Montgomery) + aHost := make([]koalabear.Element, n) + for i := 0; i < n; i++ { + aHost[i].SetUint64(uint64(i)) + } + aVec.CopyFromHost(aHost) + + // inputs[0] = a (regular), inputs[1] = rot(a, 1) + syminputs := []symbolic.SymInput{ + symbolic.SymInputFromVec(aVec), + symbolic.SymInputFromRotatedVec(aVec, 1), + } + + result := symbolic.EvalSymGPU(dev, gpuPgm, syminputs, n) + + // expected[i] = a[i] + a[(i+1)%8] + for i := 0; i < n; i++ { + var expected koalabear.Element + expected.SetUint64(uint64(i + (i+1)%n)) + require.Equal(t, expected, result[i].B0.A0, "mismatch at i=%d", i) + } +} + +func TestGPUSymEval_NegCoeff(t *testing.T) { + // f(a, b) = a - b (LinComb with coeffs [1, -1]) + dev, err := gpu.New() + require.NoError(t, err) + defer dev.Close() + + nodes := []symbolic.NodeOp{ + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpInput}, + {Kind: symbolic.OpLinComb, Children: []int{0, 1}, Coeffs: []int{1, -1}}, + } + pgm := symbolic.CompileGPU(nodes) + + gpuPgm, err := symbolic.CompileSymGPU(dev, pgm) + require.NoError(t, err) + defer gpuPgm.Free() + + n := 256 + aVec, _ := vortex.NewKBVector(dev, n) + bVec, _ := vortex.NewKBVector(dev, n) + defer aVec.Free() + defer bVec.Free() + + var seven, three koalabear.Element + seven.SetUint64(7) + three.SetUint64(3) + aHost := make([]koalabear.Element, n) + bHost := make([]koalabear.Element, n) + for i := range aHost { + aHost[i] = seven + bHost[i] = three + } + aVec.CopyFromHost(aHost) + bVec.CopyFromHost(bHost) + + result := symbolic.EvalSymGPU(dev, gpuPgm, inputs(aVec, bVec), n) + + // 7 - 3 = 4 + var expected fext.E4 + var four koalabear.Element + four.SetUint64(4) + expected.B0.A0 = four + for i := 0; i < n; i++ { + require.Equal(t, expected, result[i]) + } +} + +// inputs is a helper to construct a SymInput slice from KBVectors. +func inputs(vecs ...*vortex.KBVector) []symbolic.SymInput { + out := make([]symbolic.SymInput, len(vecs)) + for i, v := range vecs { + out[i] = symbolic.SymInputFromVec(v) + } + return out +} diff --git a/prover/gpu/threadlocal_linux.go b/prover/gpu/threadlocal_linux.go new file mode 100644 index 00000000000..a78cfefdef9 --- /dev/null +++ b/prover/gpu/threadlocal_linux.go @@ -0,0 +1,68 @@ +//go:build linux + +package gpu + +import ( + "sync" + + "golang.org/x/sys/unix" +) + +// SetCurrentDevice associates the current OS thread with a GPU device. +// The caller is expected to have called runtime.LockOSThread() so the +// goroutine doesn't migrate. Pass nil to clear. +// +// Used to pin each segment-prover goroutine to a specific GPU on multi-GPU +// hosts. GPU dispatch sites should call CurrentDevice() to honour the +// per-thread choice; falls back to GetDevice() when unset. +func SetCurrentDevice(d *Device) { + tid := unix.Gettid() + if d == nil { + threadDeviceMu.Lock() + delete(threadDevice, tid) + threadDeviceMu.Unlock() + return + } + threadDeviceMu.Lock() + threadDevice[tid] = d + threadDeviceMu.Unlock() +} + +// CurrentDevice returns the device pinned to the current OS thread via +// SetCurrentDevice. Falls back to GetDevice() when unset. +func CurrentDevice() *Device { + tid := unix.Gettid() + threadDeviceMu.RLock() + d := threadDevice[tid] + threadDeviceMu.RUnlock() + if d != nil { + return d + } + return GetDevice() +} + +// CurrentDeviceID returns the index passed to GetDeviceN for the current +// OS thread's device, or 0 when unset / multi-device disabled. Used by the +// GPU phase tracer. +func CurrentDeviceID() int { + tid := unix.Gettid() + threadDeviceMu.RLock() + id := threadDeviceID[tid] + threadDeviceMu.RUnlock() + return id +} + +// SetCurrentDeviceID is the lower-level setter used together with +// SetCurrentDevice when tracing wants to know the index, not the handle. +func SetCurrentDeviceID(id int) { + tid := unix.Gettid() + threadDeviceMu.Lock() + threadDeviceID[tid] = id + threadDeviceMu.Unlock() +} + +var ( + threadDeviceMu sync.RWMutex + threadDevice = map[int]*Device{} + threadDeviceID = map[int]int{} +) diff --git a/prover/gpu/threadlocal_other.go b/prover/gpu/threadlocal_other.go new file mode 100644 index 00000000000..35642f8e645 --- /dev/null +++ b/prover/gpu/threadlocal_other.go @@ -0,0 +1,20 @@ +//go:build !linux + +package gpu + +// SetCurrentDevice is a no-op on platforms without a stable OS-thread ID API. +// Linux CUDA builds use a per-thread implementation for multi-GPU workers. +func SetCurrentDevice(d *Device) {} + +// CurrentDevice returns the default device on non-Linux platforms. +func CurrentDevice() *Device { + return GetDevice() +} + +// CurrentDeviceID reports the default device ID on non-Linux platforms. +func CurrentDeviceID() int { + return 0 +} + +// SetCurrentDeviceID is a no-op on non-Linux platforms. +func SetCurrentDeviceID(id int) {} diff --git a/prover/gpu/trace.go b/prover/gpu/trace.go new file mode 100644 index 00000000000..d309bbe4a4f --- /dev/null +++ b/prover/gpu/trace.go @@ -0,0 +1,118 @@ +package gpu + +import ( + "encoding/json" + "fmt" + "os" + "sync" + "sync/atomic" + "time" + + "github.com/sirupsen/logrus" +) + +// Trace emits per-phase GPU timing events when LIMITLESS_GPU_PROFILE=1. +// Cheap to call when disabled: a single atomic load and return. The events +// are written as JSONL to $LIMITLESS_GPU_PROFILE_PATH, or to a timestamped +// file under $LIMITLESS_GPU_PROFILE_DIR (default /scratch/runs). +// +// Usage: +// +// defer gpu.TraceTime("vortex_commit", deviceID, time.Now()) +// +// or: +// +// gpu.TraceEvent("quotient", deviceID, dur, map[string]any{"domain": n}) + +var ( + traceEnabled atomic.Bool + traceOnce sync.Once + traceMu sync.Mutex + traceEnc *json.Encoder + traceFile *os.File +) + +func initTrace() { + traceOnce.Do(func() { + if os.Getenv("LIMITLESS_GPU_PROFILE") != "1" { + return + } + path := os.Getenv("LIMITLESS_GPU_PROFILE_PATH") + if path == "" { + dir := os.Getenv("LIMITLESS_GPU_PROFILE_DIR") + if dir == "" { + dir = "/scratch/runs" + } + if err := os.MkdirAll(dir, 0o755); err != nil { //nolint:gosec // operator-supplied profiling path + logrus.Warnf("gpu/trace: mkdir %s: %v (tracing disabled)", dir, err) + return + } + path = fmt.Sprintf("%s/gpu_profile_%s.jsonl", dir, time.Now().UTC().Format("20060102_150405")) + } + f, err := os.Create(path) //nolint:gosec // operator-supplied profiling path + if err != nil { + logrus.Warnf("gpu/trace: create %s: %v (tracing disabled)", path, err) + return + } + traceFile = f + traceEnc = json.NewEncoder(f) + traceEnabled.Store(true) + logrus.Infof("gpu/trace: writing events to %s", path) + }) +} + +// TraceEnabled reports whether GPU phase tracing is on. +func TraceEnabled() bool { + initTrace() + return traceEnabled.Load() +} + +// TraceTime records a single phase event whose duration is time.Since(start). +// Intended use: `defer gpu.TraceTime("phase", id, time.Now())`. +func TraceTime(phase string, deviceID int, start time.Time) { + if !TraceEnabled() { + return + } + TraceEvent(phase, deviceID, time.Since(start), nil) +} + +// TraceEvent records a phase event with an explicit duration and optional +// extra fields. Extra is merged into the JSONL record at top level. +func TraceEvent(phase string, deviceID int, dur time.Duration, extra map[string]any) { + if !TraceEnabled() { + return + } + rec := map[string]any{ + "ts": time.Now().UTC().Format(time.RFC3339Nano), + "event": "gpu_phase", + "phase": phase, + "device": deviceID, + "ms": float64(dur.Microseconds()) / 1000.0, + } + for k, v := range extra { + if _, exists := rec[k]; !exists { + rec[k] = v + } + } + traceMu.Lock() + defer traceMu.Unlock() + if traceEnc == nil { + return + } + if err := traceEnc.Encode(rec); err != nil { + logrus.Warnf("gpu/trace: write: %v", err) + } +} + +// TraceClose flushes and closes the trace file. Safe to call multiple times. +func TraceClose() { + traceMu.Lock() + defer traceMu.Unlock() + if traceFile != nil { + _ = traceFile.Sync() + _ = traceFile.Close() + traceFile = nil + traceEnc = nil + traceEnabled.Store(false) + } +} diff --git a/prover/gpu/vortex/commit_cpu.go b/prover/gpu/vortex/commit_cpu.go new file mode 100644 index 00000000000..0ab2b7d9e9e --- /dev/null +++ b/prover/gpu/vortex/commit_cpu.go @@ -0,0 +1,114 @@ +// CPU fallback for Commit/Prove/LinComb/ExtractColumns when CUDA is not available. + +//go:build !cuda + +package vortex + +import ( + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/consensys/gnark-crypto/field/koalabear/vortex" + "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" +) + +// CommitState holds prover state after commitment. +// On the CPU build, it wraps either a gnark-crypto ProverState (for Commit) +// or an encodedMatrix (for CommitSIS). +type CommitState struct { + inner *vortex.ProverState + encodedMatrix []smartvectors.SmartVector + nRows int +} + +// NRows returns the number of rows in this commit. +func (cs *CommitState) NRows() int { return cs.nRows } + +// FreeGPU is a no-op on CPU builds. +func (cs *CommitState) FreeGPU() {} + +// IsDeviceResident reports whether the encoded matrix is resident on device. +// CPU builds never keep Vortex state on device. +func (cs *CommitState) IsDeviceResident() bool { return false } + +// Commit encodes the input matrix using Reed-Solomon, hashes columns +// via SIS + Poseidon2, builds a Merkle tree, and returns the commitment root. +func (p *Params) Commit(rows [][]koalabear.Element) (*CommitState, Hash, error) { + ps, err := vortex.Commit(p.inner, rows) + if err != nil { + return nil, Hash{}, err + } + return &CommitState{inner: ps, nRows: len(rows)}, ps.GetCommitment(), nil +} + +// Prove generates an opening proof for the committed matrix. +func (cs *CommitState) Prove(alpha fext.E4, selectedCols []int) (*Proof, error) { + cs.inner.OpenLinComb(alpha) + vp, err := cs.inner.OpenColumns(selectedCols) + if err != nil { + return nil, err + } + return &Proof{ + UAlpha: vp.UAlpha, + Columns: vp.OpenedColumns, + MerkleProofs: vp.MerkleProofOpenedColumns, + }, nil +} + +// LinComb computes UAlpha[j] = Σᵢ αⁱ · rows[i].Get(j) on CPU. +func (cs *CommitState) LinComb(alpha fext.E4) ([]fext.E4, error) { + if cs.encodedMatrix == nil { + panic("vortex: CommitState has no encodedMatrix for CPU LinComb") + } + n := cs.encodedMatrix[0].Len() + result := make([]fext.E4, n) + var pow fext.E4 + pow.SetOne() + for _, row := range cs.encodedMatrix { + for j := range n { + v := row.Get(j) + var term fext.E4 + term.B0.A0 = v + term.Mul(&term, &pow) + result[j].Add(&result[j], &term) + } + pow.Mul(&pow, &alpha) + } + return result, nil +} + +// ExtractColumns gathers selected columns from host-side SmartVectors. +func (cs *CommitState) ExtractColumns(selectedCols []int) ([][]koalabear.Element, error) { + if cs.encodedMatrix == nil { + panic("vortex: CommitState has no encodedMatrix for CPU ExtractColumns") + } + columns := make([][]koalabear.Element, len(selectedCols)) + for i, c := range selectedCols { + col := make([]koalabear.Element, len(cs.encodedMatrix)) + for r, row := range cs.encodedMatrix { + col[r] = row.Get(c) + } + columns[i] = col + } + return columns, nil +} + +// GetEncodedMatrix returns the host-side encoded matrix as SmartVectors. +func (cs *CommitState) GetEncodedMatrix() []smartvectors.SmartVector { + return cs.encodedMatrix +} + +func (cs *CommitState) ExtractAllRows() ([][]koalabear.Element, error) { + panic("gpu: cuda required") +} + +func (cs *CommitState) MerkleTree() any { + panic("gpu: cuda required") +} + +func (cs *CommitState) ExtractSISHashes() ([]koalabear.Element, error) { + panic("gpu: cuda required") +} + +func (cs *CommitState) ExtractLeaves() ([]Hash, error) { + panic("gpu: cuda required") +} diff --git a/prover/gpu/vortex/commit_merkle.go b/prover/gpu/vortex/commit_merkle.go new file mode 100644 index 00000000000..5104ddf7c93 --- /dev/null +++ b/prover/gpu/vortex/commit_merkle.go @@ -0,0 +1,470 @@ +// GPU-accelerated Vortex commit for the protocol compiler. +// +// Two modes: +// +// CommitMerkleWithSIS — legacy drop-in, D2H of full encoded matrix. +// CommitSIS — device-resident: encoded matrix stays on GPU. +// +// CommitSIS returns a *CommitState handle. The protocol compiler stores it +// in prover state and later calls: +// +// cs.LinComb(α) → UAlpha[j] = Σᵢ αⁱ · row[i][j] (GPU kernel) +// cs.ExtractColumns(cols) → selected columns only (small D2H) +// +// This eliminates the ~8 GiB D2H transfer of the full encoded matrix. + +//go:build cuda + +package vortex + +import ( + "fmt" + "os" + "sync" + "time" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + refvortex "github.com/consensys/gnark-crypto/field/koalabear/vortex" + "github.com/consensys/linea-monorepo/prover/crypto/state-management/smt_koalabear" + "github.com/consensys/linea-monorepo/prover/crypto/vortex/vortex_koalabear" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/maths/field" + "github.com/sirupsen/logrus" +) + +// gpuVortexCache caches GPUVortex instances keyed by (nCols, maxNRows, rate). +// Avoids re-allocating ~12 GB of GPU memory per commit call. +var ( + gpuVortexMu sync.Mutex + gpuVortexCache = map[gpuVortexKey]*GPUVortex{} + + gpuBenchTimings = os.Getenv("LINEA_GPU_BENCH_TIMINGS") != "" +) + +type gpuVortexKey struct { + deviceID int + nCols int + nRows int + rate int +} + +// EvictPipelineCache frees all cached GPUVortex pipelines, reclaiming GPU memory. +// Call after a recursion level's OpenSelectedColumns to free pipelines that won't +// be reused (each level uses different parameters). +func EvictPipelineCache() { + gpuVortexMu.Lock() + victims := make([]*GPUVortex, 0, len(gpuVortexCache)) + for key, gv := range gpuVortexCache { + delete(gpuVortexCache, key) + victims = append(victims, gv) + } + gpuVortexMu.Unlock() + + for _, gv := range victims { + gv.Free() + } +} + +// EvictPipelineCacheForDevice frees only the cached pipelines bound to the +// given GPU. Use this when one segment goroutine needs to release its +// per-device buffers without disturbing pipelines that other goroutines +// (on other GPUs) are still using. +// +// Production segments do many Vortex rounds with different (nCols, nRows, +// rate) shapes; each cached pipeline holds multi-GiB device buffers +// (d_work, d_encoded_col, d_sis, d_tree, d_leaves). Without per-device +// eviction the cache grows monotonically across rounds within a segment +// and quickly fills the 96 GiB device, causing alloc failures and the +// CUDA runtime to spend minutes retrying. +func EvictPipelineCacheForDevice(deviceID int) { + gpuVortexMu.Lock() + var victims []*GPUVortex + for key, gv := range gpuVortexCache { + if key.deviceID != deviceID { + continue + } + delete(gpuVortexCache, key) + victims = append(victims, gv) + } + gpuVortexMu.Unlock() + + for _, gv := range victims { + gv.Free() + } +} + +func getOrCreateGPUVortex(dev *gpu.Device, deviceID int, params *Params, maxNRows int) (*GPUVortex, error) { + key := gpuVortexKey{ + deviceID: deviceID, + nCols: params.inner.NbColumns, + nRows: maxNRows, + rate: params.inner.ReedSolomonInvRate, + } + gpuVortexMu.Lock() + defer gpuVortexMu.Unlock() + + if gv, ok := gpuVortexCache[key]; ok { + return gv, nil + } + gv, err := NewGPUVortex(dev, params, maxNRows) + if err != nil { + return nil, err + } + gpuVortexCache[key] = gv + return gv, nil +} + +func materializeRows(polysMatrix []smartvectors.SmartVector) [][]koalabear.Element { + rows := make([][]koalabear.Element, len(polysMatrix)) + for i := range polysMatrix { + if reg, ok := polysMatrix[i].(*smartvectors.Regular); ok { + rows[i] = []koalabear.Element(*reg) + continue + } + rows[i] = smartvectors.IntoRegVec(polysMatrix[i]) + } + return rows +} + +func initGPUForRows(params *vortex_koalabear.Params, maxRows int) (*gpu.Device, *Params, *GPUVortex) { + // Honour per-thread device pinning when set (multi-GPU); fall back to the + // process-wide default device otherwise. + dev := gpu.CurrentDevice() + if dev == nil { + return nil, nil, nil + } + deviceID := gpu.CurrentDeviceID() + gpuParams, err := NewParams(params.NbColumns, maxRows, params.Key.SisGnarkCrypto, params.RsParams.Rate, 256) + if err != nil { + logrus.WithError(err).Warn("GPU params init failed") + return nil, nil, nil + } + gv, err := getOrCreateGPUVortex(dev, deviceID, gpuParams, maxRows) + if err != nil { + logrus.WithError(err).Warn("GPU vortex init failed") + return nil, nil, nil + } + return dev, gpuParams, gv +} + +func initGPU(params *vortex_koalabear.Params) (*gpu.Device, *Params, *GPUVortex) { + return initGPUForRows(params, params.MaxNbRows) +} + +func writeSmartVectorRow(s smartvectors.SmartVector, dst []koalabear.Element) { + if reg, ok := s.(*smartvectors.Regular); ok { + copy(dst, []koalabear.Element(*reg)) + return + } + s.WriteInSlice(dst) +} + +func logGPUTiming(format string, args ...any) { + if gpuBenchTimings { + fmt.Printf("[gpu/vortex] "+format+"\n", args...) + } +} + +// PreWarmGPU creates the GPU pipeline during compilation so that the +// first Prove() call doesn't pay the ~5s initialization cost. +// Safe to call multiple times or when GPU is unavailable (no-op). +func PreWarmGPU(params *vortex_koalabear.Params) { + if params == nil { + return + } + initGPU(params) +} + +// PreWarmGPUForRows eagerly initializes a GPU pipeline sized for `maxRows`. +// Useful to avoid first-use latency in prover steps that commit matrices with +// round-specific row counts. +func PreWarmGPUForRows(params *vortex_koalabear.Params, maxRows int) { + if params == nil || maxRows <= 0 { + return + } + initGPUForRows(params, maxRows) +} + +// cloneSMTTreeFromRef converts a gnark-crypto Merkle tree into an smt_koalabear +// tree without rehashing, by deep-copying tree levels into SMT layout. +func cloneSMTTreeFromRef(src *refvortex.MerkleTree) *smt_koalabear.Tree { + if src == nil || len(src.Levels) == 0 { + return nil + } + depth := src.Depth() + + leaves := make([]field.Octuplet, len(src.Levels[depth])) + for i := range leaves { + leaves[i] = field.Octuplet(src.Levels[depth][i]) + } + + occupiedNodes := make([][]field.Octuplet, depth-1) + for level := 1; level < depth; level++ { + // SMT level 0 is just above leaves; ref tree stores levels from root. + srcLevel := src.Levels[depth-level] + dstLevel := make([]field.Octuplet, len(srcLevel)) + for i := range dstLevel { + dstLevel[i] = field.Octuplet(srcLevel[i]) + } + occupiedNodes[level-1] = dstLevel + } + + return &smt_koalabear.Tree{ + Depth: depth, + Root: field.Octuplet(src.Levels[0][0]), + OccupiedLeaves: leaves, + OccupiedNodes: occupiedNodes, + // Full trees never hit empty nodes while proving openings. + EmptyNodes: make([]field.Octuplet, depth-1), + } +} + +// ───────────────────────────────────────────────────────────────────────────── +// CommitSIS — GPU-accelerated commit with host-resident result +// ───────────────────────────────────────────────────────────────────────────── + +// CommitSIS commits on GPU and keeps the encoded matrix on device. +// +// Returns: +// - *CommitState: device-resident handle for LinComb + ExtractColumns +// - *smt_koalabear.Tree: Merkle tree (host-side) +// - []field.Element: SIS column hashes (nil if needSISHashes is false) +func CommitSIS( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, + needSISHashes bool, +) (*CommitState, *smt_koalabear.Tree, []field.Element) { + tAll := time.Now() + var ( + tInit, tCommit, tTree, tSIS time.Duration + usedCPUFallback bool + ) + + defer func() { + logGPUTiming( + "CommitSIS rows=%d cols=%d needSIS=%t cpuFallback=%t init=%v commit=%v tree=%v sis=%v total=%v", + len(polysMatrix), params.NbColumns, needSISHashes, usedCPUFallback, + tInit, tCommit, tTree, tSIS, time.Since(tAll), + ) + }() + + if params.Key.LogTwoBound() != gpuSISLogTwoBound { + usedCPUFallback = true + return commitSISCPU(params, polysMatrix) + } + + t0 := time.Now() + dev, _, gv := initGPU(params) + tInit = time.Since(t0) + if gv == nil { + usedCPUFallback = true + return commitSISCPU(params, polysMatrix) + } + + t0 = time.Now() + var ( + cs *CommitState + tree *smt_koalabear.Tree + sisHashes []field.Element + ) + err := gv.CommitDirectAndThen( + len(polysMatrix), + func(i int, dst []koalabear.Element) { + writeSmartVectorRow(polysMatrix[i], dst) + }, + func(state *CommitState, _ Hash) error { + cs = state + cs.dev = dev + tCommit = time.Since(t0) + + tTreeStart := time.Now() + tree = cloneSMTTreeFromRef(cs.MerkleTree()) + tTree = time.Since(tTreeStart) + if tree == nil { + return fmt.Errorf("tree conversion failed") + } + + if needSISHashes { + tSISStart := time.Now() + var extractErr error + sisHashes, extractErr = cs.ExtractSISHashes() + tSIS = time.Since(tSISStart) + if extractErr != nil { + return fmt.Errorf("extract SIS hashes: %w", extractErr) + } + } + + if err := cs.SnapshotEncoded(dev); err != nil { + return fmt.Errorf("snapshot encoded matrix: %w", err) + } + return nil + }, + ) + if tCommit == 0 { + tCommit = time.Since(t0) + } + if err != nil { + logrus.WithError(err).Warn("GPU CommitSIS failed, falling back to CPU") + usedCPUFallback = true + return commitSISCPU(params, polysMatrix) + } + return cs, tree, sisHashes +} + +// CommitSISRootOnly commits on GPU only to produce the Merkle tree/root and +// optional SIS hashes. It intentionally does not snapshot the encoded matrix. +// The caller can later recommit for UAlpha and selected-column extraction. +// +// Returns ok=false when the GPU path could not run; callers should fall back to +// the CPU commitment path so correctness is preserved. +func CommitSISRootOnly( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, + needSISHashes bool, +) (tree *smt_koalabear.Tree, sisHashes []field.Element, ok bool) { + if params.Key.LogTwoBound() != gpuSISLogTwoBound { + return nil, nil, false + } + + err := withCommitSISState(params, polysMatrix, func(cs *CommitState) error { + tree = cloneSMTTreeFromRef(cs.MerkleTree()) + if tree == nil { + return fmt.Errorf("tree conversion failed") + } + if !needSISHashes { + return nil + } + var err error + sisHashes, err = cs.ExtractSISHashes() + if err != nil { + return fmt.Errorf("extract SIS hashes: %w", err) + } + return nil + }) + if err != nil { + logrus.WithError(err).Warn("GPU CommitSISRootOnly failed") + return nil, nil, false + } + + return tree, sisHashes, true +} + +// CommitSISLinComb recommits a SIS round and returns its UAlpha contribution. +// It is the low-VRAM alternative to keeping a per-round encoded snapshot. +func CommitSISLinComb( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, + alpha fext.E4, +) ([]fext.E4, int, error) { + if params.Key.LogTwoBound() != gpuSISLogTwoBound { + return nil, 0, fmt.Errorf("unsupported SIS LogTwoBound=%d", params.Key.LogTwoBound()) + } + var partial []fext.E4 + err := withCommitSISState(params, polysMatrix, func(cs *CommitState) error { + var err error + partial, err = cs.LinComb(alpha) + return err + }) + return partial, len(polysMatrix), err +} + +// CommitSISExtractColumns recommits a SIS round and extracts only the selected +// encoded columns. It avoids retaining a multi-GiB encoded snapshot until the +// verifier's column-selection coin is known. +func CommitSISExtractColumns( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, + entries []int, +) ([][]field.Element, error) { + if params.Key.LogTwoBound() != gpuSISLogTwoBound { + return nil, fmt.Errorf("unsupported SIS LogTwoBound=%d", params.Key.LogTwoBound()) + } + var cols [][]field.Element + err := withCommitSISState(params, polysMatrix, func(cs *CommitState) error { + var err error + cols, err = cs.ExtractColumns(entries) + return err + }) + return cols, err +} + +func withCommitSISState( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, + use func(*CommitState) error, +) error { + dev, _, gv := initGPU(params) + if gv == nil { + return fmt.Errorf("GPU vortex unavailable") + } + + return gv.CommitDirectAndThen( + len(polysMatrix), + func(i int, dst []koalabear.Element) { + writeSmartVectorRow(polysMatrix[i], dst) + }, + func(cs *CommitState, _ Hash) error { + cs.dev = dev + return use(cs) + }, + ) +} + +// commitSISCPU is the CPU fallback. Commits on CPU and wraps the result. +func commitSISCPU( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, +) (*CommitState, *smt_koalabear.Tree, []field.Element) { + encoded, _, tree, colHashes := params.CommitMerkleWithSIS(polysMatrix) + cs := &CommitState{encodedMatrix: encoded, nRows: len(polysMatrix)} + return cs, tree, colHashes +} + +// ───────────────────────────────────────────────────────────────────────────── +// CommitMerkleWithSIS — legacy drop-in (full D2H of encoded matrix) +// ───────────────────────────────────────────────────────────────────────────── + +// gpuSISLogTwoBound is the SIS LogTwoBound value the GPU kernels are validated +// for. The SIS decomposition produces ceil(31/LogTwoBound) limbs per element; +// changing LogTwoBound alters buffer sizes and kernel indexing. Fall back to +// CPU for other values until the CUDA kernels are generalized. +const gpuSISLogTwoBound = 16 + +// CommitMerkleWithSIS is a GPU-accelerated drop-in replacement for +// vortex_koalabear.Params.CommitMerkleWithSIS. Returns the full encoded +// matrix on host (D2H transfer). Use CommitSIS for device-resident mode. +func CommitMerkleWithSIS( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, +) (vortex_koalabear.EncodedMatrix, vortex_koalabear.Commitment, *smt_koalabear.Tree, []field.Element) { + + if params.Key.LogTwoBound() != gpuSISLogTwoBound { + return params.CommitMerkleWithSIS(polysMatrix) + } + + _, _, gv := initGPU(params) + if gv == nil { + logrus.Warn("GPU not available, falling back to CPU CommitMerkleWithSIS") + return params.CommitMerkleWithSIS(polysMatrix) + } + + rows := materializeRows(polysMatrix) + encodedRows, colHashes, _, _, gpuTree, err := gv.CommitAndExtract(rows) + if err != nil { + logrus.WithError(err).Warn("GPU commit failed, falling back to CPU") + return params.CommitMerkleWithSIS(polysMatrix) + } + + encodedMatrix := make(vortex_koalabear.EncodedMatrix, len(rows)) + for i := range encodedRows { + encodedMatrix[i] = smartvectors.NewRegular(encodedRows[i]) + } + tree := cloneSMTTreeFromRef(gpuTree) + if tree == nil { + logrus.Warn("GPU Merkle tree conversion failed, falling back to CPU") + return params.CommitMerkleWithSIS(polysMatrix) + } + return encodedMatrix, tree.Root, tree, colHashes +} diff --git a/prover/gpu/vortex/commit_merkle_stub.go b/prover/gpu/vortex/commit_merkle_stub.go new file mode 100644 index 00000000000..0e89d0b556b --- /dev/null +++ b/prover/gpu/vortex/commit_merkle_stub.go @@ -0,0 +1,69 @@ +// CPU fallback stubs when CUDA is not available. + +//go:build !cuda + +package vortex + +import ( + "github.com/consensys/linea-monorepo/prover/crypto/state-management/smt_koalabear" + "github.com/consensys/linea-monorepo/prover/crypto/vortex/vortex_koalabear" + "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/maths/field" + "github.com/consensys/linea-monorepo/prover/maths/field/fext" +) + +// PreWarmGPU is a no-op without CUDA. +func PreWarmGPU(_ *vortex_koalabear.Params) {} + +// EvictPipelineCache is a no-op without CUDA. +func EvictPipelineCache() {} + +// EvictPipelineCacheForDevice is a no-op without CUDA. +func EvictPipelineCacheForDevice(_ int) {} + +// ReleasePinnedCache is a no-op without CUDA. +func ReleasePinnedCache(_ int) {} + +// CommitMerkleWithSIS delegates to the CPU implementation when CUDA is not available. +func CommitMerkleWithSIS( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, +) (vortex_koalabear.EncodedMatrix, vortex_koalabear.Commitment, *smt_koalabear.Tree, []field.Element) { + return params.CommitMerkleWithSIS(polysMatrix) +} + +// CommitSIS delegates to the CPU implementation when CUDA is not available. +// The needSISHashes parameter is ignored (CPU always computes hashes). +func CommitSIS( + params *vortex_koalabear.Params, + polysMatrix []smartvectors.SmartVector, + needSISHashes bool, +) (*CommitState, *smt_koalabear.Tree, []field.Element) { + encoded, _, tree, colHashes := params.CommitMerkleWithSIS(polysMatrix) + cs := &CommitState{encodedMatrix: encoded, nRows: len(polysMatrix)} + return cs, tree, colHashes +} + +func CommitSISRootOnly( + _ *vortex_koalabear.Params, + _ []smartvectors.SmartVector, + _ bool, +) (*smt_koalabear.Tree, []field.Element, bool) { + return nil, nil, false +} + +func CommitSISLinComb( + _ *vortex_koalabear.Params, + _ []smartvectors.SmartVector, + _ fext.Element, +) ([]fext.Element, int, error) { + panic("gpu/vortex: CommitSISLinComb requires cuda build tag") +} + +func CommitSISExtractColumns( + _ *vortex_koalabear.Params, + _ []smartvectors.SmartVector, + _ []int, +) ([][]field.Element, error) { + panic("gpu/vortex: CommitSISExtractColumns requires cuda build tag") +} diff --git a/prover/gpu/vortex/commit_merkle_test.go b/prover/gpu/vortex/commit_merkle_test.go new file mode 100644 index 00000000000..91048efe74a --- /dev/null +++ b/prover/gpu/vortex/commit_merkle_test.go @@ -0,0 +1,582 @@ +// Tests and benchmarks for GPU CommitMerkleWithSIS drop-in replacement. +// +// Compares GPU vortex.CommitMerkleWithSIS against the CPU +// vortex_koalabear.Params.CommitMerkleWithSIS using the same SmartVector +// inputs (mix of Constant and Regular SmartVectors, matching production). +// +// Parameters from protocol/compiler/standard_benchmark_test.go: +// - RS rate = 2, SIS(logTwoDegree=9, logTwoBound=16) +// - Matrix ~1B cells (1<<19 cols × 1<<11 rows) +// - ~15% constant rows (mimics SmartVector constant columns in production) + +//go:build cuda + +package vortex + +import ( + "fmt" + "math/rand/v2" + "testing" + "time" + + "github.com/consensys/gnark-crypto/field/koalabear" + "github.com/consensys/gnark-crypto/field/koalabear/sis" + "github.com/consensys/linea-monorepo/prover/crypto/vortex/vortex_koalabear" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/maths/field" + "github.com/stretchr/testify/require" +) + +const constFraction = 0.15 // ~15% constant rows + +// randSmartVectorMatrix creates nRows SmartVectors of length nCols. +// ~constFraction are Constant SmartVectors, the rest are Regular. +func randSmartVectorMatrix(rng *rand.Rand, nRows, nCols int) []smartvectors.SmartVector { + m := make([]smartvectors.SmartVector, nRows) + nConst := int(float64(nRows) * constFraction) + for i := range m { + if i < nConst { + v := randKB(rng) + m[i] = smartvectors.NewConstant(field.Element(v), nCols) + } else { + row := make([]field.Element, nCols) + for j := range row { + row[j] = field.Element(randKB(rng)) + } + m[i] = smartvectors.NewRegular(row) + } + } + return m +} + +// randMatrixMixed creates nRows × nCols with ~constFraction constant rows. +func randMatrixMixed(rng *rand.Rand, nRows, nCols int) [][]koalabear.Element { + m := make([][]koalabear.Element, nRows) + nConst := int(float64(nRows) * constFraction) + for i := range m { + m[i] = make([]koalabear.Element, nCols) + if i < nConst { + v := randKB(rng) + for j := range m[i] { + m[i][j] = v + } + } else { + for j := range m[i] { + m[i][j] = randKB(rng) + } + } + } + return m +} + +// ─── Drop-in CommitMerkleWithSIS: GPU vs CPU with SmartVectors ────────────── + +func TestCommitMerkleWithSIS_GPUvsCPU(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{0xCA})) + + nCols := 1024 + nRows := 128 + rate := 2 + + params := vortex_koalabear.NewParams(rate, nCols, nRows, 9, 16) + m := randSmartVectorMatrix(rng, nRows, nCols) + + // CPU + cpuEncoded, cpuCommit, cpuTree, cpuHashes := params.CommitMerkleWithSIS(m) + + // GPU + gpuEncoded, gpuCommit, gpuTree, gpuHashes := CommitMerkleWithSIS(¶ms, m) + + // Compare commitments (Merkle roots) + assert.Equal(cpuCommit, gpuCommit, "GPU commitment ≠ CPU commitment") + + // Compare tree roots + assert.Equal(cpuTree.Root, gpuTree.Root, "GPU tree root ≠ CPU tree root") + + // Compare SIS column hashes + assert.Equal(len(cpuHashes), len(gpuHashes), "SIS hash length mismatch") + for i := range cpuHashes { + assert.Equal(cpuHashes[i], gpuHashes[i], "SIS hash mismatch at index %d", i) + } + + // Compare encoded matrices + assert.Equal(len(cpuEncoded), len(gpuEncoded), "encoded matrix row count mismatch") + for i := range cpuEncoded { + cpuRow := smartvectors.IntoRegVec(cpuEncoded[i]) + gpuRow := smartvectors.IntoRegVec(gpuEncoded[i]) + assert.Equal(len(cpuRow), len(gpuRow), "encoded row %d length mismatch", i) + for j := range cpuRow { + assert.Equal(cpuRow[j], gpuRow[j], "encoded matrix mismatch at [%d][%d]", i, j) + } + } +} + +func TestCommitMerkleWithSIS_Diagnostic(t *testing.T) { + rng := rand.New(rand.NewChaCha8([32]byte{0xCA})) + + nCols := 256 + nRows := 32 + rate := 2 + + params := vortex_koalabear.NewParams(rate, nCols, nRows, 9, 16) + m := randSmartVectorMatrix(rng, nRows, nCols) + + // CPU path + cpuEncoded, _, _, cpuHashes := params.CommitMerkleWithSIS(m) + + // GPU path (using CommitMerkleWithSIS drop-in) + gpuEncoded, _, _, gpuHashes := CommitMerkleWithSIS(¶ms, m) + + // Step 1: Compare encoded matrices row by row + encMismatch := 0 + for i := 0; i < len(cpuEncoded); i++ { + cpuRow := smartvectors.IntoRegVec(cpuEncoded[i]) + gpuRow := smartvectors.IntoRegVec(gpuEncoded[i]) + if len(cpuRow) != len(gpuRow) { + t.Errorf("row %d length: CPU=%d GPU=%d", i, len(cpuRow), len(gpuRow)) + continue + } + for j := range cpuRow { + if cpuRow[j] != gpuRow[j] { + if encMismatch < 3 { + t.Logf("encoded[%d][%d]: CPU=0x%x GPU=0x%x", i, j, cpuRow[j][0], gpuRow[j][0]) + } + encMismatch++ + } + } + } + t.Logf("Encoded matrix mismatches: %d / %d", encMismatch, len(cpuEncoded)*nCols*rate) + + // Step 2: Compare SIS column hashes + sisMismatch := 0 + minLen := len(cpuHashes) + if len(gpuHashes) < minLen { + minLen = len(gpuHashes) + } + t.Logf("SIS hash count: CPU=%d GPU=%d", len(cpuHashes), len(gpuHashes)) + for i := 0; i < minLen; i++ { + if cpuHashes[i] != gpuHashes[i] { + if sisMismatch < 3 { + t.Logf("SIS[%d]: CPU=0x%x GPU=0x%x", i, cpuHashes[i][0], gpuHashes[i][0]) + } + sisMismatch++ + } + } + t.Logf("SIS hash mismatches: %d / %d", sisMismatch, minLen) +} + +func TestCommitMerkleWithSIS_GPUvsCPU_Rate4(t *testing.T) { + testCommitMerkleWithSISRate(t, 4) +} + +func TestCommitMerkleWithSIS_GPUvsCPU_Rate8(t *testing.T) { + testCommitMerkleWithSISRate(t, 8) +} + +func TestCommitMerkleWithSIS_GPUvsCPU_Rate16(t *testing.T) { + testCommitMerkleWithSISRate(t, 16) +} + +func testCommitMerkleWithSISRate(t *testing.T, rate int) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{byte(rate)})) + + nCols := 256 + nRows := 32 + + params := vortex_koalabear.NewParams(rate, nCols, nRows, 9, 16) + m := randSmartVectorMatrix(rng, nRows, nCols) + + // CPU + _, cpuCommit, cpuTree, cpuHashes := params.CommitMerkleWithSIS(m) + + // GPU + _, gpuCommit, gpuTree, gpuHashes := CommitMerkleWithSIS(¶ms, m) + + assert.Equal(cpuCommit, gpuCommit, "GPU commitment ≠ CPU commitment (rate=%d)", rate) + assert.Equal(cpuTree.Root, gpuTree.Root, "GPU tree root ≠ CPU tree root (rate=%d)", rate) + assert.Equal(len(cpuHashes), len(gpuHashes), "SIS hash length mismatch (rate=%d)", rate) + for i := range cpuHashes { + assert.Equal(cpuHashes[i], gpuHashes[i], "SIS hash mismatch at %d (rate=%d)", i, rate) + } +} + +// ─── Low-level GPU correctness tests (RS encoding + SIS + columns) ────────── + +func TestGPUEncodeAndSIS(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{0xCA})) + dev := newTestDevice(t) + + nCols := 1024 + nRows := 128 + rate := 2 + nSelected := 32 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrixMixed(rng, nRows, nCols) + + // CPU RS encode + cpuEncoded := make([][]koalabear.Element, nRows) + for i := range m { + cpuEncoded[i] = make([]koalabear.Element, nCols*rate) + params.EncodeReedSolomon(m[i], cpuEncoded[i]) + } + + // GPU commit + extract + gv, err := NewGPUVortex(dev, params, nRows) + assert.NoError(err) + defer gv.Free() + + cs, _, err := gv.Commit(m) + assert.NoError(err) + + gpuRows, err := cs.ExtractAllRows() + assert.NoError(err) + + // Compare RS-encoded rows + assert.Equal(len(cpuEncoded), len(gpuRows), "row count mismatch") + for i := range cpuEncoded { + for j := range cpuEncoded[i] { + assert.Equal(cpuEncoded[i][j], gpuRows[i][j], "encoded[%d][%d] mismatch", i, j) + } + } + + // Compare SIS hashes + gpuSIS, err := cs.ExtractSISHashes() + assert.NoError(err) + + degree := sisParams.Degree + scw := nCols * rate + cpuSIS := make([]koalabear.Element, scw*degree) + for col := 0; col < scw; col++ { + column := make([]koalabear.Element, nRows) + for row := 0; row < nRows; row++ { + column[row] = cpuEncoded[row][col] + } + sisParams.Hash(column, cpuSIS[col*degree:(col+1)*degree]) + } + assert.Equal(len(cpuSIS), len(gpuSIS), "SIS hash length mismatch") + for i := range cpuSIS { + assert.Equal(cpuSIS[i], gpuSIS[i], "SIS[%d] mismatch", i) + } + + // Compare leaves (GPU MD hash vs CPU CompressPoseidon2x16) + gpuLeaves, err := cs.ExtractLeaves() + assert.NoError(err) + + cpuLeaves := make([]Hash, scw) + n16 := scw / 16 + for c := 0; c < n16; c++ { + start := c * 16 * degree + CompressPoseidon2x16(gpuSIS[start:start+16*degree], degree, cpuLeaves[c*16:(c+1)*16]) + } + for i := 0; i < scw; i++ { + assert.Equal(cpuLeaves[i], gpuLeaves[i], "leaf[%d] mismatch", i) + } +} + +func TestGPUColumnExtraction(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{0xBE})) + dev := newTestDevice(t) + + nCols := 256 + nRows := 32 + rate := 2 + nSelected := 8 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrixMixed(rng, nRows, nCols) + + gv, err := NewGPUVortex(dev, params, nRows) + assert.NoError(err) + defer gv.Free() + + cs, _, err := gv.Commit(m) + assert.NoError(err) + + allRows, err := cs.ExtractAllRows() + assert.NoError(err) + + // Extract specific columns and verify against full matrix + selectedCols := make([]int, nSelected) + for i := range selectedCols { + selectedCols[i] = rng.IntN(nCols*rate - 1) + } + cols, err := cs.ExtractColumns(selectedCols) + assert.NoError(err) + + for i, c := range selectedCols { + for row := 0; row < nRows; row++ { + assert.Equal(allRows[row][c], cols[i][row], + "col extraction mismatch: col=%d row=%d", c, row) + } + } +} + +func TestEvictPipelineCacheForDeviceWaitsForActiveCommit(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{0xE7})) + dev := newTestDevice(t) + deviceID := dev.DeviceID() + t.Cleanup(func() { EvictPipelineCacheForDevice(deviceID) }) + + nCols := 64 + nRows := 16 + rate := 2 + nSelected := 8 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + gv, err := getOrCreateGPUVortex(dev, deviceID, params, nRows) + assert.NoError(err) + + m := randMatrixMixed(rng, nRows, nCols) + inUse := make(chan struct{}) + release := make(chan struct{}) + doneCommit := make(chan error, 1) + + go func() { + doneCommit <- gv.CommitDirectAndThen( + nRows, + func(i int, dst []koalabear.Element) { + copy(dst, m[i]) + }, + func(cs *CommitState, _ Hash) error { + close(inUse) + <-release + _, err := cs.ExtractSISHashes() + return err + }, + ) + }() + + <-inUse + evicted := make(chan struct{}) + go func() { + EvictPipelineCacheForDevice(deviceID) + close(evicted) + }() + + select { + case <-evicted: + close(release) + assert.NoError(<-doneCommit) + t.Fatal("pipeline eviction returned while CommitDirectAndThen was active") + case <-time.After(100 * time.Millisecond): + } + + close(release) + assert.NoError(<-doneCommit) + + select { + case <-evicted: + case <-time.After(2 * time.Second): + t.Fatal("pipeline eviction did not complete after active commit returned") + } +} + +// ─── Benchmarks (drop-in CommitMerkleWithSIS) ─────────────────────────────── + +// BenchmarkCommitMerkleWithSIS_Small: 4096 × 256 ≈ 1M cells, rate=2. +func BenchmarkCommitMerkleWithSIS_Small(b *testing.B) { + benchCommitMerkleWithSIS(b, 4096, 256, 2) +} + +// BenchmarkCommitMerkleWithSIS_Typical: 16384 × 256, rate=2. +func BenchmarkCommitMerkleWithSIS_Typical(b *testing.B) { + benchCommitMerkleWithSIS(b, 16384, 256, 2) +} + +// BenchmarkCommitMerkleWithSIS_MedLarge: 1<<16 × 1<<10, rate=2. +func BenchmarkCommitMerkleWithSIS_MedLarge(b *testing.B) { + benchCommitMerkleWithSIS(b, 1<<16, 1<<10, 2) +} + +// BenchmarkCommitMerkleWithSIS_Large: 1<<19 × 1<<11 ≈ 1B cells, rate=2. +func BenchmarkCommitMerkleWithSIS_Large(b *testing.B) { + benchCommitMerkleWithSIS(b, 1<<19, 1<<11, 2) +} + +func benchCommitMerkleWithSIS(b *testing.B, nCols, nRows, rate int) { + rng := rand.New(rand.NewChaCha8([32]byte{})) + inputBytes := int64(nCols * nRows * 4) + + params := vortex_koalabear.NewParams(rate, nCols, nRows, 9, 16) + m := randSmartVectorMatrix(rng, nRows, nCols) + + b.Logf("matrix: %dx%d (%s cells, %s encoded, %s input, %.0f%% const rows)", + nCols, nRows, + fmtCount(int64(nCols)*int64(nRows)), + fmtCount(int64(nCols)*int64(nRows)*int64(rate)), + fmtBytes(inputBytes), + constFraction*100) + + // Warmup GPU + CommitMerkleWithSIS(¶ms, m) + + b.Run("GPU_CommitMerkleWithSIS", func(b *testing.B) { + b.SetBytes(inputBytes) + b.ResetTimer() + for i := 0; i < b.N; i++ { + CommitMerkleWithSIS(¶ms, m) + } + }) + + b.Run("CPU_CommitMerkleWithSIS", func(b *testing.B) { + b.SetBytes(inputBytes) + b.ResetTimer() + for i := 0; i < b.N; i++ { + params.CommitMerkleWithSIS(m) + } + }) + + b.Run("Speedup", func(b *testing.B) { + var gpuTotal, cpuTotal time.Duration + for i := 0; i < b.N; i++ { + start := time.Now() + CommitMerkleWithSIS(¶ms, m) + gpuTotal += time.Since(start) + + start = time.Now() + params.CommitMerkleWithSIS(m) + cpuTotal += time.Since(start) + } + n := time.Duration(b.N) + gpuAvg := gpuTotal / n + cpuAvg := cpuTotal / n + speedup := float64(cpuAvg) / float64(gpuAvg) + b.ReportMetric(float64(gpuAvg.Milliseconds()), "gpu_ms") + b.ReportMetric(float64(cpuAvg.Milliseconds()), "cpu_ms") + b.ReportMetric(speedup, "speedup_x") + }) +} + +// ─── Low-level benchmarks (GPU commit only, no D2H) ──────────────────────── + +func BenchmarkCommitMerkle_Large(b *testing.B) { + benchCommitMerkle(b, 1<<19, 1<<11, 2) +} + +func BenchmarkCommitMerkle_Medium(b *testing.B) { + benchCommitMerkle(b, 1<<18, 1<<12, 2) +} + +func BenchmarkCommitMerkle_Rate8(b *testing.B) { + benchCommitMerkle(b, 1<<16, 1<<11, 8) +} + +func benchCommitMerkle(b *testing.B, nCols, nRows, rate int) { + rng := rand.New(rand.NewChaCha8([32]byte{})) + nSelected := min(256, nCols*rate/4) + + sisParams, _ := sis.NewRSis(0, 9, 16, nRows) + params, _ := NewParams(nCols, nRows, sisParams, rate, nSelected) + + m := randMatrixMixed(rng, nRows, nCols) + inputBytes := int64(nCols * nRows * 4) + + b.Logf("matrix: %dx%d (%s cells, %s encoded, %s input, %.0f%% const rows)", + nCols, nRows, + fmtCount(int64(nCols)*int64(nRows)), + fmtCount(int64(nCols)*int64(nRows)*int64(rate)), + fmtBytes(inputBytes), + constFraction*100) + + dev, err := gpu.New() + if err != nil { + b.Fatal(err) + } + defer dev.Close() + + gv, err := NewGPUVortex(dev, params, nRows) + if err != nil { + b.Fatal(err) + } + defer gv.Free() + + if _, _, err := gv.Commit(m); err != nil { + b.Fatal(err) + } + + b.Run("GPU", func(b *testing.B) { + b.SetBytes(inputBytes) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, _, err := gv.Commit(m); err != nil { + b.Fatal(err) + } + } + }) + + b.Run("CPU", func(b *testing.B) { + b.SetBytes(inputBytes) + b.ResetTimer() + for i := 0; i < b.N; i++ { + if _, _, err := params.Commit(m); err != nil { + b.Fatal(err) + } + } + }) + + b.Run("Speedup", func(b *testing.B) { + var gpuTotal, cpuTotal time.Duration + for i := 0; i < b.N; i++ { + start := time.Now() + _, _, _ = gv.Commit(m) + gpuTotal += time.Since(start) + + start = time.Now() + _, _, _ = params.Commit(m) + cpuTotal += time.Since(start) + } + n := time.Duration(b.N) + gpuAvg := gpuTotal / n + cpuAvg := cpuTotal / n + speedup := float64(cpuAvg) / float64(gpuAvg) + b.ReportMetric(float64(gpuAvg.Milliseconds()), "gpu_ms") + b.ReportMetric(float64(cpuAvg.Milliseconds()), "cpu_ms") + b.ReportMetric(speedup, "speedup_x") + }) +} + +// ─── Helpers ───────────────────────────────────────────────────────────────── + +func fmtCount(n int64) string { + switch { + case n >= 1<<30: + return fmt.Sprintf("%.1fG", float64(n)/float64(1<<30)) + case n >= 1<<20: + return fmt.Sprintf("%.1fM", float64(n)/float64(1<<20)) + default: + return fmt.Sprintf("%d", n) + } +} + +func fmtBytes(n int64) string { + switch { + case n >= 1<<30: + return fmt.Sprintf("%.1f GiB", float64(n)/float64(1<<30)) + case n >= 1<<20: + return fmt.Sprintf("%.1f MiB", float64(n)/float64(1<<20)) + default: + return fmt.Sprintf("%d B", n) + } +} diff --git a/prover/gpu/vortex/gpu.go b/prover/gpu/vortex/gpu.go new file mode 100644 index 00000000000..bd5d1a2f5f8 --- /dev/null +++ b/prover/gpu/vortex/gpu.go @@ -0,0 +1,1453 @@ +// GPU wrappers for KoalaBear vector operations, NTT, Poseidon2, SIS, and Vortex commit. +// +// Build constraint: requires CGO + CUDA library. +// Without CGO, the package falls back to pure Go (gnark-crypto CPU) via vortex.go. + +//go:build cuda + +package vortex + +/* +#cgo LDFLAGS: -L${SRCDIR}/../cuda/build -lgnark_gpu -L/usr/local/cuda/lib64 -lcudart -lstdc++ -lm +#cgo CFLAGS: -I${SRCDIR}/../cuda/include + +#include "gnark_gpu.h" +#include "gnark_gpu_kb.h" +#include +#include +*/ +import "C" +import ( + "fmt" + "math/bits" + "runtime" + "sync" + "unsafe" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/consensys/gnark-crypto/field/koalabear/fft" + "github.com/consensys/gnark-crypto/field/koalabear/poseidon2" + refvortex "github.com/consensys/gnark-crypto/field/koalabear/vortex" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" + "github.com/consensys/linea-monorepo/prover/utils/parallel" +) + +// devCtx casts the common gpu.Device handle back to the C type for CGO calls. +func devCtx(d *gpu.Device) C.gnark_gpu_context_t { + return C.gnark_gpu_context_t(d.Handle()) +} + +// ───────────────────────────────────────────────────────────────────────────── +// KBVector — KoalaBear vector on GPU +// ───────────────────────────────────────────────────────────────────────────── + +type KBVector struct { + dev *gpu.Device + handle C.kb_vec_t + n int +} + +func NewKBVector(d *gpu.Device, n int) (*KBVector, error) { + var h C.kb_vec_t + if err := kbError(C.kb_vec_alloc(devCtx(d), C.size_t(n), &h)); err != nil { + return nil, err + } + v := &KBVector{dev: d, handle: h, n: n} + runtime.SetFinalizer(v, (*KBVector).Free) + return v, nil +} + +func (v *KBVector) Free() { + if v.handle != nil { + C.kb_vec_free(v.handle) + v.handle = nil + } +} + +func (v *KBVector) Len() int { return v.n } + +func (v *KBVector) CopyFromHost(src []koalabear.Element) { + if len(src) != v.n { + panic(fmt.Sprintf("vortex: CopyFromHost size mismatch: got %d, want %d", len(src), v.n)) + } + ptr := (*C.uint32_t)(unsafe.Pointer(&src[0])) + if err := kbError(C.kb_vec_h2d(devCtx(v.dev), v.handle, ptr, C.size_t(v.n))); err != nil { + panic("vortex: CopyFromHost: " + err.Error()) + } +} + +// CopyFromHostPinned copies from a pre-pinned host buffer (allocated by AllocPinned). +// Much faster than CopyFromHost for large buffers (DMA without staging). +func (v *KBVector) CopyFromHostPinned(src []koalabear.Element) { + if len(src) != v.n { + panic(fmt.Sprintf("vortex: CopyFromHostPinned size mismatch: got %d, want %d", len(src), v.n)) + } + ptr := (*C.uint32_t)(unsafe.Pointer(&src[0])) + if err := kbError(C.kb_vec_h2d_pinned(devCtx(v.dev), v.handle, ptr, C.size_t(v.n))); err != nil { + panic("vortex: CopyFromHostPinned: " + err.Error()) + } +} + +// AllocPinned allocates page-locked host memory for fast H2D. +// Returns a Go slice backed by CUDA pinned memory. Free with FreePinned. +func AllocPinned(n int) []koalabear.Element { + var ptr *C.uint32_t + if err := kbError(C.kb_pinned_alloc(C.size_t(n*4), &ptr)); err != nil { // 4 bytes per element + panic("vortex: AllocPinned: " + err.Error()) + } + return unsafe.Slice((*koalabear.Element)(unsafe.Pointer(ptr)), n) +} + +// FreePinned frees memory allocated by AllocPinned. +func FreePinned(buf []koalabear.Element) { + if len(buf) > 0 { + C.kb_pinned_free((*C.uint32_t)(unsafe.Pointer(&buf[0]))) + } +} + +func (v *KBVector) CopyToHost(dst []koalabear.Element) { + if len(dst) != v.n { + panic(fmt.Sprintf("vortex: CopyToHost size mismatch: got %d, want %d", len(dst), v.n)) + } + ptr := (*C.uint32_t)(unsafe.Pointer(&dst[0])) + if err := kbError(C.kb_vec_d2h(devCtx(v.dev), ptr, v.handle, C.size_t(v.n))); err != nil { + panic("vortex: CopyToHost: " + err.Error()) + } +} + +func (v *KBVector) Add(a, b *KBVector) { + must(C.kb_vec_add(devCtx(v.dev), v.handle, a.handle, b.handle)) +} +func (v *KBVector) Sub(a, b *KBVector) { + must(C.kb_vec_sub(devCtx(v.dev), v.handle, a.handle, b.handle)) +} +func (v *KBVector) Mul(a, b *KBVector) { + must(C.kb_vec_mul(devCtx(v.dev), v.handle, a.handle, b.handle)) +} + +func (v *KBVector) Scale(scalar koalabear.Element) { + must(C.kb_vec_scale(devCtx(v.dev), v.handle, C.uint32_t(scalar[0]))) +} + +func (v *KBVector) ScaleByPowers(g koalabear.Element) { + must(C.kb_vec_scale_by_powers(devCtx(v.dev), v.handle, C.uint32_t(g[0]))) +} + +// BitReverse applies the bit-reversal permutation in-place. +// Required because GPU NTT uses DIF/DIT without internal bit-reversal: +// +// IFFT: bitrev(input) → kb_ntt_inv → natural-order coefficients +// FFT: kb_ntt_fwd → bitrev(output) → natural-order evaluations +func (v *KBVector) BitReverse() { + must(C.kb_vec_bitrev(devCtx(v.dev), v.handle)) +} + +// Sync waits for all queued GPU operations on the default stream to complete. +func Sync(d *gpu.Device) { + must(C.kb_sync(devCtx(d))) +} + +// D2DRaw copies n uint32 from src to dst device pointers (async, no sync). +func D2DRaw(d *gpu.Device, dst, src unsafe.Pointer, n int) { + must(C.kb_vec_d2d_offset(devCtx(d), (*C.uint32_t)(dst), (*C.uint32_t)(src), C.size_t(n))) +} + +// CosetFFTRaw applies coset forward NTT on raw device pointer (async, no sync). +func (f *GPUFFTDomain) CosetFFTRaw(data unsafe.Pointer, g koalabear.Element) { + must(C.kb_ntt_coset_fwd_raw(devCtx(f.dev), f.handle, (*C.uint32_t)(data), C.uint32_t(g[0]))) +} + +// BitRevRaw applies bit-reversal on raw device pointer of n elements (async, no sync). +func BitRevRaw(d *gpu.Device, data unsafe.Pointer, n int) { + must(C.kb_vec_bitrev_raw(devCtx(d), (*C.uint32_t)(data), C.size_t(n))) +} + +// D2HRaw copies n uint32 from device src to host dst (synchronous). +func D2HRaw(d *gpu.Device, dst []koalabear.Element, src unsafe.Pointer, n int) { + must(C.kb_vec_d2h_raw(devCtx(d), (*C.uint32_t)(unsafe.Pointer(&dst[0])), (*C.uint32_t)(src), C.size_t(n))) +} + +// CopyFromDevice2 copies n elements starting at offset srcOff from src to this vector. +func (v *KBVector) CopyFromDevice2(src *KBVector, srcOff int) { + if srcOff+v.n > src.n { + panic(fmt.Sprintf("vortex: CopyFromDevice2 bounds: srcOff=%d n=%d src.n=%d", srcOff, v.n, src.n)) + } + // Use raw device pointer arithmetic + srcDevPtr := unsafe.Add(src.DevicePtr(), srcOff*4) // 4 bytes per uint32 + dstDevPtr := v.DevicePtr() + must(C.kb_vec_d2d_offset(devCtx(v.dev), + (*C.uint32_t)(dstDevPtr), + (*C.uint32_t)(srcDevPtr), + C.size_t(v.n))) +} + +// CopyFromDevice copies data from another device-resident KBVector (GPU→GPU, same size). +func (v *KBVector) CopyFromDevice(src *KBVector) { + if v.n != src.n { + panic(fmt.Sprintf("vortex: CopyFromDevice size mismatch: got %d, want %d", src.n, v.n)) + } + if err := kbError(C.kb_vec_d2d(devCtx(v.dev), v.handle, src.handle)); err != nil { + panic("vortex: CopyFromDevice: " + err.Error()) + } +} + +// DevicePtr returns the raw device pointer for cross-package access (e.g. symbolic eval). +func (v *KBVector) DevicePtr() unsafe.Pointer { + return unsafe.Pointer(C.kb_vec_device_ptr(v.handle)) +} + +// ───────────────────────────────────────────────────────────────────────────── +// FFTDomain — NTT twiddles on GPU +// ───────────────────────────────────────────────────────────────────────────── + +type GPUFFTDomain struct { + dev *gpu.Device + handle C.kb_ntt_t + n int +} + +func NewGPUFFTDomain(d *gpu.Device, size int) (*GPUFFTDomain, error) { + domain := fft.NewDomain(uint64(size)) + halfN := size / 2 + + fwdTw := make([]koalabear.Element, halfN) + invTw := make([]koalabear.Element, halfN) + fwdTw[0].SetOne() + invTw[0].SetOne() + gen := domain.Generator + genInv := domain.GeneratorInv + for i := 1; i < halfN; i++ { + fwdTw[i].Mul(&fwdTw[i-1], &gen) + invTw[i].Mul(&invTw[i-1], &genInv) + } + + var h C.kb_ntt_t + fptr := (*C.uint32_t)(unsafe.Pointer(&fwdTw[0])) + iptr := (*C.uint32_t)(unsafe.Pointer(&invTw[0])) + if err := kbError(C.kb_ntt_init(devCtx(d), C.size_t(size), fptr, iptr, &h)); err != nil { + return nil, err + } + return &GPUFFTDomain{dev: d, handle: h, n: size}, nil +} + +func (f *GPUFFTDomain) Free() { + if f.handle != nil { + C.kb_ntt_free(f.handle) + f.handle = nil + } +} + +func (f *GPUFFTDomain) FFT(v *KBVector) { must(C.kb_ntt_fwd(devCtx(f.dev), f.handle, v.handle)) } + +// BatchCosetFFTBitRev applies coset forward NTT + bit-reversal to `batch` packed vectors. +// `data` must contain batch*n elements packed contiguously. Single CGO call. +func (f *GPUFFTDomain) BatchCosetFFTBitRev(data *KBVector, batch int, g koalabear.Element) { + must(C.kb_ntt_batch_coset_fwd_bitrev(devCtx(f.dev), f.handle, + (*C.uint32_t)(data.DevicePtr()), C.size_t(f.n), C.size_t(batch), C.uint32_t(g[0]))) +} + +// BatchIFFTScale applies bit-reversal + inverse NTT + scale(nInv) to `batch` packed vectors. +func (f *GPUFFTDomain) BatchIFFTScale(data *KBVector, batch int, nInv koalabear.Element) { + must(C.kb_ntt_batch_ifft_scale(devCtx(f.dev), f.handle, + (*C.uint32_t)(data.DevicePtr()), C.size_t(f.n), C.size_t(batch), C.uint32_t(nInv[0]))) +} + +func (f *GPUFFTDomain) FFTInverse(v *KBVector) { must(C.kb_ntt_inv(devCtx(f.dev), f.handle, v.handle)) } +func (f *GPUFFTDomain) CosetFFT(v *KBVector, g koalabear.Element) { + must(C.kb_ntt_coset_fwd(devCtx(f.dev), f.handle, v.handle, C.uint32_t(g[0]))) +} + +// ───────────────────────────────────────────────────────────────────────────── +// E4 NTT — FFT on KoalaBear extension field (degree-4) +// +// E4 elements are (B0.A0, B0.A1, B1.A0, B1.A1) — 4 base-field components. +// An E4 NTT decomposes into 4 independent base-field NTTs (one per component). +// +// Data layout on GPU (SoA): for n E4 elements, stored as 4*n base-field +// elements in 4 contiguous blocks: [A0(0..n), A1(0..n), A2(0..n), A3(0..n)]. +// +// The GPU pipeline: AoS→SoA transpose → 4× batch NTT → SoA→AoS transpose. +// ───────────────────────────────────────────────────────────────────────────── + +// FFTE4 performs forward NTT on n E4 elements. +// Input: a in natural order (AoS). Output: a in bit-reversed order (AoS). +// Decomposes into 4 independent base-field NTTs (one per E4 component). +func (f *GPUFFTDomain) FFTE4(a []fext.E4) { + n := f.n + if len(a) != n { + panic(fmt.Sprintf("vortex: FFTE4 size mismatch: got %d, want %d", len(a), n)) + } + soa := e4AoSToSoA(a) + vecs := allocE4Components(f.dev, n) + defer freeE4Components(vecs) + copyE4SoAToGPU(vecs, soa, n) + for c := 0; c < 4; c++ { + f.FFT(vecs[c]) + } + Sync(f.dev) + copyE4GPUToSoA(vecs, soa, n) + e4SoAToAoS(soa, a) +} + +// FFTInverseE4 performs inverse NTT on n E4 elements. +// Input: a in bit-reversed order (AoS). Output: a in natural order (AoS). +// Note: like the base-field GPU IFFT, this does NOT include 1/n scaling. +func (f *GPUFFTDomain) FFTInverseE4(a []fext.E4) { + n := f.n + if len(a) != n { + panic(fmt.Sprintf("vortex: FFTInverseE4 size mismatch: got %d, want %d", len(a), n)) + } + soa := e4AoSToSoA(a) + vecs := allocE4Components(f.dev, n) + defer freeE4Components(vecs) + copyE4SoAToGPU(vecs, soa, n) + for c := 0; c < 4; c++ { + f.FFTInverse(vecs[c]) + } + Sync(f.dev) + copyE4GPUToSoA(vecs, soa, n) + e4SoAToAoS(soa, a) +} + +// CosetFFTE4 performs coset forward NTT on n E4 elements. +// Input: a holds coefficients in natural order (AoS). +// Output: a holds evaluations on coset g·H in natural order (AoS). +// Internally: ScaleByPowers(g) + forward NTT + bit-reversal, per component. +func (f *GPUFFTDomain) CosetFFTE4(a []fext.E4, g koalabear.Element) { + n := f.n + if len(a) != n { + panic(fmt.Sprintf("vortex: CosetFFTE4 size mismatch: got %d, want %d", len(a), n)) + } + soa := e4AoSToSoA(a) + vecs := allocE4Components(f.dev, n) + defer freeE4Components(vecs) + copyE4SoAToGPU(vecs, soa, n) + for c := 0; c < 4; c++ { + f.CosetFFT(vecs[c], g) + } + Sync(f.dev) + copyE4GPUToSoA(vecs, soa, n) + e4SoAToAoS(soa, a) +} + +// BatchCosetFFTE4BitRev performs coset FFT + bit-reversal on E4 data in SoA layout. +// data must contain nE4*4 base-field elements (4 component vectors of length nE4). +// This is the zero-copy variant for use in pipelines that manage their own GPU buffers. +func (f *GPUFFTDomain) BatchCosetFFTE4BitRev(data *KBVector, nE4 int, g koalabear.Element) { + if data.Len() != nE4*4 { + panic(fmt.Sprintf("vortex: BatchCosetFFTE4BitRev size mismatch: got %d, want %d", data.Len(), nE4*4)) + } + must(C.kb_ntt_batch_coset_fwd_bitrev(devCtx(f.dev), f.handle, + (*C.uint32_t)(data.DevicePtr()), C.size_t(nE4), C.size_t(4), C.uint32_t(g[0]))) +} + +// BatchIFFTScaleE4 performs batch IFFT + scale on E4 data in SoA layout. +// data must contain nE4*4 base-field elements. +func (f *GPUFFTDomain) BatchIFFTScaleE4(data *KBVector, nE4 int, nInv koalabear.Element) { + if data.Len() != nE4*4 { + panic(fmt.Sprintf("vortex: BatchIFFTScaleE4 size mismatch: got %d, want %d", data.Len(), nE4*4)) + } + must(C.kb_ntt_batch_ifft_scale(devCtx(f.dev), f.handle, + (*C.uint32_t)(data.DevicePtr()), C.size_t(nE4), C.size_t(4), C.uint32_t(nInv[0]))) +} + +// ── E4 NTT helpers ────────────────────────────────────────────────────────── + +// e4AoSToSoA transposes n E4 elements from AoS to SoA layout. +// AoS: [e0.B0.A0, e0.B0.A1, e0.B1.A0, e0.B1.A1, e1.B0.A0, ...] +// SoA: [all B0.A0s | all B0.A1s | all B1.A0s | all B1.A1s] +func e4AoSToSoA(a []fext.E4) []koalabear.Element { + n := len(a) + soa := make([]koalabear.Element, n*4) + d0, d1, d2, d3 := soa[:n], soa[n:2*n], soa[2*n:3*n], soa[3*n:] + for i := range a { + d0[i] = a[i].B0.A0 + d1[i] = a[i].B0.A1 + d2[i] = a[i].B1.A0 + d3[i] = a[i].B1.A1 + } + return soa +} + +// e4SoAToAoS transposes SoA back to AoS, writing into the provided slice. +func e4SoAToAoS(soa []koalabear.Element, a []fext.E4) { + n := len(a) + d0, d1, d2, d3 := soa[:n], soa[n:2*n], soa[2*n:3*n], soa[3*n:] + for i := range a { + a[i].B0.A0 = d0[i] + a[i].B0.A1 = d1[i] + a[i].B1.A0 = d2[i] + a[i].B1.A1 = d3[i] + } +} + +// allocE4Components allocates 4 KBVectors (one per E4 component). +func allocE4Components(dev *gpu.Device, n int) [4]*KBVector { + var vecs [4]*KBVector + for c := 0; c < 4; c++ { + v, err := NewKBVector(dev, n) + if err != nil { + for j := 0; j < c; j++ { + vecs[j].Free() + } + panic("vortex: allocE4Components: " + err.Error()) + } + vecs[c] = v + } + return vecs +} + +func freeE4Components(vecs [4]*KBVector) { + for _, v := range vecs { + v.Free() + } +} + +func copyE4SoAToGPU(vecs [4]*KBVector, soa []koalabear.Element, n int) { + for c := 0; c < 4; c++ { + vecs[c].CopyFromHost(soa[c*n : (c+1)*n]) + } +} + +func copyE4GPUToSoA(vecs [4]*KBVector, soa []koalabear.Element, n int) { + for c := 0; c < 4; c++ { + vecs[c].CopyToHost(soa[c*n : (c+1)*n]) + } + Sync(vecs[0].dev) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Poseidon2 — GPU batch hashing +// ───────────────────────────────────────────────────────────────────────────── + +type GPUPoseidon2 struct { + dev *gpu.Device + handle C.kb_p2_t + width int +} + +// NewGPUPoseidon2 creates a Poseidon2 instance with standard parameters (rf=6, rp=21). +func NewGPUPoseidon2(d *gpu.Device, width int) (*GPUPoseidon2, error) { + const ( + rf = 6 + rp = 21 + ) + params := poseidon2.NewParameters(width, rf, rp) + + // Flatten round keys + var flat []koalabear.Element + for _, rk := range params.RoundKeys { + flat = append(flat, rk...) + } + + // Get internal MDS diagonal + diag := poseidon2Diag(width) + + var h C.kb_p2_t + rkPtr := (*C.uint32_t)(unsafe.Pointer(&flat[0])) + dPtr := (*C.uint32_t)(unsafe.Pointer(&diag[0])) + if err := kbError(C.kb_p2_init(devCtx(d), C.int(width), C.int(rf), C.int(rp), rkPtr, dPtr, &h)); err != nil { + return nil, err + } + return &GPUPoseidon2{dev: d, handle: h, width: width}, nil +} + +func (p *GPUPoseidon2) Free() { + if p.handle != nil { + C.kb_p2_free(p.handle) + p.handle = nil + } +} + +func (p *GPUPoseidon2) CompressBatch(input []koalabear.Element, count int) []Hash { + if p.width != 16 { + panic("vortex: CompressBatch requires width=16 Poseidon2") + } + output := make([]Hash, count) + iptr := (*C.uint32_t)(unsafe.Pointer(&input[0])) + optr := (*C.uint32_t)(unsafe.Pointer(&output[0])) + must(C.kb_p2_compress_batch(devCtx(p.dev), p.handle, iptr, optr, C.size_t(count))) + return output +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPU linear combination +// ───────────────────────────────────────────────────────────────────────────── + +func GPULinCombE4(dev *gpu.Device, rows []*KBVector, alpha fext.E4, nCols int) []fext.E4 { + nRows := len(rows) + handles := make([]C.kb_vec_t, nRows) + for i, r := range rows { + handles[i] = r.handle + } + alphaRaw := [4]C.uint32_t{ + C.uint32_t(alpha.B0.A0[0]), + C.uint32_t(alpha.B0.A1[0]), + C.uint32_t(alpha.B1.A0[0]), + C.uint32_t(alpha.B1.A1[0]), + } + result := make([]fext.E4, nCols) + rptr := (*C.uint32_t)(unsafe.Pointer(&result[0])) + must(C.kb_lincomb_e4(devCtx(dev), &handles[0], C.size_t(nRows), C.size_t(nCols), &alphaRaw[0], rptr)) + return result +} + +// ───────────────────────────────────────────────────────────────────────────── +// GPUVortex — GPU-accelerated Vortex commit +// ───────────────────────────────────────────────────────────────────────────── +// +// Full GPU pipeline: RS encode (batch NTT) + SIS hash + Poseidon2 + Merkle. +// Raw rows are uploaded to GPU; RS encoding, hashing, and tree construction +// all run on device. Encoded matrix stays on GPU for Prove (lincomb + column +// extraction), eliminating the 8 GiB D2H bottleneck. +// +// gv, _ := NewGPUVortex(dev, params, nRows) +// defer gv.Free() +// cs, root, _ := gv.Commit(rows) +// proof, _ := cs.Prove(alpha, selectedCols) + +type GPUVortex struct { + // Serialises all uses of this pipeline. The pinned host buffers + // (inputBuf, treeBuf) and the device buffers inside `pipeline` are + // shared across calls; concurrent gv.Commit() / gv.CommitDirect() + // invocations would race on inputBuf and on the pipeline's d_work / + // d_encoded_col / d_tree, leading to SIGSEGVs in the cgo layer. + // + // Each pinGPU'd segment goroutine normally owns its device's cached + // pipeline outright, but when the conglomeration merger runs in + // parallel with GL/LPP workers and shares a device (because nWorkers + // > nGPUs), this mutex prevents the cgo race. + mu sync.Mutex + + dev *gpu.Device + sis C.kb_sis_t + p2s C.kb_p2_t // width=24 sponge (SIS→leaf) + p2c C.kb_p2_t // width=16 compress (Merkle tree) + pipeline C.kb_vortex_pipeline_t + params *Params + + // Pinned host buffers (zero-copy Go slices over cudaMallocHost memory). + inputBuf []koalabear.Element // [maxNRows × nCols], raw input rows + treeBuf []Hash // [2·np − 1] +} + +// NewGPUVortex initializes GPU resources for Vortex commit. +// maxNRows is the maximum number of rows that will be passed to Commit(). +// Pre-allocates all device buffers + RS domain data for zero-allocation commits. +func NewGPUVortex(dev *gpu.Device, params *Params, maxNRows int) (*GPUVortex, error) { + // Pin this thread to the chosen device before any allocation. CUDA's + // "current device" is per-OS-thread state; without this the pipeline's + // device buffers (multi-GB) silently land on device 0 even when + // `dev.DeviceID() == 1`, defeating multi-GPU. + if err := dev.Bind(); err != nil { + return nil, fmt.Errorf("vortex: Bind device %d: %w", dev.DeviceID(), err) + } + inner := params.inner + sisKey := inner.Key + degree := sisKey.Degree + halfDeg := degree / 2 + nCols := inner.NbColumns + rate := inner.ReedSolomonInvRate + + // ── Build SIS domain twiddle arrays ────────────────────────────────── + sisDom := sisKey.Domain + sisFwd := make([]koalabear.Element, halfDeg) + sisInv := make([]koalabear.Element, halfDeg) + sisFwd[0].SetOne() + sisInv[0].SetOne() + gen := sisDom.Generator + genInv := sisDom.GeneratorInv + for i := 1; i < halfDeg; i++ { + sisFwd[i].Mul(&sisFwd[i-1], &gen) + sisInv[i].Mul(&sisInv[i-1], &genInv) + } + + // ── Coset tables (SIS domain) ──────────────────────────────────────── + cosetTable, err := sisDom.CosetTable() + if err != nil { + return nil, fmt.Errorf("vortex: CosetTable: %w", err) + } + cosetTableInv, err := sisDom.CosetTableInv() + if err != nil { + return nil, fmt.Errorf("vortex: CosetTableInv: %w", err) + } + cardInvSIS := sisDom.CardinalityInv + cosetInv := make([]koalabear.Element, degree) + for j := 0; j < degree; j++ { + cosetInv[j].Mul(&cosetTableInv[j], &cardInvSIS) + } + + // ── Flatten SIS Ag keys ────────────────────────────────────────────── + nPolys := len(sisKey.Ag) + agFlat := make([]koalabear.Element, nPolys*degree) + for i := 0; i < nPolys; i++ { + copy(agFlat[i*degree:(i+1)*degree], sisKey.Ag[i]) + } + + // ── Init SIS on GPU ────────────────────────────────────────────────── + var sisHandle C.kb_sis_t + if err := kbError(C.kb_sis_init(devCtx(dev), + C.int(degree), C.int(nPolys), C.int(sisKey.LogTwoBound), + (*C.uint32_t)(unsafe.Pointer(&agFlat[0])), + (*C.uint32_t)(unsafe.Pointer(&sisFwd[0])), + (*C.uint32_t)(unsafe.Pointer(&sisInv[0])), + (*C.uint32_t)(unsafe.Pointer(&cosetTable[0])), + (*C.uint32_t)(unsafe.Pointer(&cosetInv[0])), + &sisHandle)); err != nil { + return nil, fmt.Errorf("vortex: kb_sis_init: %w", err) + } + + // ── Init Poseidon2 (sponge width=24 + compress width=16) ───────────── + const ( + rf = 6 + rp = 21 + ) + initP2 := func(width int) (C.kb_p2_t, error) { + p := poseidon2.NewParameters(width, rf, rp) + var flat []koalabear.Element + for _, rk := range p.RoundKeys { + flat = append(flat, rk...) + } + diag := poseidon2Diag(width) + var h C.kb_p2_t + if err := kbError(C.kb_p2_init(devCtx(dev), C.int(width), C.int(rf), C.int(rp), + (*C.uint32_t)(unsafe.Pointer(&flat[0])), + (*C.uint32_t)(unsafe.Pointer(&diag[0])), + &h)); err != nil { + return nil, err + } + return h, nil + } + + p2s, err := initP2(24) + if err != nil { + C.kb_sis_free(sisHandle) + return nil, fmt.Errorf("vortex: p2_sponge init: %w", err) + } + p2c, err := initP2(16) + if err != nil { + C.kb_sis_free(sisHandle) + C.kb_p2_free(p2s) + return nil, fmt.Errorf("vortex: p2_compress init: %w", err) + } + + // ── Build RS domain twiddle arrays ─────────────────────────────────── + rsDom := inner.Domains[0] + halfNC := nCols / 2 + rsFwd := make([]koalabear.Element, halfNC) + rsInv := make([]koalabear.Element, halfNC) + rsFwd[0].SetOne() + rsInv[0].SetOne() + rsGen := rsDom.Generator + rsGenInv := rsDom.GeneratorInv + for i := 1; i < halfNC; i++ { + rsFwd[i].Mul(&rsFwd[i-1], &rsGen) + rsInv[i].Mul(&rsInv[i-1], &rsGenInv) + } + + // ── Build scaled coset table: CosetTableBitReverse × cardinalityInv ─ + cosetBR := inner.CosetTableBitReverse + cardInvRS := rsDom.CardinalityInv + scaledCoset := make([]koalabear.Element, nCols) + for i := 0; i < nCols; i++ { + scaledCoset[i].Mul(&cosetBR[i], &cardInvRS) + } + + // ── Init pipeline ──────────────────────────────────────────────────── + sizeCodeWord := inner.SizeCodeWord() + treeNP := nextPow2u(sizeCodeWord) + treeNodes := 2*treeNP - 1 + + var pipeHandle C.kb_vortex_pipeline_t + if err := kbError(C.kb_vortex_pipeline_init(devCtx(dev), + sisHandle, p2s, p2c, + C.size_t(maxNRows), C.size_t(nCols), C.int(rate), + (*C.uint32_t)(unsafe.Pointer(&rsFwd[0])), + (*C.uint32_t)(unsafe.Pointer(&rsInv[0])), + (*C.uint32_t)(unsafe.Pointer(&scaledCoset[0])), + &pipeHandle)); err != nil { + C.kb_sis_free(sisHandle) + C.kb_p2_free(p2s) + C.kb_p2_free(p2c) + return nil, fmt.Errorf("vortex: pipeline init: %w", err) + } + + // ── Upload multi-coset scaling tables for rate > 2 ────────────── + // coset_k_br[j] = CosetTableBitReverse[j]^k × cardinalityInv + // Derived iteratively: table_k = table_{k-1} ⊙ CosetTableBitReverse + if rate > 2 { + cosetBR := inner.CosetTableBitReverse + nTables := rate - 1 + flat := make([]koalabear.Element, nTables*nCols) + // Table 1: cosetBR[j]^1 × cardInv + for j := 0; j < nCols; j++ { + flat[j].Mul(&cosetBR[j], &cardInvRS) + } + // Tables 2..rate-1: table_k[j] = table_{k-1}[j] × cosetBR[j] + for k := 2; k < rate; k++ { + prev := (k - 2) * nCols + cur := (k - 1) * nCols + for j := 0; j < nCols; j++ { + flat[cur+j].Mul(&flat[prev+j], &cosetBR[j]) + } + } + if err := kbError(C.kb_vortex_pipeline_set_coset_tables( + pipeHandle, + (*C.uint32_t)(unsafe.Pointer(&flat[0])), + C.size_t(nTables))); err != nil { + C.kb_vortex_pipeline_free(pipeHandle) + C.kb_sis_free(sisHandle) + C.kb_p2_free(p2s) + C.kb_p2_free(p2c) + return nil, fmt.Errorf("vortex: set_coset_tables: %w", err) + } + } + + // Wrap pinned host buffers as Go slices (zero-copy, page-locked DMA) + inputBuf := C.kb_vortex_pipeline_input_buf(pipeHandle) + treePtr := C.kb_vortex_pipeline_tree_buf(pipeHandle) + + gv := &GPUVortex{ + dev: dev, sis: sisHandle, p2s: p2s, p2c: p2c, + pipeline: pipeHandle, params: params, + inputBuf: unsafe.Slice((*koalabear.Element)(unsafe.Pointer(inputBuf)), maxNRows*nCols), + treeBuf: unsafe.Slice((*Hash)(unsafe.Pointer(treePtr)), treeNodes), + } + runtime.SetFinalizer(gv, (*GPUVortex).Free) + return gv, nil +} + +func (gv *GPUVortex) Free() { + gv.mu.Lock() + defer gv.mu.Unlock() + gv.freeLocked() +} + +func (gv *GPUVortex) freeLocked() { + if gv.pipeline != nil { + C.kb_vortex_pipeline_free(gv.pipeline) + gv.pipeline = nil + } + if gv.sis != nil { + C.kb_sis_free(gv.sis) + gv.sis = nil + } + if gv.p2s != nil { + C.kb_p2_free(gv.p2s) + gv.p2s = nil + } + if gv.p2c != nil { + C.kb_p2_free(gv.p2c) + gv.p2c = nil + } + gv.inputBuf = nil + gv.treeBuf = nil +} + +// Commit performs GPU-accelerated Vortex commit. +// +// Raw rows are copied to pinned memory and uploaded to GPU. RS encoding +// (batch NTT), SIS hashing, Poseidon2, and Merkle tree all run on device. +// The encoded matrix stays on GPU for Prove(). +// +// A new Commit() call invalidates any previously returned CommitState. +func (gv *GPUVortex) Commit(rows [][]koalabear.Element) (*CommitState, Hash, error) { + gv.mu.Lock() + defer gv.mu.Unlock() + if gv.pipeline == nil { + return nil, Hash{}, fmt.Errorf("vortex: GPUVortex pipeline is freed") + } + inner := gv.params.inner + nCols := inner.NbColumns + sizeCodeWord := inner.SizeCodeWord() + nRows := len(rows) + treeNP := nextPow2u(sizeCodeWord) + + // ── 1. Copy raw rows to pinned host buffer (parallel) ─────────────── + inputBuf := gv.inputBuf[:nRows*nCols] + parallel.Execute(nRows, func(start, stop int) { + for i := start; i < stop; i++ { + copy(inputBuf[i*nCols:(i+1)*nCols], rows[i]) + } + }) + + // ── 2. GPU pipeline: RS encode + SIS + sponge + Merkle ────────────── + inPtr := (*C.uint32_t)(unsafe.Pointer(&inputBuf[0])) + if err := kbError(C.kb_vortex_commit(gv.pipeline, inPtr, C.size_t(nRows))); err != nil { + return nil, Hash{}, fmt.Errorf("vortex: GPU commit: %w", err) + } + + // Tree is in pinned host buffer (SIS hashes stay on device only) + treeBuf := gv.treeBuf + + // ── 3. Reconstruct MerkleTree from flat heap buffer ───────────────── + depth := bits.Len(uint(treeNP)) - 1 + levels := make([][]Hash, depth+1) + for d := 0; d <= depth; d++ { + start := (1 << d) - 1 + end := (1 << (d + 1)) - 1 + levels[d] = treeBuf[start:end] + } + + root := levels[0][0] + + cs := &CommitState{ + pipeline: gv.pipeline, + params: inner, + nRows: nRows, + merkle: &refvortex.MerkleTree{Levels: levels}, + sizeCodeWord: sizeCodeWord, + } + + return cs, root, nil +} + +// CommitDirect writes rows directly to pinned memory via the loadRow callback, +// avoiding intermediate Go heap allocations. Each call to loadRow(i, dst) +// must fill dst[:nCols] with the i-th row's data. +func (gv *GPUVortex) CommitDirect(nRows int, loadRow func(i int, dst []koalabear.Element)) (*CommitState, Hash, error) { + gv.mu.Lock() + defer gv.mu.Unlock() + return gv.commitDirectLocked(nRows, loadRow) +} + +// CommitDirectAndThen commits rows and runs use while the shared pipeline is +// still locked. Use this for operations that read d_encoded_col immediately +// after commit, such as snapshotting, lincomb, or selected-column extraction. +func (gv *GPUVortex) CommitDirectAndThen( + nRows int, + loadRow func(i int, dst []koalabear.Element), + use func(*CommitState, Hash) error, +) error { + gv.mu.Lock() + defer gv.mu.Unlock() + + cs, root, err := gv.commitDirectLocked(nRows, loadRow) + if err != nil { + return err + } + return use(cs, root) +} + +func (gv *GPUVortex) commitDirectLocked( + nRows int, + loadRow func(i int, dst []koalabear.Element), +) (*CommitState, Hash, error) { + if gv.pipeline == nil { + return nil, Hash{}, fmt.Errorf("vortex: GPUVortex pipeline is freed") + } + inner := gv.params.inner + nCols := inner.NbColumns + sizeCodeWord := inner.SizeCodeWord() + treeNP := nextPow2u(sizeCodeWord) + + // ── 1. Write rows directly to pinned host buffer (parallel) ────── + inputBuf := gv.inputBuf[:nRows*nCols] + parallel.Execute(nRows, func(start, stop int) { + for i := start; i < stop; i++ { + loadRow(i, inputBuf[i*nCols:(i+1)*nCols]) + } + }) + + // ── 2. GPU pipeline: RS encode + SIS + sponge + Merkle ────────────── + inPtr := (*C.uint32_t)(unsafe.Pointer(&inputBuf[0])) + if err := kbError(C.kb_vortex_commit(gv.pipeline, inPtr, C.size_t(nRows))); err != nil { + return nil, Hash{}, fmt.Errorf("vortex: GPU commit: %w", err) + } + + // Tree is in pinned host buffer + treeBuf := gv.treeBuf + + // ── 3. Reconstruct MerkleTree from flat heap buffer ───────────────── + depth := bits.Len(uint(treeNP)) - 1 + levels := make([][]Hash, depth+1) + for d := 0; d <= depth; d++ { + start := (1 << d) - 1 + end := (1 << (d + 1)) - 1 + levels[d] = treeBuf[start:end] + } + + root := levels[0][0] + + cs := &CommitState{ + pipeline: gv.pipeline, + params: inner, + nRows: nRows, + merkle: &refvortex.MerkleTree{Levels: levels}, + sizeCodeWord: sizeCodeWord, + } + + return cs, root, nil +} + +// CommitAndExtract performs GPU-accelerated Vortex commit with overlapped D2H. +// +// Overlaps D2H transfer of the encoded matrix, SIS hashes, and leaf hashes +// with SIS/Poseidon2/Merkle computation on GPU (uses two CUDA streams). +// Returns all results needed for the drop-in replacement in a single call, +// avoiding sequential Extract calls after Commit. +// +// A new CommitAndExtract() call invalidates previously returned pinned buffers. +func (gv *GPUVortex) CommitAndExtract(rows [][]koalabear.Element) ( + encodedRows [][]koalabear.Element, + sisHashes []koalabear.Element, + leaves []Hash, + root Hash, + tree *refvortex.MerkleTree, + err error, +) { + gv.mu.Lock() + defer gv.mu.Unlock() + if gv.pipeline == nil { + err = fmt.Errorf("vortex: GPUVortex pipeline is freed") + return + } + inner := gv.params.inner + nCols := inner.NbColumns + sizeCodeWord := inner.SizeCodeWord() + nRows := len(rows) + treeNP := nextPow2u(sizeCodeWord) + degree := inner.Key.Degree + + // ── 1. Copy raw rows to pinned host buffer (parallel) ─────────────── + inputBuf := gv.inputBuf[:nRows*nCols] + parallel.Execute(nRows, func(start, stop int) { + for i := start; i < stop; i++ { + copy(inputBuf[i*nCols:(i+1)*nCols], rows[i]) + } + }) + + // ── 2. GPU pipeline: commit + overlapped D2H ───────────────────────── + inPtr := (*C.uint32_t)(unsafe.Pointer(&inputBuf[0])) + if err = kbError(C.kb_vortex_commit_and_extract(gv.pipeline, inPtr, C.size_t(nRows))); err != nil { + err = fmt.Errorf("vortex: GPU commit_and_extract: %w", err) + return + } + + // ── 3. Copy from pinned host buffers to Go-managed memory ──────────── + // Pinned buffers are reused on next Commit, so we must copy out. + // Encoded matrix uses parallel copy to saturate memory bandwidth. + + encPtr := C.kb_vortex_h_enc_pinned(gv.pipeline) + pinnedEnc := unsafe.Slice((*koalabear.Element)(unsafe.Pointer(encPtr)), nRows*sizeCodeWord) + encBacking := make([]koalabear.Element, nRows*sizeCodeWord) + { + total := nRows * sizeCodeWord + const chunk = 256 * 1024 // 256K elements = 1 MB per goroutine + numChunks := (total + chunk - 1) / chunk + parallel.Execute(numChunks, func(start, stop int) { + for c := start; c < stop; c++ { + off := c * chunk + end := off + chunk + if end > total { + end = total + } + copy(encBacking[off:end], pinnedEnc[off:end]) + } + }) + } + encodedRows = make([][]koalabear.Element, nRows) + for r := range encodedRows { + encodedRows[r] = encBacking[r*sizeCodeWord : (r+1)*sizeCodeWord] + } + + // SIS hashes: [scw × degree] — small, single copy. + sisPtr := C.kb_vortex_h_sis_pinned(gv.pipeline) + pinnedSIS := unsafe.Slice((*koalabear.Element)(unsafe.Pointer(sisPtr)), sizeCodeWord*degree) + sisHashes = make([]koalabear.Element, sizeCodeWord*degree) + copy(sisHashes, pinnedSIS) + + // Leaves: [scw] Hash — tiny, single copy. + leavesPtr := C.kb_vortex_h_leaves_pinned(gv.pipeline) + pinnedLeaves := unsafe.Slice((*Hash)(unsafe.Pointer(leavesPtr)), sizeCodeWord) + leaves = make([]Hash, sizeCodeWord) + copy(leaves, pinnedLeaves) + + // ── 4. Reconstruct MerkleTree from flat heap buffer ────────────────── + treeBuf := gv.treeBuf + depth := bits.Len(uint(treeNP)) - 1 + levels := make([][]Hash, depth+1) + for d := 0; d <= depth; d++ { + start := (1 << d) - 1 + end := (1 << (d + 1)) - 1 + levels[d] = treeBuf[start:end] + } + root = levels[0][0] + tree = &refvortex.MerkleTree{Levels: levels} + + return +} + +// ───────────────────────────────────────────────────────────────────────────── +// CommitState + Prove — GPU lincomb + column extraction +// ───────────────────────────────────────────────────────────────────────────── + +// CommitState holds prover state after commit. +// For GPU commits, the encoded matrix stays on device; Prove extracts via D2H. +// For CPU commits (benchmark baseline), delegates to gnark-crypto's ProverState. +// +// Three GPU storage modes: +// 1. pipeline set, encodedGPU nil — device-resident in shared pipeline (single-use only) +// 2. pipeline nil, encodedGPU set — device-resident in per-round snapshot (safe across rounds) +// 3. pipeline nil, encodedMatrix set — host-resident CPU fallback +type CommitState struct { + pipeline C.kb_vortex_pipeline_t // nil for CPU commits and snapshots + params *refvortex.Params + nRows int + merkle *refvortex.MerkleTree + sizeCodeWord int + cpuState *refvortex.ProverState // non-nil for CPU Commit() baseline + encodedMatrix []smartvectors.SmartVector // non-nil for CPU CommitSIS fallback + encodedGPU *KBVector // per-round snapshot: [scw × nRows] column-major + dev *gpu.Device // device handle for snapshot operations +} + +// NRows returns the number of rows in this commit. +func (cs *CommitState) NRows() int { return cs.nRows } + +// SnapshotEncoded copies the pipeline's device-resident encoded matrix to a +// per-round GPU buffer (D2D copy). This decouples this round's data from the +// shared pipeline, which will be overwritten by subsequent CommitDirect calls. +// +// After snapshot, LinComb and ExtractColumns use the per-round buffer. +// The pipeline reference is cleared to prevent accidental stale access. +func (cs *CommitState) SnapshotEncoded(dev *gpu.Device) error { + if cs.pipeline == nil { + return fmt.Errorf("vortex: SnapshotEncoded: no pipeline") + } + scw := cs.sizeCodeWord + nRows := cs.nRows + total := scw * nRows + + // Get raw device pointer to pipeline's column-major encoded matrix + srcPtr := C.kb_vortex_encoded_device_ptr(cs.pipeline) + if srcPtr == nil { + return fmt.Errorf("vortex: SnapshotEncoded: null device pointer") + } + + // Allocate per-round device buffer and D2D copy + buf, err := NewKBVector(dev, total) + if err != nil { + return fmt.Errorf("vortex: SnapshotEncoded: alloc: %w", err) + } + D2DRaw(dev, buf.DevicePtr(), unsafe.Pointer(srcPtr), total) + Sync(dev) + + cs.encodedGPU = buf + cs.dev = dev + cs.pipeline = nil // detach from shared pipeline + return nil +} + +// FreeGPU releases GPU-resident memory immediately rather than waiting for GC. +// After this call, the CommitState falls back to CPU for any remaining operations. +func (cs *CommitState) FreeGPU() { + if cs.encodedGPU != nil { + cs.encodedGPU.Free() + cs.encodedGPU = nil + } + cs.pipeline = nil +} + +// IsDeviceResident reports whether this state is backed by GPU-resident data. +func (cs *CommitState) IsDeviceResident() bool { + return cs.encodedGPU != nil || (cs.pipeline != nil && cs.encodedMatrix == nil) +} + +// GetEncodedMatrix returns the host-side encoded matrix as SmartVectors. +// For GPU commits, extracts from device (full D2H). For CPU fallbacks, +// returns the stored host matrix directly. +func (cs *CommitState) GetEncodedMatrix() []smartvectors.SmartVector { + if cs.encodedMatrix != nil { + return cs.encodedMatrix + } + if cs.encodedGPU != nil { + return cs.snapshotToEncodedMatrix() + } + if cs.pipeline == nil { + return nil + } + rows, err := cs.ExtractAllRows() + if err != nil { + panic("vortex: GetEncodedMatrix: " + err.Error()) + } + em := make([]smartvectors.SmartVector, len(rows)) + for i, row := range rows { + em[i] = smartvectors.NewRegular(row) + } + return em +} + +// snapshotToEncodedMatrix downloads a column-major GPU snapshot to host and +// converts to row-major SmartVectors. Used by GetEncodedMatrix for recursion. +func (cs *CommitState) snapshotToEncodedMatrix() []smartvectors.SmartVector { + scw := cs.sizeCodeWord + nRows := cs.nRows + colMajor := make([]koalabear.Element, scw*nRows) + cs.encodedGPU.CopyToHost(colMajor) + Sync(cs.dev) + em := make([]smartvectors.SmartVector, nRows) + for i := range em { + row := make([]koalabear.Element, scw) + for j := 0; j < scw; j++ { + row[j] = colMajor[j*nRows+i] + } + em[i] = smartvectors.NewRegular(row) + } + return em +} + +// Prove generates a Vortex opening proof. +// +// Linear combination (UAlpha) is computed on GPU using the device-resident +// encoded matrix. Opened columns are extracted via small D2H transfers. +// Merkle proofs are computed on the host tree buffer. +func (cs *CommitState) Prove(alpha fext.E4, selectedCols []int) (*Proof, error) { + // CPU fallback path (for Params.Commit baseline) + if cs.cpuState != nil { + cs.cpuState.OpenLinComb(alpha) + vp, err := cs.cpuState.OpenColumns(selectedCols) + if err != nil { + return nil, err + } + return &Proof{ + UAlpha: vp.UAlpha, + Columns: vp.OpenedColumns, + MerkleProofs: vp.MerkleProofOpenedColumns, + }, nil + } + + scw := cs.sizeCodeWord + nRows := cs.nRows + + // ── 1. GPU linear combination: UAlpha[j] = Σᵢ αⁱ · encoded[j][i] ── + alphaRaw := [4]C.uint32_t{ + C.uint32_t(alpha.B0.A0[0]), + C.uint32_t(alpha.B0.A1[0]), + C.uint32_t(alpha.B1.A0[0]), + C.uint32_t(alpha.B1.A1[0]), + } + uAlpha := make([]fext.E4, scw) + if err := kbError(C.kb_vortex_lincomb(cs.pipeline, + C.size_t(nRows), &alphaRaw[0], + (*C.uint32_t)(unsafe.Pointer(&uAlpha[0])))); err != nil { + return nil, fmt.Errorf("vortex: GPU lincomb: %w", err) + } + + // ── 2. Extract opened columns from GPU ────────────────────────────── + columns := make([][]koalabear.Element, len(selectedCols)) + for i, c := range selectedCols { + col := make([]koalabear.Element, nRows) + if err := kbError(C.kb_vortex_extract_col(cs.pipeline, + C.size_t(nRows), C.int(c), + (*C.uint32_t)(unsafe.Pointer(&col[0])))); err != nil { + return nil, fmt.Errorf("vortex: extract col %d: %w", c, err) + } + columns[i] = col + } + + // ── 3. Merkle proofs from host tree buffer ────────────────────────── + merkleProofs := make([]MerkleProof, len(selectedCols)) + for i, c := range selectedCols { + merkleProofs[i] = merkleProve(cs.merkle, c) + } + + return &Proof{ + UAlpha: uAlpha, + Columns: columns, + MerkleProofs: merkleProofs, + }, nil +} + +// LinComb computes UAlpha[j] = Σᵢ αⁱ · encoded[i][j]. +// +// GPU path: single kernel call on device-resident column-major matrix. +// CPU fallback: iterates encodedMatrix SmartVectors on host. +func (cs *CommitState) LinComb(alpha fext.E4) ([]fext.E4, error) { + if cs.encodedGPU != nil { + // Per-round GPU snapshot: use standalone lincomb kernel + alphaRaw := [4]C.uint32_t{ + C.uint32_t(alpha.B0.A0[0]), + C.uint32_t(alpha.B0.A1[0]), + C.uint32_t(alpha.B1.A0[0]), + C.uint32_t(alpha.B1.A1[0]), + } + uAlpha := make([]fext.E4, cs.sizeCodeWord) + if err := kbError(C.kb_lincomb_e4_colmajor(devCtx(cs.dev), + (*C.uint32_t)(cs.encodedGPU.DevicePtr()), + C.size_t(cs.nRows), C.size_t(cs.sizeCodeWord), + &alphaRaw[0], + (*C.uint32_t)(unsafe.Pointer(&uAlpha[0])))); err != nil { + return nil, fmt.Errorf("vortex: GPU snapshot lincomb: %w", err) + } + return uAlpha, nil + } + if cs.pipeline != nil { + alphaRaw := [4]C.uint32_t{ + C.uint32_t(alpha.B0.A0[0]), + C.uint32_t(alpha.B0.A1[0]), + C.uint32_t(alpha.B1.A0[0]), + C.uint32_t(alpha.B1.A1[0]), + } + uAlpha := make([]fext.E4, cs.sizeCodeWord) + if err := kbError(C.kb_vortex_lincomb(cs.pipeline, + C.size_t(cs.nRows), &alphaRaw[0], + (*C.uint32_t)(unsafe.Pointer(&uAlpha[0])))); err != nil { + return nil, fmt.Errorf("vortex: GPU lincomb: %w", err) + } + return uAlpha, nil + } + if cs.encodedMatrix != nil { + return linCombCPU(cs.encodedMatrix, alpha), nil + } + return nil, fmt.Errorf("vortex: CommitState has neither GPU pipeline nor encodedMatrix") +} + +// linCombCPU computes UAlpha[j] = Σᵢ αⁱ · rows[i].Get(j) on CPU. +func linCombCPU(rows []smartvectors.SmartVector, alpha fext.E4) []fext.E4 { + n := rows[0].Len() + result := make([]fext.E4, n) + var pow fext.E4 + pow.SetOne() + for _, row := range rows { + for j := 0; j < n; j++ { + v := row.Get(j) + var term fext.E4 + term.B0.A0 = v + term.Mul(&term, &pow) + result[j].Add(&result[j], &term) + } + pow.Mul(&pow, &alpha) + } + return result +} + +// ExtractColumns extracts selected columns from the encoded matrix. +// +// GPU path: small D2H per column from device-resident column-major matrix. +// CPU fallback: gathers from encodedMatrix SmartVectors on host. +// Returns columns[i][row] for each selectedCols[i], row 0..nRows-1. +func (cs *CommitState) ExtractColumns(selectedCols []int) ([][]koalabear.Element, error) { + if cs.encodedGPU != nil { + // Per-round snapshot: D2H from column-major buffer at offset col*nRows + columns := make([][]koalabear.Element, len(selectedCols)) + for i, c := range selectedCols { + col := make([]koalabear.Element, cs.nRows) + srcOff := unsafe.Add(cs.encodedGPU.DevicePtr(), c*cs.nRows*4) // 4 bytes per uint32 + D2HRaw(cs.dev, col, srcOff, cs.nRows) + columns[i] = col + } + return columns, nil + } + if cs.pipeline != nil { + columns := make([][]koalabear.Element, len(selectedCols)) + for i, c := range selectedCols { + col := make([]koalabear.Element, cs.nRows) + if err := kbError(C.kb_vortex_extract_col(cs.pipeline, + C.size_t(cs.nRows), C.int(c), + (*C.uint32_t)(unsafe.Pointer(&col[0])))); err != nil { + return nil, fmt.Errorf("vortex: extract col %d: %w", c, err) + } + columns[i] = col + } + return columns, nil + } + if cs.encodedMatrix != nil { + return extractColumnsCPU(cs.encodedMatrix, selectedCols), nil + } + return nil, fmt.Errorf("vortex: CommitState has neither GPU pipeline nor encodedMatrix") +} + +// extractColumnsCPU gathers selected columns from host-side SmartVectors. +func extractColumnsCPU(rows []smartvectors.SmartVector, selectedCols []int) [][]koalabear.Element { + columns := make([][]koalabear.Element, len(selectedCols)) + for i, c := range selectedCols { + col := make([]koalabear.Element, len(rows)) + for r, row := range rows { + col[r] = row.Get(c) + } + columns[i] = col + } + return columns +} + +// ExtractAllRows downloads the full GPU encoded matrix and returns it as +// row-major [][]koalabear.Element (one slice per row, length sizeCodeWord). +// The GPU stores column-major, so this transposes during extraction. +func (cs *CommitState) ExtractAllRows() ([][]koalabear.Element, error) { + if cs.pipeline == nil { + return nil, fmt.Errorf("vortex: no GPU pipeline in CommitState") + } + scw := cs.sizeCodeWord + nRows := cs.nRows + + // D2H: row-major flat buffer [nRows × scw], transposed on GPU. + // Single contiguous allocation reduces GC pressure. + rowMajor := make([]koalabear.Element, nRows*scw) + if err := kbError(C.kb_vortex_extract_all_rowmajor(cs.pipeline, + C.size_t(nRows), + (*C.uint32_t)(unsafe.Pointer(&rowMajor[0])))); err != nil { + return nil, fmt.Errorf("vortex: extract all rowmajor: %w", err) + } + + // Slice the contiguous buffer into per-row slices (no copy). + rows := make([][]koalabear.Element, nRows) + for r := range rows { + rows[r] = rowMajor[r*scw : (r+1)*scw] + } + return rows, nil +} + +// MerkleTree returns the reconstructed Merkle tree from the GPU commit. +func (cs *CommitState) MerkleTree() *refvortex.MerkleTree { + return cs.merkle +} + +// ExtractSISHashes returns the SIS column hashes already transferred to +// the pipeline's pinned host buffer during kb_vortex_commit (overlapped with +// Merkle tree construction). +func (cs *CommitState) ExtractSISHashes() ([]koalabear.Element, error) { + if cs.pipeline == nil { + return nil, fmt.Errorf("vortex: no GPU pipeline in CommitState") + } + scw := cs.sizeCodeWord + degree := int(C.kb_vortex_degree(cs.pipeline)) + n := scw * degree + + // SIS hashes were already D2H'd to h_sis_pinned during commit. + sisPtr := C.kb_vortex_h_sis_pinned(cs.pipeline) + if sisPtr == nil { + return nil, fmt.Errorf("vortex: h_sis_pinned is nil") + } + + // Keep the result in Go-managed memory. This can be GB-scale for production + // self-recursion, so it must remain visible to the GC and GOMEMLIMIT. + nbytes := C.size_t(n) * 4 + out := make([]koalabear.Element, n) + if n == 0 { + return out, nil + } + C.memcpy(unsafe.Pointer(&out[0]), unsafe.Pointer(sisPtr), nbytes) + return out, nil +} + +// ExtractLeaves extracts the Poseidon2 leaf hashes from GPU to host. +// Returns []Hash (field.Octuplet) of length sizeCodeWord. +func (cs *CommitState) ExtractLeaves() ([]Hash, error) { + if cs.pipeline == nil { + return nil, fmt.Errorf("vortex: no GPU pipeline in CommitState") + } + scw := cs.sizeCodeWord + out := make([]Hash, scw) + if err := kbError(C.kb_vortex_extract_leaves(cs.pipeline, + (*C.uint32_t)(unsafe.Pointer(&out[0])))); err != nil { + return nil, fmt.Errorf("vortex: extract leaves: %w", err) + } + return out, nil +} + +// merkleProve computes a Merkle inclusion proof for column colIdx. +// The leaf is: Poseidon2_sponge(SIS_hash[colIdx]). +// Tree layout: heap array, root at level 0 (single entry). +func merkleProve(tree *refvortex.MerkleTree, colIdx int) MerkleProof { + // The tree has depth levels: levels[0]=[root], levels[d]=[2^d hashes]. + // Leaf index = colIdx at the bottom level. Walk up collecting siblings. + depth := len(tree.Levels) - 1 + proof := make(MerkleProof, depth) + idx := colIdx + for d := depth; d > 0; d-- { + // Sibling index at this level + sibling := idx ^ 1 + if sibling < len(tree.Levels[d]) { + proof[depth-d] = tree.Levels[d][sibling] + } + idx >>= 1 + } + return proof +} + +// ───────────────────────────────────────────────────────────────────────────── +// CPU Commit (used as benchmark baseline and for tests without GPU) +// ───────────────────────────────────────────────────────────────────────────── + +func (p *Params) Commit(rows [][]koalabear.Element) (*CommitState, Hash, error) { + ps, err := refvortex.Commit(p.inner, rows) + if err != nil { + return nil, Hash{}, err + } + root := ps.GetCommitment() + // Wrap in GPU CommitState — lincomb/column extraction will fall back to CPU + return &CommitState{ + params: p.inner, + nRows: len(rows), + merkle: ps.MerkleTree, + sizeCodeWord: p.inner.SizeCodeWord(), + cpuState: ps, + }, root, nil +} + +// ───────────────────────────────────────────────────────────────────────────── +// Helpers +// ───────────────────────────────────────────────────────────────────────────── + +// poseidon2Diag returns the internal MDS diagonal for Poseidon2 (width 16 or 24). +// Values match gnark-crypto's poseidon2/hash.go init() — SetUint64 converts to Montgomery. +func poseidon2Diag(width int) []koalabear.Element { + var vals []uint64 + switch width { + case 16: + vals = []uint64{ + 2130706431, 1, 2, 1065353217, 3, 4, 1065353216, 2130706430, + 2130706429, 2122383361, 1864368129, 2130706306, + 8323072, 266338304, 133169152, 127, + } + case 24: + vals = []uint64{ + 2130706431, 1, 2, 1065353217, 3, 4, 1065353216, 2130706430, + 2130706429, 2122383361, 1598029825, 1864368129, + 1997537281, 2064121857, 2097414145, 2130706306, + 8323072, 266338304, 133169152, 66584576, + 33292288, 16646144, 4161536, 127, + } + default: + panic(fmt.Sprintf("vortex: unsupported Poseidon2 width %d", width)) + } + diag := make([]koalabear.Element, len(vals)) + for i, v := range vals { + diag[i].SetUint64(v) + } + return diag +} + +func nextPow2u(n int) int { + v := 1 + for v < n { + v <<= 1 + } + return v +} + +func kbError(code C.kb_error_t) error { + switch code { + case C.KB_SUCCESS: + return nil + case C.KB_ERROR_CUDA: + return fmt.Errorf("vortex: CUDA error") + case C.KB_ERROR_INVALID: + return fmt.Errorf("vortex: invalid argument") + case C.KB_ERROR_OOM: + return fmt.Errorf("vortex: out of GPU memory") + case C.KB_ERROR_SIZE: + return fmt.Errorf("vortex: size mismatch") + default: + return fmt.Errorf("vortex: unknown error %d", int(code)) + } +} + +func must(code C.kb_error_t) { + if err := kbError(code); err != nil { + panic(err) + } +} diff --git a/prover/gpu/vortex/gpu_test.go b/prover/gpu/vortex/gpu_test.go new file mode 100644 index 00000000000..3c6f572df77 --- /dev/null +++ b/prover/gpu/vortex/gpu_test.go @@ -0,0 +1,531 @@ +//go:build cuda + +package vortex + +import ( + "math/rand/v2" + "testing" + "time" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/consensys/gnark-crypto/field/koalabear/fft" + "github.com/consensys/gnark-crypto/field/koalabear/sis" + "github.com/consensys/linea-monorepo/prover/gpu" + "github.com/stretchr/testify/require" +) + +func newTestDevice(t *testing.T) *gpu.Device { + t.Helper() + dev, err := gpu.New() + require.NoError(t, err) + t.Cleanup(func() { dev.Close() }) + return dev +} + +// ─── GPU Poseidon2 compress ────────────────────────────────────────────────── + +func TestGPUPoseidon2Compress(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{50})) + dev := newTestDevice(t) + + p2, err := NewGPUPoseidon2(dev, 16) + assert.NoError(err) + defer p2.Free() + + var a, b Hash + for j := 0; j < 8; j++ { + a[j] = randKB(rng) + b[j] = randKB(rng) + } + + // CPU reference + cpuHash := CompressPoseidon2(a, b) + + // GPU: pack input as [left[8] || right[8]] = 16 koalabear elements + input := make([]koalabear.Element, 16) + copy(input[:8], a[:]) + copy(input[8:], b[:]) + gpuHashes := p2.CompressBatch(input, 1) + + assert.Equal(cpuHash, gpuHashes[0], "GPU Poseidon2 compress mismatch") +} + +// ─── GPU Vortex commit: RS encoding + leaf hash correctness ───────────────── + +func TestGPUVortexCommit(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{42})) + dev := newTestDevice(t) + + nCols := 32 + nRows := 16 + rate := 2 + nSelected := 8 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + + // GPU commit + gv, err := NewGPUVortex(dev, params, nRows) + assert.NoError(err) + defer gv.Free() + + cs, _, err := gv.Commit(m) + assert.NoError(err) + + // Compare RS encoding + gpuRows, err := cs.ExtractAllRows() + assert.NoError(err) + for i := range m { + cpuRow := make([]koalabear.Element, nCols*rate) + params.EncodeReedSolomon(m[i], cpuRow) + for j := range cpuRow { + assert.Equal(cpuRow[j], gpuRows[i][j], "encoded[%d][%d]", i, j) + } + } + + // Compare leaves (GPU MD hash vs CPU CompressPoseidon2x16) + gpuSIS, err := cs.ExtractSISHashes() + assert.NoError(err) + gpuLeaves, err := cs.ExtractLeaves() + assert.NoError(err) + + scw := nCols * rate + degree := sisParams.Degree + cpuLeaves := make([]Hash, scw) + n16 := scw / 16 + for c := 0; c < n16; c++ { + start := c * 16 * degree + CompressPoseidon2x16(gpuSIS[start:start+16*degree], degree, cpuLeaves[c*16:(c+1)*16]) + } + for i := 0; i < scw; i++ { + assert.Equal(cpuLeaves[i], gpuLeaves[i], "leaf[%d]", i) + } +} + +// ─── GPU Vortex linear combination + column extraction ────────────────────── + +func TestGPUVortexLinComb(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{77})) + dev := newTestDevice(t) + + nCols := 32 + nRows := 16 + rate := 2 + nSelected := 4 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + alpha := randE4(rng) + selectedCols := []int{0, 1, 2, 3} + + gv, err := NewGPUVortex(dev, params, nRows) + assert.NoError(err) + defer gv.Free() + + cs, _, err := gv.Commit(m) + assert.NoError(err) + + // GPU linear combination + uAlpha, err := cs.LinComb(alpha) + assert.NoError(err) + assert.Equal(nCols*rate, len(uAlpha), "UAlpha length") + + // Column extraction + cols, err := cs.ExtractColumns(selectedCols) + assert.NoError(err) + assert.Equal(len(selectedCols), len(cols), "column count") + + // Cross-check: extracted columns against full matrix + allRows, err := cs.ExtractAllRows() + assert.NoError(err) + for i, c := range selectedCols { + for row := 0; row < nRows; row++ { + assert.Equal(allRows[row][c], cols[i][row], "col[%d] row[%d]", c, row) + } + } +} + +// ─── GPU E4 NTT ───────────────────────────────────────────────────────────── + +func TestGPUFFTE4(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{88})) + dev := newTestDevice(t) + + const n = 1 << 14 // 16K E4 elements + + // Random E4 input + data := make([]fext.E4, n) + for i := range data { + data[i] = randE4(rng) + } + + // CPU reference: forward FFT (DIF) + cpuData := make([]fext.E4, n) + copy(cpuData, data) + cpuDomain := fft.NewDomain(uint64(n)) + cpuDomain.FFTExt(cpuData, fft.DIF) + + // GPU forward FFT + gpuData := make([]fext.E4, n) + copy(gpuData, data) + dom, err := NewGPUFFTDomain(dev, n) + assert.NoError(err) + defer dom.Free() + + dom.FFTE4(gpuData) + + // Compare + mismatches := 0 + for i := range cpuData { + if cpuData[i] != gpuData[i] { + mismatches++ + if mismatches <= 5 { + t.Errorf("FFTE4 mismatch at %d: cpu=%v gpu=%v", i, cpuData[i], gpuData[i]) + } + } + } + assert.Equal(0, mismatches, "FFTE4 total mismatches") +} + +func TestGPUFFTE4Roundtrip(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{89})) + dev := newTestDevice(t) + + const n = 1 << 14 + + original := make([]fext.E4, n) + for i := range original { + original[i] = randE4(rng) + } + + dom, err := NewGPUFFTDomain(dev, n) + assert.NoError(err) + defer dom.Free() + + // Forward then inverse + data := make([]fext.E4, n) + copy(data, original) + dom.FFTE4(data) + dom.FFTInverseE4(data) + + // Scale by 1/n (GPU IFFT does not include this) + var nInv koalabear.Element + nInv.SetUint64(uint64(n)) + nInv.Inverse(&nInv) + for i := range data { + data[i].B0.A0.Mul(&data[i].B0.A0, &nInv) + data[i].B0.A1.Mul(&data[i].B0.A1, &nInv) + data[i].B1.A0.Mul(&data[i].B1.A0, &nInv) + data[i].B1.A1.Mul(&data[i].B1.A1, &nInv) + } + + mismatches := 0 + for i := range original { + if original[i] != data[i] { + mismatches++ + if mismatches <= 5 { + t.Errorf("E4 roundtrip mismatch at %d: orig=%v got=%v", i, original[i], data[i]) + } + } + } + assert.Equal(0, mismatches, "E4 roundtrip total mismatches") +} + +func TestGPUCosetFFTE4(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{90})) + dev := newTestDevice(t) + + const n = 1 << 14 + + data := make([]fext.E4, n) + for i := range data { + data[i] = randE4(rng) + } + + cpuDomain := fft.NewDomain(uint64(n)) + + // CPU reference: coset FFT (DIF, OnCoset) → bit-reversed output + cpuData := make([]fext.E4, n) + copy(cpuData, data) + cpuDomain.FFTExt(cpuData, fft.DIF, fft.OnCoset()) + // CPU DIF output is bit-reversed; GPU CosetFFT output is also bit-reversed + // (kb_ntt_coset_fwd = ScaleByPowers + DIF, no final bitrev) + + // GPU coset FFT → bit-reversed output (ScaleByPowers + DIF) + gpuData := make([]fext.E4, n) + copy(gpuData, data) + dom, err := NewGPUFFTDomain(dev, n) + assert.NoError(err) + defer dom.Free() + + dom.CosetFFTE4(gpuData, cpuDomain.FrMultiplicativeGen) + + mismatches := 0 + for i := range cpuData { + if cpuData[i] != gpuData[i] { + mismatches++ + if mismatches <= 5 { + t.Errorf("CosetFFTE4 mismatch at %d: cpu=%v gpu=%v", i, cpuData[i], gpuData[i]) + } + } + } + assert.Equal(0, mismatches, "CosetFFTE4 total mismatches") +} + +// bitReverseE4 applies bit-reversal permutation on a slice of E4 elements. +func bitReverseE4(a []fext.E4) { + n := len(a) + nn := uint64(n) + logN := uint64(0) + for 1<>= 1 + } + return r +} + +// ─── Benchmarks ───────────────────────────────────────────────────────────── + +func benchCommit(b *testing.B, nCols, nRows, rate int) { + dev, err := gpu.New() + if err != nil { + b.Fatal(err) + } + defer dev.Close() + + rng := rand.New(rand.NewChaCha8([32]byte{})) + nSelected := min(32, nCols*rate/4) + + sisParams, _ := sis.NewRSis(0, 9, 16, nRows) + params, _ := NewParams(nCols, nRows, sisParams, rate, nSelected) + m := randMatrix(rng, nRows, nCols) + inputBytes := int64(nCols * nRows * 4) // KoalaBear is uint32 (4 bytes) + + gv, err := NewGPUVortex(dev, params, nRows) + if err != nil { + b.Fatal(err) + } + defer gv.Free() + + b.Run("GPU", func(b *testing.B) { + b.SetBytes(inputBytes) + for i := 0; i < b.N; i++ { + if _, _, err := gv.Commit(m); err != nil { + b.Fatal(err) + } + } + }) + b.Run("CPU", func(b *testing.B) { + b.SetBytes(inputBytes) + for i := 0; i < b.N; i++ { + if _, _, err := params.Commit(m); err != nil { + b.Fatal(err) + } + } + }) + b.Run("GPU_vs_CPU", func(b *testing.B) { + var gpuTotal, cpuTotal time.Duration + for i := 0; i < b.N; i++ { + start := time.Now() + if _, _, err := gv.Commit(m); err != nil { + b.Fatal(err) + } + gpuTotal += time.Since(start) + + start = time.Now() + if _, _, err := params.Commit(m); err != nil { + b.Fatal(err) + } + cpuTotal += time.Since(start) + } + + gpuMBps := toMBps(inputBytes, gpuTotal/time.Duration(b.N)) + cpuMBps := toMBps(inputBytes, cpuTotal/time.Duration(b.N)) + b.ReportMetric(gpuMBps, "gpu_MB/s") + b.ReportMetric(cpuMBps, "cpu_MB/s") + if cpuMBps > 0 { + b.ReportMetric(gpuMBps/cpuMBps, "gpu_vs_cpu_x") + } + }) +} + +func toMBps(bytes int64, d time.Duration) float64 { + if d <= 0 { + return 0 + } + return float64(bytes) / d.Seconds() / (1024 * 1024) +} + +func BenchmarkCommit_1024x128(b *testing.B) { benchCommit(b, 1024, 128, 2) } +func BenchmarkCommit_4096x256(b *testing.B) { benchCommit(b, 4096, 256, 2) } +func BenchmarkCommit_16384x256(b *testing.B) { benchCommit(b, 16384, 256, 2) } +func BenchmarkCommit_524288x2048(b *testing.B) { benchCommit(b, 1<<19, 1<<11, 2) } // ~1B cells +func BenchmarkCommit_1048576x4096(b *testing.B) { benchCommit(b, 1<<20, 1<<12, 2) } // ~4B cells + +// ─── Rate 16: GPU RS encode correctness ────────────────────────────────────── +// +// gnark-crypto's NewParams rejects rate > 8, so we validate GPU RS encoding +// against CPU by doing a full commit + verify roundtrip (which exercises the +// same RS encode → SIS → Poseidon2 → Merkle → lincomb → column extraction +// pipeline with the higher rate). + +func TestGPUVortexCommitRate16(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{44})) + dev := newTestDevice(t) + + nCols := 64 + nRows := 16 + rate := 16 + nSelected := 8 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + + // GPU commit + gv, err := NewGPUVortex(dev, params, nRows) + assert.NoError(err) + defer gv.Free() + + cs, _, err := gv.Commit(m) + assert.NoError(err) + + // Verify RS encoding: CPU encode each row and compare with GPU + gpuRows, err := cs.ExtractAllRows() + assert.NoError(err) + assert.Equal(nRows, len(gpuRows)) + assert.Equal(nCols*rate, len(gpuRows[0])) + + for i := range m { + cpuRow := make([]koalabear.Element, nCols*rate) + params.EncodeReedSolomon(m[i], cpuRow) + for j := range cpuRow { + assert.Equal(cpuRow[j], gpuRows[i][j], "rate16 encoded[%d][%d]", i, j) + } + } + + // Compare leaves: GPU MD hash vs CPU CompressPoseidon2x16 + gpuSIS, err := cs.ExtractSISHashes() + assert.NoError(err) + gpuLeaves, err := cs.ExtractLeaves() + assert.NoError(err) + + scw := nCols * rate + degree := sisParams.Degree + cpuLeaves := make([]Hash, scw) + n16 := scw / 16 + for c := 0; c < n16; c++ { + start := c * 16 * degree + CompressPoseidon2x16(gpuSIS[start:start+16*degree], degree, cpuLeaves[c*16:(c+1)*16]) + } + for i := 0; i < scw; i++ { + assert.Equal(cpuLeaves[i], gpuLeaves[i], "rate16 leaf[%d]", i) + } + + // Note: params.Commit() uses gnark-crypto's BuildMerkleTree (single CompressPoseidon2), + // while the GPU uses smt_koalabear's hashLR (2-block MD). Roots differ by design. + // Full GPU-vs-CPU root comparison is done in TestCommitMerkleWithSIS_GPUvsCPU_Rate16. + + // GPU lincomb + column extraction + alpha := randE4(rng) + selectedCols := make([]int, nSelected) + for i := range selectedCols { + selectedCols[i] = rng.IntN(nCols*rate - 1) + } + + gpuProof, err := cs.Prove(alpha, selectedCols) + assert.NoError(err) + assert.Equal(nCols*rate, len(gpuProof.UAlpha)) +} + +func TestGPUVortexLinCombRate16(t *testing.T) { + assert := require.New(t) + rng := rand.New(rand.NewChaCha8([32]byte{45})) + dev := newTestDevice(t) + + nCols := 32 + nRows := 8 + rate := 16 + nSelected := 4 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + alpha := randE4(rng) + + gv, err := NewGPUVortex(dev, params, nRows) + assert.NoError(err) + defer gv.Free() + + cs, _, err := gv.Commit(m) + assert.NoError(err) + + // GPU lincomb + gpuUAlpha, err := cs.LinComb(alpha) + assert.NoError(err) + assert.Equal(nCols*rate, len(gpuUAlpha)) + + // CPU reference lincomb from the GPU-extracted encoded rows + gpuRows, err := cs.ExtractAllRows() + assert.NoError(err) + + cpuUAlpha := make([]fext.E4, nCols*rate) + var pow fext.E4 + pow.SetOne() + for _, row := range gpuRows { + for j := range row { + var term fext.E4 + term.B0.A0 = row[j] + term.Mul(&term, &pow) + cpuUAlpha[j].Add(&cpuUAlpha[j], &term) + } + pow.Mul(&pow, &alpha) + } + + for j := range cpuUAlpha { + assert.Equal(cpuUAlpha[j], gpuUAlpha[j], "rate16 UAlpha[%d]", j) + } +} + +func BenchmarkCommit_64x16_rate16(b *testing.B) { benchCommit(b, 64, 16, 16) } +func BenchmarkCommit_256x64_rate16(b *testing.B) { benchCommit(b, 256, 64, 16) } +func BenchmarkCommit_1024x128_rate16(b *testing.B) { benchCommit(b, 1024, 128, 16) } diff --git a/prover/gpu/vortex/pinned_cache.go b/prover/gpu/vortex/pinned_cache.go new file mode 100644 index 00000000000..cc3c661e09e --- /dev/null +++ b/prover/gpu/vortex/pinned_cache.go @@ -0,0 +1,98 @@ +// Reusable pinned host-buffer cache. +// +// Why this exists +// ─────────────── +// `cudaMallocHost` is one of the most expensive CUDA APIs — at ~64 MiB +// it costs several milliseconds, at hundreds of MiB it can be 10-50+ ms. +// Callers that allocate a fresh pinned buffer on every frame +// (gpu/quotient.RunGPU) burn this cost every prover step. +// +// Profiling observation that motivated this: +// +// gpu/quotient TIMING @ n=2^20, 16 base roots: +// pack=19.6 ms ← 64 MiB cudaMallocHost + parallel host copy +// ifft=1.1 ms (GPU) +// symEval kernel=2.7 ms (GPU) +// +// The actual GPU work is ~5 ms. The 19.6 ms "pack" is dominated by the +// fresh pinned alloc, not the host-side copy. Caching the pinned buffer +// per (device-ID, capacity) eliminates that cost on the second and +// subsequent calls. +// +// Lifecycle +// ───────── +// - GetPinned(deviceID, n) returns a slice of at least n elements. +// The buffer is sticky to the (deviceID, capacity) pair. +// - The returned slice is the cached buffer at its original capacity; +// callers should slice it down to their needed length. +// - ReleasePinnedCache(deviceID) frees all cached buffers for that +// device. Pass deviceID < 0 to clear every cached buffer. +// +// Thread safety +// ───────────── +// Concurrent goroutines on different devices serialise on the cache +// mutex but otherwise run in parallel. A goroutine that calls +// GetPinned with the same (deviceID, capacity) twice will get the SAME +// buffer back — callers that need two simultaneous buffers must use +// distinct sizes or fall back to AllocPinned. + +//go:build cuda + +package vortex + +import ( + "sync" + + "github.com/consensys/gnark-crypto/field/koalabear" +) + +type pinnedKey struct { + deviceID int + capacity int // element count, not bytes +} + +var ( + pinnedMu sync.Mutex + pinnedCache = map[pinnedKey][]koalabear.Element{} +) + +// GetPinned returns a pinned host buffer of at least n koalabear.Element +// (4 bytes each), backed by cudaMallocHost so it can be H2D'd via +// cudaMemcpyAsync without a staging copy. Slice it down to your real +// length before use. +// +// Buffers are cached per (deviceID, capacity). The first call at a +// given (deviceID, capacity) pays a cudaMallocHost; subsequent calls +// reuse the existing buffer. +func GetPinned(deviceID, n int) []koalabear.Element { + if n <= 0 { + return nil + } + key := pinnedKey{deviceID: deviceID, capacity: n} + pinnedMu.Lock() + defer pinnedMu.Unlock() + if buf, ok := pinnedCache[key]; ok { + return buf + } + // Cache miss: allocate fresh and stash. AllocPinned panics on failure. + buf := AllocPinned(n) + pinnedCache[key] = buf + return buf +} + +// ReleasePinnedCache frees all cached pinned buffers for the given +// device. Use at logical boundaries to reclaim host RAM (pinned memory +// is page-locked and counts against the system's pinned-memory budget). +// +// Pass deviceID < 0 to release every cached buffer regardless of device. +func ReleasePinnedCache(deviceID int) { + pinnedMu.Lock() + defer pinnedMu.Unlock() + for key, buf := range pinnedCache { + if deviceID >= 0 && key.deviceID != deviceID { + continue + } + FreePinned(buf) + delete(pinnedCache, key) + } +} diff --git a/prover/gpu/vortex/stub.go b/prover/gpu/vortex/stub.go new file mode 100644 index 00000000000..6e45151da05 --- /dev/null +++ b/prover/gpu/vortex/stub.go @@ -0,0 +1,67 @@ +//go:build !cuda + +// GPU type stubs for non-CUDA builds. Guard calls with gpu.Enabled. +package vortex + +import ( + "unsafe" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + + "github.com/consensys/linea-monorepo/prover/gpu" +) + +// ─── KBVector ──────────────────────────────────────────────────────────────── + +type KBVector struct{} + +func NewKBVector(_ *gpu.Device, _ int) (*KBVector, error) { panic("gpu: cuda required") } +func (v *KBVector) Free() {} +func (v *KBVector) Len() int { return 0 } +func (v *KBVector) CopyFromHost(_ []koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) CopyToHost(_ []koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) Add(_, _ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) Sub(_, _ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) Mul(_, _ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) Scale(_ koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) ScaleByPowers(_ koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) BitReverse() { panic("gpu: cuda required") } +func (v *KBVector) CopyFromDevice(_ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) DevicePtr() unsafe.Pointer { panic("gpu: cuda required") } + +// ─── GPUFFTDomain ──────────────────────────────────────────────────────────── + +type GPUFFTDomain struct{} + +func NewGPUFFTDomain(_ *gpu.Device, _ int) (*GPUFFTDomain, error) { panic("gpu: cuda required") } +func (f *GPUFFTDomain) Free() {} +func (f *GPUFFTDomain) FFT(_ *KBVector) { panic("gpu: cuda required") } +func (f *GPUFFTDomain) FFTInverse(_ *KBVector) { panic("gpu: cuda required") } +func (f *GPUFFTDomain) CosetFFT(_ *KBVector, _ koalabear.Element) { panic("gpu: cuda required") } + +// ─── GPUPoseidon2 ──────────────────────────────────────────────────────────── + +type GPUPoseidon2 struct{} + +func NewGPUPoseidon2(_ *gpu.Device, _ int) (*GPUPoseidon2, error) { panic("gpu: cuda required") } +func (p *GPUPoseidon2) Free() {} +func (p *GPUPoseidon2) CompressBatch(_ []koalabear.Element, _ int) []Hash { + panic("gpu: cuda required") +} + +// ─── GPUVortex ─────────────────────────────────────────────────────────────── + +type GPUVortex struct{} + +func NewGPUVortex(_ *gpu.Device, _ *Params, _ int) (*GPUVortex, error) { panic("gpu: cuda required") } +func (gv *GPUVortex) Free() {} +func (gv *GPUVortex) Commit(_ [][]koalabear.Element) (*CommitState, Hash, error) { + panic("gpu: cuda required") +} + +// ─── GPU helpers ───────────────────────────────────────────────────────────── + +func GPULinCombE4(_ *gpu.Device, _ []*KBVector, _ fext.E4, _ int) []fext.E4 { + panic("gpu: cuda required") +} diff --git a/prover/gpu/vortex/vortex.go b/prover/gpu/vortex/vortex.go new file mode 100644 index 00000000000..12bf7a02bf7 --- /dev/null +++ b/prover/gpu/vortex/vortex.go @@ -0,0 +1,217 @@ +// Package vortex implements GPU-accelerated Vortex polynomial commitment over KoalaBear. +// +// Vortex encodes a matrix of field elements using Reed-Solomon codes, +// hashes columns (SIS + Poseidon2), builds a Merkle tree, and commits to the root. +// Opening proofs reveal a random linear combination (UAlpha) and selected columns +// with Merkle inclusion proofs. +// +// This package is API-compatible with linea-monorepo/prover/crypto/vortex/vortex_koalabear. +// When built with CGO + CUDA, hot paths (RS encoding, Merkle tree, linear combination) +// run on GPU. Without CGO, it falls back to gnark-crypto's CPU implementation. +// +// Protocol overview (cf. https://eprint.iacr.org/2024/185): +// +// Prover Verifier +// ┌───────────────────────────────────────────────────────────────────────────┐ +// │ M[nRows × nCols] │ +// │ │ │ +// │ RS encode rows ──▶ Encoded[nRows × nCols·ρ] │ +// │ │ │ +// │ SIS hash columns ──▶ Poseidon2 ──▶ Merkle tree ──▶ root ──────▶ commit │ +// │ │ +// │ ◀────────── α, x, S (random challenges) ◀────────────────────── │ +// │ │ +// │ UAlpha = Σ αⁱ · row[i] (E4 vector, length nCols·ρ) │ +// │ open columns S, Merkle proofs ──────────────────────▶ verify │ +// │ │ │ +// │ eval(UAlpha, x) = Σ yᵢ·αⁱ ? │ +// │ UAlpha ∈ RS code? │ +// │ col(α) = UAlpha[col_idx]? │ +// │ Merkle proofs valid? │ +// └───────────────────────────────────────────────────────────────────────────┘ +package vortex + +import ( + "fmt" + "hash" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/consensys/gnark-crypto/field/koalabear/fft" + "github.com/consensys/gnark-crypto/field/koalabear/sis" + "github.com/consensys/gnark-crypto/field/koalabear/vortex" + "github.com/consensys/gnark-crypto/utils" +) + +// ───────────────────────────────────────────────────────────────────────────── +// Shared types (used by both CPU and GPU paths) +// ───────────────────────────────────────────────────────────────────────────── + +// Hash is a Poseidon2 digest: 8 KoalaBear elements (32 bytes). +type Hash = [8]koalabear.Element + +// HashConstructor mirrors gnark-crypto's vortex hash constructor. +type HashConstructor = vortex.HashConstructor + +// Option configures the prover behavior (hash functions, etc). +type Option = vortex.Option + +var ( + // ErrWrongSizeHash is returned when a custom hash is not 32 bytes. + ErrWrongSizeHash = vortex.ErrWrongSizeHash +) + +// MerkleProof is a sequence of sibling hashes from leaf to root. +type MerkleProof = vortex.MerkleProof + +// Params holds the public parameters of the Vortex commitment scheme. +type Params struct { + inner *vortex.Params +} + +// Proof is a Vortex opening proof. +type Proof struct { + // UAlpha is the random linear combination Σ αⁱ · encoded_row[i] ∈ E4^(nCols·ρ). + UAlpha []fext.E4 + // Columns are the opened columns from the encoded matrix. + Columns [][]koalabear.Element + // MerkleProofs are the Merkle inclusion proofs for each opened column. + MerkleProofs []MerkleProof +} + +// ───────────────────────────────────────────────────────────────────────────── +// Parameters +// ───────────────────────────────────────────────────────────────────────────── + +func WithMerkleHash(h hash.Hash) Option { + return vortex.WithMerkleHash(h) +} + +func WithColumnHash(h hash.Hash) Option { + return vortex.WithColumnHash(h) +} + +// NewParams constructs Vortex parameters. +// +// Unlike gnark-crypto's vortex.NewParams (which restricts rate to 2, 4, 8), +// this constructor accepts any power-of-two rate as long as the total domain +// size (nbColumns × rate) does not exceed the KoalaBear two-adicity limit (2^24). +func NewParams(nbColumns, maxNbRows int, sisParams *sis.RSis, + rate, numSelectedColumns int, opts ...Option) (*Params, error) { + + if nbColumns < 1 || nbColumns&(nbColumns-1) != 0 { + return nil, fmt.Errorf("vortex: number of columns must be a power of two, got %d", nbColumns) + } + if rate < 2 || rate&(rate-1) != 0 { + return nil, fmt.Errorf("vortex: rate must be a power of two >= 2, got %d", rate) + } + + shift, err := koalabear.Generator(uint64(nbColumns * rate)) + if err != nil { + return nil, fmt.Errorf("vortex: generator for domain size %d: %w", nbColumns*rate, err) + } + + smallDomain := fft.NewDomain(uint64(nbColumns), fft.WithShift(shift)) + cosetTable, err := smallDomain.CosetTable() + if err != nil { + return nil, fmt.Errorf("vortex: coset table: %w", err) + } + cosetTableBitReverse := make(koalabear.Vector, len(cosetTable)) + copy(cosetTableBitReverse, cosetTable) + utils.BitReverse(cosetTableBitReverse) + + bigDomain := fft.NewDomain(uint64(nbColumns * rate)) + + p := &vortex.Params{ + Key: sisParams, + Domains: [2]*fft.Domain{smallDomain, bigDomain}, + ReedSolomonInvRate: rate, + NbColumns: nbColumns, + MaxNbRows: maxNbRows, + NumSelectedColumns: numSelectedColumns, + CosetTableBitReverse: cosetTableBitReverse, + } + return &Params{inner: p}, nil +} + +// SizeCodeWord returns the number of columns after RS encoding (nbColumns × rate). +func (p *Params) SizeCodeWord() int { + return p.inner.SizeCodeWord() +} + +// ───────────────────────────────────────────────────────────────────────────── +// Verify (same for CPU and GPU) +// ───────────────────────────────────────────────────────────────────────────── + +type VerifierInput = vortex.VerifierInput + +// VerifyInput matches gnark-crypto's verifier API. +func (p *Params) VerifyInput(input VerifierInput) error { + return p.inner.Verify(input) +} + +// Verify checks a Vortex opening proof. +func (p *Params) Verify(root Hash, proof *Proof, claimedValues []fext.E4, + x, alpha fext.E4, selectedCols []int) error { + + return p.inner.Verify(vortex.VerifierInput{ + MerkleRoot: root, + ClaimedValues: claimedValues, + EvaluationPoint: x, + Alpha: alpha, + SelectedColumns: selectedCols, + Proof: &vortex.Proof{ + UAlpha: proof.UAlpha, + OpenedColumns: proof.Columns, + MerkleProofOpenedColumns: proof.MerkleProofs, + }, + }) +} + +// ───────────────────────────────────────────────────────────────────────────── +// Standalone helpers (exported for GPU kernel validation in tests) +// ───────────────────────────────────────────────────────────────────────────── + +func (p *Params) EncodeReedSolomon(input, res []koalabear.Element) { + p.inner.EncodeReedSolomon(input, res) +} + +func CompressPoseidon2(a, b Hash) Hash { + return vortex.CompressPoseidon2(a, b) +} + +func CompressPoseidon2x16(matrix []koalabear.Element, colSize int, result []Hash) { + vortex.CompressPoseidon2x16(matrix, colSize, result) +} + +func HashPoseidon2(x []koalabear.Element) Hash { + return vortex.HashPoseidon2(x) +} + +func HashPoseidon2x16(sisHashes []koalabear.Element, merkleLeaves []Hash, sisKeySize int) { + vortex.HashPoseidon2x16(sisHashes, merkleLeaves, sisKeySize) +} + +func EvalBasePolyLagrange(poly []koalabear.Element, x fext.E4) (fext.E4, error) { + return vortex.EvalBasePolyLagrange(poly, x) +} + +func EvalFextPolyLagrange(poly []fext.E4, x fext.E4) (fext.E4, error) { + return vortex.EvalFextPolyLagrange(poly, x) +} + +func EvalBasePolyHorner(poly []koalabear.Element, x fext.E4) fext.E4 { + return vortex.EvalBasePolyHorner(poly, x) +} + +func EvalFextPolyHorner(poly []fext.E4, x fext.E4) fext.E4 { + return vortex.EvalFextPolyHorner(poly, x) +} + +func BatchEvalBasePolyLagrange(polys [][]koalabear.Element, x fext.E4, oncoset ...bool) ([]fext.E4, error) { + return vortex.BatchEvalBasePolyLagrange(polys, x, oncoset...) +} + +func BatchEvalFextPolyLagrange(polys [][]fext.E4, x fext.E4, oncoset ...bool) ([]fext.E4, error) { + return vortex.BatchEvalFextPolyLagrange(polys, x, oncoset...) +} diff --git a/prover/gpu/vortex/vortex_test.go b/prover/gpu/vortex/vortex_test.go new file mode 100644 index 00000000000..349249a4f4d --- /dev/null +++ b/prover/gpu/vortex/vortex_test.go @@ -0,0 +1,514 @@ +package vortex + +import ( + "math/rand/v2" + "testing" + + "github.com/consensys/gnark-crypto/field/koalabear" + fext "github.com/consensys/gnark-crypto/field/koalabear/extensions" + "github.com/consensys/gnark-crypto/field/koalabear/sis" + refvortex "github.com/consensys/gnark-crypto/field/koalabear/vortex" + "github.com/stretchr/testify/require" +) + +// ─── Helpers ──────────────────────────────────────────────────────────────── + +func randKB(rng *rand.Rand) koalabear.Element { + return koalabear.Element{rng.Uint32N(2130706433)} +} + +func randE4(rng *rand.Rand) fext.E4 { + return fext.E4{ + B0: fext.E2{A0: randKB(rng), A1: randKB(rng)}, + B1: fext.E2{A0: randKB(rng), A1: randKB(rng)}, + } +} + +func randMatrix(rng *rand.Rand, nRows, nCols int) [][]koalabear.Element { + m := make([][]koalabear.Element, nRows) + for i := range m { + m[i] = make([]koalabear.Element, nCols) + for j := range m[i] { + m[i][j] = randKB(rng) + } + } + return m +} + +func deterministicRNG(seed byte) *rand.Rand { + var s [32]byte + s[0] = seed + return rand.New(rand.NewChaCha8(s)) //nolint:gosec // deterministic test RNG +} + +// ─── Roundtrip: Commit → Prove → Verify ───────────────────────────────────── + +func TestRoundtrip(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(0) + + nCols := 16 + nRows := 8 + rate := 2 + nSelected := 4 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + // Random matrix + m := randMatrix(rng, nRows, nCols) + + // Compute claimed values: Ys[i] = eval(row[i], x) in Lagrange form + x := randE4(rng) + ys := make([]fext.E4, nRows) + for i := range m { + ys[i], err = EvalBasePolyLagrange(m[i], x) + assert.NoError(err) + } + + alpha := randE4(rng) + selectedCols := []int{0, 1, 2, 3} + + // Commit + cs, root, err := params.Commit(m) + assert.NoError(err) + + // Prove + proof, err := cs.Prove(alpha, selectedCols) + assert.NoError(err) + + // Verify + err = params.Verify(root, proof, ys, x, alpha, selectedCols) + assert.NoError(err, "roundtrip verification failed") +} + +// ─── Zero matrix ───────────────────────────────────────────────────────────── + +func TestZeroMatrix(t *testing.T) { + assert := require.New(t) + + nCols := 16 + nRows := 8 + rate := 2 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, 4) + assert.NoError(err) + + // All-zero matrix + m := make([][]koalabear.Element, nRows) + for i := range m { + m[i] = make([]koalabear.Element, nCols) + } + + x := fext.E4{} + ys := make([]fext.E4, nRows) + alpha, _ := new(fext.E4).SetRandom() + selectedCols := []int{0, 1, 2, 3} + + cs, root, err := params.Commit(m) + assert.NoError(err) + + proof, err := cs.Prove(*alpha, selectedCols) + assert.NoError(err) + + err = params.Verify(root, proof, ys, x, *alpha, selectedCols) + assert.NoError(err, "zero matrix verification failed") +} + +// ─── Cross-validation: our API vs gnark-crypto reference ───────────────────── + +func TestCrossValidationCommitment(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(42) + + nCols := 32 + nRows := 16 + rate := 2 + nSelected := 8 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + // Our params + ourParams, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + // Reference params (gnark-crypto) + refParams, err := refvortex.NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + // Same matrix + m := randMatrix(rng, nRows, nCols) + + // Commit via our API + _, ourRoot, err := ourParams.Commit(m) + assert.NoError(err) + + // Commit via reference + refState, err := refvortex.Commit(refParams, m) + assert.NoError(err) + refRoot := refState.GetCommitment() + + // Roots must match + assert.Equal(refRoot, ourRoot, "commitment roots differ from gnark-crypto reference") +} + +func TestCrossValidationFullProtocol(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(99) + + nCols := 32 + nRows := 16 + rate := 2 + nSelected := 4 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + ourParams, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + refParams, err := refvortex.NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + + // Shared randomness + x := randE4(rng) + alpha := randE4(rng) + selectedCols := make([]int, nSelected) + for i := range selectedCols { + selectedCols[i] = rng.IntN(nCols*rate - 1) + } + + // Compute claimed values + ys := make([]fext.E4, nRows) + for i := range m { + ys[i], err = refvortex.EvalBasePolyLagrange(m[i], x) + assert.NoError(err) + } + + // ── Our side ── + ourCS, ourRoot, err := ourParams.Commit(m) + assert.NoError(err) + + ourProof, err := ourCS.Prove(alpha, selectedCols) + assert.NoError(err) + + err = ourParams.Verify(ourRoot, ourProof, ys, x, alpha, selectedCols) + assert.NoError(err, "our verify failed") + + // ── Reference side ── + refState, err := refvortex.Commit(refParams, m) + assert.NoError(err) + + refState.OpenLinComb(alpha) + refProof, err := refState.OpenColumns(selectedCols) + assert.NoError(err) + + err = refParams.Verify(refvortex.VerifierInput{ + Proof: refProof, + MerkleRoot: refState.GetCommitment(), + ClaimedValues: ys, + EvaluationPoint: x, + Alpha: alpha, + SelectedColumns: selectedCols, + }) + assert.NoError(err, "reference verify failed") + + // ── Cross-check ── + assert.Equal(refState.GetCommitment(), ourRoot, "roots must match") + assert.Equal(len(refProof.UAlpha), len(ourProof.UAlpha), "UAlpha length mismatch") + for i := range refProof.UAlpha { + assert.Equal(refProof.UAlpha[i], ourProof.UAlpha[i], "UAlpha[%d] mismatch", i) + } +} + +// ─── Component tests ───────────────────────────────────────────────────────── + +func TestReedSolomonEncode(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(7) + + nCols := 64 + rate := 2 + + sisParams, err := sis.NewRSis(0, 9, 16, 8) + assert.NoError(err) + + params, err := NewParams(nCols, 8, sisParams, rate, 4) + assert.NoError(err) + + // Random row + row := make([]koalabear.Element, nCols) + for j := range row { + row[j] = randKB(rng) + } + + // Encode via our API + ourEncoded := make([]koalabear.Element, nCols*rate) + params.EncodeReedSolomon(row, ourEncoded) + + // Encode via reference + refParams, err := refvortex.NewParams(nCols, 8, sisParams, rate, 4) + assert.NoError(err) + refEncoded := make([]koalabear.Element, nCols*rate) + refParams.EncodeReedSolomon(row, refEncoded) + + assert.Equal(refEncoded, ourEncoded, "RS encoding mismatch") +} + +func TestPoseidon2Compress(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(13) + + var a, b Hash + for j := 0; j < 8; j++ { + a[j] = randKB(rng) + b[j] = randKB(rng) + } + + // Our API + ourHash := CompressPoseidon2(a, b) + // Reference + refHash := refvortex.CompressPoseidon2(a, b) + + assert.Equal(refHash, ourHash, "Poseidon2 compress mismatch") +} + +func TestPoseidon2Sponge(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(17) + + input := make([]koalabear.Element, 48) + for j := range input { + input[j] = randKB(rng) + } + + ourHash := HashPoseidon2(input) + refHash := refvortex.HashPoseidon2(input) + + assert.Equal(refHash, ourHash, "Poseidon2 sponge mismatch") +} + +func TestPoseidon2CompressX16(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(19) + + const ( + width = 16 + colSize = 16 + ) + input := make([]koalabear.Element, width*colSize) + for i := range input { + input[i] = randKB(rng) + } + + our := make([]Hash, width) + ref := make([]Hash, width) + CompressPoseidon2x16(input, colSize, our) + refvortex.CompressPoseidon2x16(input, colSize, ref) + + assert.Equal(ref, our, "Poseidon2 x16 mismatch") +} + +func TestPolyEval(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(23) + + n := 16 + poly := make([]koalabear.Element, n) + for j := range poly { + poly[j] = randKB(rng) + } + x := randE4(rng) + + ourVal, err := EvalBasePolyLagrange(poly, x) + assert.NoError(err) + refVal, err := refvortex.EvalBasePolyLagrange(poly, x) + assert.NoError(err) + + assert.Equal(refVal, ourVal, "EvalBasePolyLagrange mismatch") +} + +func TestEvalBasePolyHorner(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(27) + + n := 16 + poly := make([]koalabear.Element, n) + for i := range poly { + poly[i] = randKB(rng) + } + x := randE4(rng) + + assert.Equal(refvortex.EvalBasePolyHorner(poly, x), EvalBasePolyHorner(poly, x)) +} + +func TestBatchPolyEvalLagrange(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(29) + + const ( + numPolys = 5 + n = 16 + ) + basePolys := make([][]koalabear.Element, numPolys) + fextPolys := make([][]fext.E4, numPolys) + for i := 0; i < numPolys; i++ { + basePolys[i] = make([]koalabear.Element, n) + fextPolys[i] = make([]fext.E4, n) + for j := 0; j < n; j++ { + basePolys[i][j] = randKB(rng) + fextPolys[i][j] = randE4(rng) + } + } + x := randE4(rng) + + ourBase, err := BatchEvalBasePolyLagrange(basePolys, x) + assert.NoError(err) + refBase, err := refvortex.BatchEvalBasePolyLagrange(basePolys, x) + assert.NoError(err) + assert.Equal(refBase, ourBase) + + ourFext, err := BatchEvalFextPolyLagrange(fextPolys, x, true) + assert.NoError(err) + refFext, err := refvortex.BatchEvalFextPolyLagrange(fextPolys, x, true) + assert.NoError(err) + assert.Equal(refFext, ourFext) +} + +// ─── Rate 8 ────────────────────────────────────────────────────────────────── + +func TestRoundtripRate8(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(31) + + nCols := 16 + nRows := 4 + rate := 8 + nSelected := 4 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + + x := randE4(rng) + ys := make([]fext.E4, nRows) + for i := range m { + ys[i], err = EvalBasePolyLagrange(m[i], x) + assert.NoError(err) + } + + alpha := randE4(rng) + selectedCols := []int{0, 5, 30, 100} + + cs, root, err := params.Commit(m) + assert.NoError(err) + + proof, err := cs.Prove(alpha, selectedCols) + assert.NoError(err) + + err = params.Verify(root, proof, ys, x, alpha, selectedCols) + assert.NoError(err, "rate-8 roundtrip verification failed") +} + +// ─── Larger matrix ─────────────────────────────────────────────────────────── + +func TestLargerMatrix(t *testing.T) { + assert := require.New(t) + rng := deterministicRNG(37) + + nCols := 256 + nRows := 64 + rate := 2 + nSelected := 16 + + sisParams, err := sis.NewRSis(0, 9, 16, nRows) + assert.NoError(err) + + params, err := NewParams(nCols, nRows, sisParams, rate, nSelected) + assert.NoError(err) + + m := randMatrix(rng, nRows, nCols) + + x := randE4(rng) + ys := make([]fext.E4, nRows) + for i := range m { + ys[i], err = EvalBasePolyLagrange(m[i], x) + assert.NoError(err) + } + + alpha := randE4(rng) + selectedCols := make([]int, nSelected) + for i := range selectedCols { + selectedCols[i] = rng.IntN(nCols*rate - 1) + } + + cs, root, err := params.Commit(m) + assert.NoError(err) + + proof, err := cs.Prove(alpha, selectedCols) + assert.NoError(err) + + err = params.Verify(root, proof, ys, x, alpha, selectedCols) + assert.NoError(err, "large matrix roundtrip failed") +} + +// ─── Benchmark ─────────────────────────────────────────────────────────────── + +func BenchmarkVortex(b *testing.B) { + rng := deterministicRNG(0) + + nCols := 1024 + nRows := 128 + rate := 2 + nSelected := 32 + + sisParams, _ := sis.NewRSis(0, 9, 16, nRows) + params, _ := NewParams(nCols, nRows, sisParams, rate, nSelected) + m := randMatrix(rng, nRows, nCols) + + alpha := randE4(rng) + x := randE4(rng) + selectedCols := make([]int, nSelected) + for i := range selectedCols { + selectedCols[i] = rng.IntN(nCols*rate - 1) + } + + b.Run("Commit", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _, _, _ = params.Commit(m) + } + }) + + cs, root, _ := params.Commit(m) + ys := make([]fext.E4, nRows) + for i := range m { + ys[i], _ = EvalBasePolyLagrange(m[i], x) + } + + b.Run("Prove", func(b *testing.B) { + for i := 0; i < b.N; i++ { + cs, _, _ = params.Commit(m) + _, _ = cs.Prove(alpha, selectedCols) + } + }) + + proof, _ := cs.Prove(alpha, selectedCols) + + b.Run("Verify", func(b *testing.B) { + for i := 0; i < b.N; i++ { + _ = params.Verify(root, proof, ys, x, alpha, selectedCols) + } + }) +} diff --git a/prover/integration/circuit-testing/aggregation/main.go b/prover/integration/circuit-testing/aggregation/main.go index b311e9df7c8..279ffbd72e4 100644 --- a/prover/integration/circuit-testing/aggregation/main.go +++ b/prover/integration/circuit-testing/aggregation/main.go @@ -164,7 +164,7 @@ func main() { // Assigning the BW6 circuit logrus.Infof("Generating the aggregation proof for arity %v", nc) - bw6Proof, err := aggregation.MakeProof(&ppBw6, nc, innerProofClaims, piInfo, aggregationPI) + bw6Proof, err := aggregation.MakeProof(&ppBw6, nc, innerProofClaims, piInfo, aggregationPI, false) assert.NoError(t, err) bw6Proofs = append(bw6Proofs, bw6Proof) @@ -184,7 +184,7 @@ func main() { aggregationPiBn254.SetBytes(aggregationPIBytes) for k := range bw6Proofs { logrus.Infof("Generating the proof for the emulation circuit (BW6 Proof #%v)", k) - _, err = emulation.MakeProof(&setupEmulation, k, bw6Proofs[k], aggregationPiBn254) + _, err = emulation.MakeProof(&setupEmulation, k, bw6Proofs[k], aggregationPiBn254, false) assert.NoError(t, err) } diff --git a/prover/protocol/compiler/globalcs/quotient.go b/prover/protocol/compiler/globalcs/quotient.go index 986dad024f4..b1a6d2803c0 100644 --- a/prover/protocol/compiler/globalcs/quotient.go +++ b/prover/protocol/compiler/globalcs/quotient.go @@ -15,6 +15,8 @@ import ( "github.com/consensys/gnark-crypto/ecc" "github.com/consensys/gnark-crypto/field/koalabear/extensions" "github.com/consensys/gnark-crypto/field/koalabear/fft" + "github.com/consensys/linea-monorepo/prover/gpu" + gpuquotient "github.com/consensys/linea-monorepo/prover/gpu/quotient" "github.com/consensys/linea-monorepo/prover/maths/common/fastpoly" "github.com/consensys/linea-monorepo/prover/maths/common/fastpolyext" "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" @@ -195,8 +197,43 @@ type coeffEntry struct { ext []fext.Element } -// compute the quotient shares. +// compute the quotient shares. When the master aggregation flag is set +// (gpu.IsAggregationEnabled) and a GPU is bound to the calling goroutine, +// the work is delegated to gpu/quotient.RunGPU; on any error the CPU +// implementation runs as a fallback, so correctness is preserved even if +// the GPU path errors out. +// +// On multi-GPU hosts each segment goroutine pins itself to one GPU via +// gpu.SetCurrentDevice, and we honour that here through gpu.CurrentDevice(). func (ctx *QuotientCtx) Run(run *wizard.ProverRuntime) { + if gpu.IsAggregationEnabled() { + if dev := gpu.CurrentDevice(); dev != nil { + devID := gpu.CurrentDeviceID() + start := time.Now() + err := gpuquotient.RunGPU( + dev, run, + ctx.DomainSize, + ctx.Ratios, + ctx.AggregateExpressionsBoard, + ctx.RootsForRatio, + ctx.ShiftedColumnsForRatio, + ctx.QuotientShares, + ctx.ConstraintsByRatio, + ) + gpu.TraceEvent("quotient", devID, time.Since(start), map[string]any{ + "domain": ctx.DomainSize, + "ok": err == nil, + }) + if err == nil { + return + } + log.Warnf("[quotient d=%d] GPU path failed on device %d, falling back to CPU: %v", ctx.DomainSize, devID, err) + } + } + ctx.runCPU(run) +} + +func (ctx *QuotientCtx) runCPU(run *wizard.ProverRuntime) { stopTimer := profiling.LogTimer("computed the quotient (domain size %d)", ctx.DomainSize) defer stopTimer() diff --git a/prover/protocol/compiler/recursion/actions.go b/prover/protocol/compiler/recursion/actions.go index d18d39c2693..f84f3f1d111 100644 --- a/prover/protocol/compiler/recursion/actions.go +++ b/prover/protocol/compiler/recursion/actions.go @@ -68,7 +68,17 @@ func ExtractWitness(run *wizard.ProverRuntime) Witness { ) if committedMatrix != nil { - committedMatrices = append(committedMatrices, committedMatrix.(vortex_koalabear.EncodedMatrix)) + // Accept either the legacy raw EncodedMatrix or the new + // *committedHandle wrapper introduced when SIS-applied Koala + // rounds went device-resident. For GPU handles this forces + // a full D2H — unavoidable here because the recursion + // witness needs the host-side encoded matrix to feed into + // the recursion-circuit's verifier. + em := vortex.EncodedMatrixFromState(committedMatrix) + if em == nil { + utils.Panic("recursion ExtractWitness: unexpected vortex state type %T", committedMatrix) + } + committedMatrices = append(committedMatrices, em) } else { committedMatrices = append(committedMatrices, nil) } diff --git a/prover/protocol/compiler/vortex/committed.go b/prover/protocol/compiler/vortex/committed.go new file mode 100644 index 00000000000..b49c9c4adfb --- /dev/null +++ b/prover/protocol/compiler/vortex/committed.go @@ -0,0 +1,162 @@ +package vortex + +// Per-round commit handle stored under VortexProverStateName(round) in the +// Wizard prover runtime State map. A round's committed matrix lives in +// exactly one of two places: +// +// *committedHandle.host — vortex_koalabear.EncodedMatrix (= []SmartVector) +// BLS rounds, NoSIS rounds, precomputeds, and +// SIS rounds when the GPU path is disabled. +// +// *committedHandle.gpu — *gpuvortex.CommitState +// SIS-applied Koala rounds when +// LIMITLESS_GPU_VORTEX=1 and a GPU is bound. +// +// Why this exists +// ─────────────── +// Before this type, the value stored in run.State was the raw EncodedMatrix. +// The GPU "drop-in" CommitMerkleWithSIS therefore had to D2H the entire +// encoded matrix (8 GiB at 2^27 segment size) and reconstruct it as a +// []SmartVector in Go-managed memory before the prover state could hold +// it. That reconstruction cost ~1.4 s of pure host-side work — enough to +// turn a 4× GPU win into a 0.65× *regression*. +// +// With this handle, the GPU SIS path skips the full D2H. The encoded +// matrix stays on device; downstream actions (LinComb, Open) call +// device-resident methods that produce only the small outputs they need +// (UAlpha vector ~16 MiB, selected columns ~few MiB). +// +// Lifecycle +// ───────── +// 1. ColumnAssignmentProverAction.Run inserts a handle. +// 2. LinearCombinationComputationProverAction.Run reads (no mutation). +// 3. OpenSelectedColumnsProverAction.Run reads, then for GPU handles +// calls FreeGPU() once columns are extracted. + +import ( + "github.com/consensys/linea-monorepo/prover/crypto/vortex/vortex_koalabear" + gpuvortex "github.com/consensys/linea-monorepo/prover/gpu/vortex" + "github.com/consensys/linea-monorepo/prover/maths/field" +) + +type committedHandle struct { + gpu *gpuvortex.CommitState // non-nil when device-resident + host vortex_koalabear.EncodedMatrix // non-nil when host-resident + // recommit means the original GPU commit kept only the Merkle tree/root. + // UAlpha and selected columns must recommit from the original Wizard + // columns. This bounds VRAM for large segments that cannot afford one + // encoded snapshot per SIS round. + recommit bool + nRows int +} + +func newHostHandle(m vortex_koalabear.EncodedMatrix) *committedHandle { + return &committedHandle{host: m} +} + +func newGPUHandle(cs *gpuvortex.CommitState) *committedHandle { + return &committedHandle{gpu: cs} +} + +func newGPURecommitHandle(nRows int) *committedHandle { + return &committedHandle{recommit: true, nRows: nRows} +} + +// isGPU reports whether the matrix is device-resident. +func (h *committedHandle) isGPU() bool { return h.gpu != nil } + +func (h *committedHandle) isRecommit() bool { return h.recommit } + +// numRows is the row count without forcing a host materialization. +func (h *committedHandle) numRows() int { + if h.gpu != nil { + return h.gpu.NRows() + } + if h.recommit { + return h.nRows + } + return len(h.host) +} + +// hostMatrix returns the matrix as []SmartVector. For GPU handles this +// triggers a full D2H — used only by callers that genuinely need the +// host-side encoded matrix (self-recursion / debug). The two hot actions +// (LinComb and Open) avoid this and call into gpu/vortex directly. +func (h *committedHandle) hostMatrix() vortex_koalabear.EncodedMatrix { + if h.gpu != nil { + return h.gpu.GetEncodedMatrix() + } + return h.host +} + +// extractColumns returns columns[entryIdx][rowIdx] for each entry in +// entries. For GPU handles this issues a small D2H of only the selected +// columns (typically O(few MiB) regardless of matrix size). For host +// handles it gathers from the SmartVector slice. +// +// Used by OpenSelectedColumnsProverAction to fill proof.Columns without +// materializing the full encoded matrix. +func (h *committedHandle) extractColumns(entries []int) [][]field.Element { + if h.gpu != nil { + // gpu.ExtractColumns may fail if the device pipeline was already + // freed; the OpenSelectedColumns action is the only caller and + // runs before free(), so we surface the error via panic to keep + // the call sites simple. + cols, err := h.gpu.ExtractColumns(entries) + if err != nil { + panic("vortex: GPU ExtractColumns: " + err.Error()) + } + return cols + } + out := make([][]field.Element, len(entries)) + for i, c := range entries { + col := make([]field.Element, len(h.host)) + for r, row := range h.host { + col[r] = row.Get(c) + } + out[i] = col + } + return out +} + +// free releases device buffers. Idempotent. No-op for host handles. +func (h *committedHandle) free() { + if h.gpu != nil { + h.gpu.FreeGPU() + } +} + +// asHandle promotes a value read from run.State into a *committedHandle. +// Accepts either: +// - the new *committedHandle wrapper (preferred path) +// - a raw vortex_koalabear.EncodedMatrix (legacy callers / non-SIS rounds +// that haven't been migrated yet) +// +// Returns nil if v is neither. +func asHandle(v any) *committedHandle { + switch x := v.(type) { + case *committedHandle: + return x + case vortex_koalabear.EncodedMatrix: + return newHostHandle(x) + } + return nil +} + +// EncodedMatrixFromState reads a committed matrix from the prover state map +// regardless of whether it's stored as a raw EncodedMatrix (legacy host +// path) or wrapped in *committedHandle (current path, possibly GPU-resident). +// +// For GPU handles, this triggers a full D2H of the encoded matrix — +// expensive and to be avoided on the hot path. Use only from places that +// genuinely need the host []SmartVector representation, e.g. self-recursion +// witness extraction. Returns nil when the state slot is empty. +func EncodedMatrixFromState(v any) vortex_koalabear.EncodedMatrix { + if v == nil { + return nil + } + if h := asHandle(v); h != nil { + return h.hostMatrix() + } + return nil +} diff --git a/prover/protocol/compiler/vortex/prover.go b/prover/protocol/compiler/vortex/prover.go index f93136c5025..b342a62701f 100644 --- a/prover/protocol/compiler/vortex/prover.go +++ b/prover/protocol/compiler/vortex/prover.go @@ -1,16 +1,23 @@ package vortex import ( + "os" + "strconv" + "time" + "github.com/consensys/linea-monorepo/prover/crypto/encoding" "github.com/consensys/linea-monorepo/prover/utils/types" bls12377 "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + "github.com/consensys/gnark-crypto/field/koalabear/extensions" gnarkvortex "github.com/consensys/gnark-crypto/field/koalabear/vortex" "github.com/consensys/linea-monorepo/prover/crypto/state-management/smt_bls12377" "github.com/consensys/linea-monorepo/prover/crypto/state-management/smt_koalabear" "github.com/consensys/linea-monorepo/prover/crypto/vortex" vortex_bls12377 "github.com/consensys/linea-monorepo/prover/crypto/vortex/vortex_bls12377" "github.com/consensys/linea-monorepo/prover/crypto/vortex/vortex_koalabear" + "github.com/consensys/linea-monorepo/prover/gpu" + gpuvortex "github.com/consensys/linea-monorepo/prover/gpu/vortex" "github.com/consensys/linea-monorepo/prover/maths/common/smartvectors" "github.com/consensys/linea-monorepo/prover/utils" "github.com/sirupsen/logrus" @@ -20,6 +27,61 @@ import ( "github.com/consensys/linea-monorepo/prover/protocol/wizard" ) +// fextE4 alias used for clarity in the GPU LinComb glue. The underlying +// type is github.com/consensys/gnark-crypto/field/koalabear/extensions.E4 +// (the same type the rest of the protocol uses via prover/maths/field/fext). +type fextE4 = extensions.E4 + +// useGPUVortex returns true when the wizard Vortex commit prover should +// dispatch to gpu/vortex. Tied to the master aggregation flag — see +// gpu.IsAggregationEnabled. Compression's wizard runs on CPU for both +// commits and quotient; only the BLS12-377 PlonK proof at the end of +// compression goes to gpu/plonk2. +func useGPUVortex() bool { + return gpu.IsAggregationEnabled() +} + +// forceGPUVortexRecommit forces every committed matrix back to host before +// the OpenSelectedColumnsProverAction so we always re-encode + re-hash on +// CPU at open time, instead of keeping the device-resident snapshot. This +// is an advanced override used to debug the device-resident path; it is +// not needed in production. +func forceGPUVortexRecommit() bool { + return os.Getenv("LINEA_PROVER_GPU_VORTEX_RECOMMIT") == "1" +} + +// gpuVortexSnapshotBudgetBytes caps how many GPU bytes we'll keep alive in +// the cross-round Vortex snapshot before dropping it (forcing a recommit at +// open time). Default 48 GiB leaves headroom on the 96 GiB reference card +// for the PlonK MSM/NTT phases that run later. +func gpuVortexSnapshotBudgetBytes() uint64 { + const defaultGiB = 48 + gib := uint64(defaultGiB) + if v := os.Getenv("LINEA_PROVER_GPU_VORTEX_SNAPSHOT_BUDGET_GIB"); v != "" { + if n, err := strconv.ParseUint(v, 10, 64); err == nil { + gib = n + } + } + return gib << 30 +} + +func (ctx *Ctx) shouldSnapshotGPUVortex() bool { + if ctx.IsSelfrecursed { + return true + } + if forceGPUVortexRecommit() { + return false + } + var sisRows uint64 + for round := 0; round <= ctx.MaxCommittedRound; round++ { + if ctx.RoundStatus[round] == IsSISApplied { + sisRows += uint64(ctx.CommitmentsByRounds.LenOf(round)) + } + } + estimatedBytes := sisRows * uint64(ctx.NumEncodedCols()) * 4 + return estimatedBytes <= gpuVortexSnapshotBudgetBytes() +} + type commitmentMode int const ( @@ -119,15 +181,75 @@ func (ctx *ColumnAssignmentProverAction) Run(run *wizard.ProverRuntime) { run.AssignColumn(ifaces.ColID(ctx.MerkleRootName(round, i)), smartvectors.NewConstant(roots[i], 1)) } } else { - var tree *smt_koalabear.Tree + var ( + tree *smt_koalabear.Tree + gpuCS *gpuvortex.CommitState // device-resident handle when SIS+GPU + ) if ctx.RoundStatus[round] == IsNoSis { committedMatrix, _, tree, noSisColHashes = ctx.VortexKoalaParams.CommitMerkleWithoutSIS(pols) } else if ctx.RoundStatus[round] == IsSISApplied { - committedMatrix, _, tree, sisColHashes = ctx.VortexKoalaParams.CommitMerkleWithSIS(pols) + // SIS-applied Koala rounds are the GPU sweet spot: most of the + // segment-prover wall-clock comes from these. + // + // Path selection: + // + // GPU device-resident (CommitSIS): keeps the encoded matrix on + // device. Downstream LinComb runs as a single device kernel + // that produces a small UAlpha vector (~16 MiB at 2^20 scw), + // and OpenSelectedColumns extracts only the verifier's + // selected columns (~few MiB). Net: we save ~1.4 s of host + // reconstruction at production size and 4.7× the AVX-512 CPU + // vortex_koalabear baseline. Requires the goroutine to have + // bound a GPU via runtime.LockOSThread + dev.Bind() — that's + // what limitless.pinGPU(slot) does. + // + // GPU drop-in (CommitMerkleWithSIS): D2Hs the full encoded + // matrix and rebuilds []SmartVector in Go-managed memory. + // The reconstruction overhead alone (~1.4 s at 2^20×2^11) + // wipes out the GPU compute win. NOT used here — kept around + // for legacy callers that still expect EncodedMatrix. + // + // CPU AVX-512 (vortex_koalabear): production fallback. Used + // when GPU is unavailable, or LIMITLESS_GPU_VORTEX is unset. + if useGPUVortex() && gpu.CurrentDevice() != nil { + start := time.Now() + if ctx.shouldSnapshotGPUVortex() { + gpuCS, tree, sisColHashes = gpuvortex.CommitSIS( + ctx.VortexKoalaParams, pols, ctx.IsSelfrecursed) + } else { + var ok bool + tree, sisColHashes, ok = gpuvortex.CommitSISRootOnly( + ctx.VortexKoalaParams, pols, ctx.IsSelfrecursed) + if ok { + gpuCS = nil + } else { + committedMatrix, _, tree, sisColHashes = ctx.VortexKoalaParams.CommitMerkleWithSIS(pols) + } + } + gpu.TraceEvent("vortex_commit_sis", gpu.CurrentDeviceID(), time.Since(start), map[string]any{ + "round": round, + "rows": len(pols), + "cols": ctx.VortexKoalaParams.NbColumns, + "snapshot": ctx.shouldSnapshotGPUVortex(), + }) + } else { + committedMatrix, _, tree, sisColHashes = ctx.VortexKoalaParams.CommitMerkleWithSIS(pols) + } } - run.State.InsertNew(ctx.VortexProverStateName(round), committedMatrix) + // Store the per-round commit handle. For GPU SIS rounds we wrap the + // *CommitState; otherwise we wrap the host EncodedMatrix. Downstream + // actions read both via asHandle() and dispatch on the variant. + var handle *committedHandle + if gpuCS != nil { + handle = newGPUHandle(gpuCS) + } else if committedMatrix == nil && useGPUVortex() && ctx.RoundStatus[round] == IsSISApplied { + handle = newGPURecommitHandle(len(pols)) + } else { + handle = newHostHandle(committedMatrix) + } + run.State.InsertNew(ctx.VortexProverStateName(round), handle) run.State.InsertNew(ctx.MerkleTreeName(round), tree) // Only to be read by the self-recursion compiler. @@ -151,58 +273,117 @@ type LinearCombinationComputationProverAction struct { *Ctx } -// Prover steps of Vortex that is run when committing to the linear combination -// We stack the No SIS round matrices before the SIS round matrices in the committed matrix stack. -// For the precomputed matrix, we stack it on top of the SIS round matrices if SIS is used on it or -// we stack it on top of the No SIS round matrices if SIS is not used on it. +// Run computes UAlpha = Σᵢ αⁱ · row[i] over the Vortex stack: +// all NoSIS matrices first, then all SIS matrices. Each chunk is accumulated +// with its explicit global row offset so hybrid host/GPU/fallback execution +// cannot silently reorder rows. func (ctx *LinearCombinationComputationProverAction) Run(pr *wizard.ProverRuntime) { - var ( - committedSVSIS = []smartvectors.SmartVector{} - committedSVNoSIS = []smartvectors.SmartVector{} - ) - // Add the precomputed columns - if ctx.IsNonEmptyPrecomputed() { - var precomputedSV = []smartvectors.SmartVector{} - precomputedSV = append(precomputedSV, ctx.Items.Precomputeds.CommittedMatrix...) + randomCoinLC := pr.GetRandomCoinFieldExt(ctx.Items.Alpha.Name) + proof := &vortex.OpeningProof{} + offset := 0 - // Add the precomputed columns to commitedSVSIS or commitedSVNoSIS - if ctx.IsSISAppliedToPrecomputed() { - committedSVSIS = append(committedSVSIS, precomputedSV...) - } else { - committedSVNoSIS = append(committedSVNoSIS, precomputedSV...) - } + if ctx.IsNonEmptyPrecomputed() && !ctx.IsSISAppliedToPrecomputed() { + addHostRowsToLinComb(proof, ctx.Items.Precomputeds.CommittedMatrix, randomCoinLC, offset) + offset += len(ctx.Items.Precomputeds.CommittedMatrix) } - // Collect all the committed polynomials : round by round for round := 0; round <= ctx.MaxCommittedRound; round++ { - // There are not included in the commitments so there - // is no need to compute their linear combination. - if ctx.RoundStatus[round] == IsEmpty { + if ctx.RoundStatus[round] != IsNoSis { continue } + raw := pr.State.MustGet(ctx.VortexProverStateName(round)) + h := asHandle(raw) + if h == nil { + utils.Panic("vortex linComb: unexpected NoSIS state type at round %v: %T", round, raw) + } + rows := h.hostMatrix() + addHostRowsToLinComb(proof, rows, randomCoinLC, offset) + offset += len(rows) + } - committedMatrix := pr.State.MustGet(ctx.VortexProverStateName(round)).(vortex_bls12377.EncodedMatrix) + if ctx.IsNonEmptyPrecomputed() && ctx.IsSISAppliedToPrecomputed() { + addHostRowsToLinComb(proof, ctx.Items.Precomputeds.CommittedMatrix, randomCoinLC, offset) + offset += len(ctx.Items.Precomputeds.CommittedMatrix) + } - // Push pols to the right stack - if ctx.RoundStatus[round] == IsNoSis { - committedSVNoSIS = append(committedSVNoSIS, committedMatrix...) + for round := 0; round <= ctx.MaxCommittedRound; round++ { + if ctx.RoundStatus[round] != IsSISApplied { + continue + } + raw := pr.State.MustGet(ctx.VortexProverStateName(round)) + h := asHandle(raw) + if h == nil { + utils.Panic("vortex linComb: unexpected SIS state type at round %v: %T", round, raw) + } - } else if ctx.RoundStatus[round] == IsSISApplied { - committedSVSIS = append(committedSVSIS, committedMatrix...) + switch { + case h.isRecommit(): + pols := ctx.getPols(pr, round) + partial, nRows, err := gpuvortex.CommitSISLinComb(ctx.VortexKoalaParams, pols, randomCoinLC) + if err != nil { + utils.Panic("vortex recommit linComb round %v: %v", round, err) + } + addPartialToLinComb(proof, partial, randomCoinLC, offset) + offset += nRows + case h.isGPU(): + partial, err := h.gpu.LinComb(randomCoinLC) + if err != nil { + utils.Panic("vortex GPU linComb round %v: %v", round, err) + } + addPartialToLinComb(proof, partial, randomCoinLC, offset) + offset += h.numRows() + default: + rows := h.hostMatrix() + addHostRowsToLinComb(proof, rows, randomCoinLC, offset) + offset += len(rows) } } - // Construct committedSV by stacking the No SIS round - // matrices before the SIS round matrices - committedSV := append(committedSVNoSIS, committedSVSIS...) - // And get the randomness - randomCoinLC := pr.GetRandomCoinFieldExt(ctx.Items.Alpha.Name) - - // and compute and assign the random linear combination of the rows - proof := &vortex.OpeningProof{} - vortex.LinearCombination(proof, committedSV, randomCoinLC) pr.AssignColumn(ctx.Items.Ualpha.GetColID(), proof.LinearCombination) +} +func addHostRowsToLinComb(proof *vortex.OpeningProof, rows []smartvectors.SmartVector, alpha fextE4, offset int) { + if len(rows) == 0 { + return + } + chunkProof := &vortex.OpeningProof{} + vortex.LinearCombination(chunkProof, rows, alpha) + partial := make([]fextE4, chunkProof.LinearCombination.Len()) + chunkProof.LinearCombination.WriteInSliceExt(partial) + addPartialToLinComb(proof, partial, alpha, offset) +} + +func addPartialToLinComb(proof *vortex.OpeningProof, partial []fextE4, alpha fextE4, offset int) { + if len(partial) == 0 { + return + } + if proof.LinearCombination == nil { + zeros := make([]fextE4, len(partial)) + proof.LinearCombination = smartvectors.NewRegularExt(zeros) + } + dst, ok := proof.LinearCombination.(*smartvectors.RegularExt) + if !ok { + utils.Panic("vortex linComb: unexpected accumulator type %T", proof.LinearCombination) + } + if len(*dst) != len(partial) { + utils.Panic("vortex linComb: partial length mismatch got %v want %v", len(partial), len(*dst)) + } + + scale := alphaPower(alpha, offset) + for j := range partial { + var t fextE4 + t.Mul(&partial[j], &scale) + (*dst)[j].Add(&(*dst)[j], &t) + } +} + +func alphaPower(alpha fextE4, exp int) fextE4 { + var res fextE4 + res.SetOne() + for ; exp > 0; exp-- { + res.Mul(&res, &alpha) + } + return res } // ComputeLinearCombFromRsMatrix is the same as ComputeLinearComb but uses @@ -230,7 +411,17 @@ func (ctx *Ctx) ComputeLinearCombFromRsMatrix(run *wizard.ProverRuntime) { continue } - committedMatrix := run.State.MustGet(ctx.VortexProverStateName(round)).(vortex_koalabear.EncodedMatrix) + // Read either the new *committedHandle wrapper or the legacy raw + // EncodedMatrix. For GPU handles this triggers a full D2H — same + // caveat as ExtractWitness above. ComputeLinearCombFromRsMatrix + // is on the recursion-vortex hot path where a future improvement + // could split host vs GPU partials like the main + // LinearCombinationComputationProverAction does. + raw := run.State.MustGet(ctx.VortexProverStateName(round)) + committedMatrix := EncodedMatrixFromState(raw) + if committedMatrix == nil { + utils.Panic("recursion-vortex linComb: unexpected state type at round %v: %T", round, raw) + } // Push pols to the right stack if ctx.RoundStatus[round] == IsNoSis { @@ -261,6 +452,20 @@ type OpenSelectedColumnsProverAction struct { *Ctx } +// Run extracts the selected columns + Merkle proofs and writes them into +// the proof under the appropriate column IDs. +// +// GPU integration +// ─────────────── +// For SIS-applied Koala rounds with a *committedHandle wrapping a +// gpuvortex.CommitState, columns are extracted directly via +// cs.ExtractColumns — a small D2H of only the selected entries. The +// host-side Merkle tree (already stored under MerkleTreeName(round)) is +// used unchanged for proof generation. After all columns are extracted +// the GPU buffers are freed via h.free(). +// +// Host SIS rounds, NoSIS rounds, BLS rounds, and precomputeds all stay +// on the existing host path. func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { var ( @@ -269,6 +474,14 @@ func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { treesSIS = []*smt_koalabear.Tree{} treesNoSIS = []*smt_koalabear.Tree{} blsTrees = []*smt_bls12377.Tree{} + // GPU SIS rounds: handle (for ExtractColumns + free) and the index + // into the final committedMatrices list. We pass an empty + // EncodedMatrix as a placeholder so SelectColumnsAndMerkleProofs's + // per-matrix iteration still indexes correctly; we overwrite + // proof.Columns[idx] afterward. + gpuSISHandles []*committedHandle + gpuSISColumns [][][]field.Element + gpuSISMatrixIdx []int // global index into committedMatrices = NoSIS+SIS ) // Append the precomputed committedMatrices and trees to the SIS or no SIS matrices @@ -289,23 +502,32 @@ func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { } } + entryList := run.GetRandomCoinIntegerVec(ctx.Items.Q.Name) + for round := 0; round <= ctx.MaxCommittedRound; round++ { // There are not included in the commitments so there // is no need to proceed. if ctx.RoundStatus[round] == IsEmpty { continue } - // Fetch it from the state - committedMatrix := run.State.MustGet(ctx.VortexProverStateName(round)).(vortex_koalabear.EncodedMatrix) - // and delete it because it won't be needed anymore and its very heavy + // Fetch the round's commit handle (legacy callers may store a raw + // EncodedMatrix; new callers store a *committedHandle that may + // wrap either a host EncodedMatrix or a GPU CommitState). + raw := run.State.MustGet(ctx.VortexProverStateName(round)) + // Delete eagerly: the encoded matrix may be very large and the + // state map keeps references alive longer than necessary. run.State.Del(ctx.VortexProverStateName(round)) + h := asHandle(raw) + if h == nil { + utils.Panic("vortex open: unexpected state type at round %v: %T", round, raw) + } + // Also fetches the trees from the prover state if ctx.IsBLS { + // BLS path has no GPU implementation yet — must be host-resident. + committedMatrix := h.hostMatrix() tree := run.State.MustGet(ctx.MerkleTreeName(round)).(*smt_bls12377.Tree) - // conditionally stack the matrix and tree - // to SIS or no SIS matrices and trees - if ctx.RoundStatus[round] == IsNoSis { committedMatricesNoSIS = append(committedMatricesNoSIS, committedMatrix) blsTrees = append(blsTrees, tree) @@ -315,19 +537,61 @@ func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { } else { tree := run.State.MustGet(ctx.MerkleTreeName(round)).(*smt_koalabear.Tree) - // conditionally stack the matrix and tree - // to SIS or no SIS matrices and trees if ctx.RoundStatus[round] == IsNoSis { - committedMatricesNoSIS = append(committedMatricesNoSIS, committedMatrix) + // NoSIS rounds are host-resident. + committedMatricesNoSIS = append(committedMatricesNoSIS, h.hostMatrix()) treesNoSIS = append(treesNoSIS, tree) } else if ctx.RoundStatus[round] == IsSISApplied { - committedMatricesSIS = append(committedMatricesSIS, committedMatrix) - treesSIS = append(treesSIS, tree) + if h.isRecommit() { + cols, err := gpuvortex.CommitSISExtractColumns( + ctx.VortexKoalaParams, ctx.getPols(run, round), entryList) + if err != nil { + utils.Panic("vortex recommit open round %v: %v", round, err) + } + gpuSISHandles = append(gpuSISHandles, nil) + gpuSISColumns = append(gpuSISColumns, cols) + committedMatricesSIS = append(committedMatricesSIS, vortex_bls12377.EncodedMatrix{}) + treesSIS = append(treesSIS, tree) + } else if h.isGPU() { + // Reserve a slot in the SIS section with an empty + // placeholder; we'll overwrite proof.Columns[idx] after + // SelectColumnsAndMerkleProofs runs. The host path + // iterates `range committedMatrices[i]` so an empty + // slice is safe — proof.Columns[idx] just becomes a + // list of empty []field.Element which we replace. + gpuSISHandles = append(gpuSISHandles, h) + gpuSISColumns = append(gpuSISColumns, nil) + committedMatricesSIS = append(committedMatricesSIS, vortex_bls12377.EncodedMatrix{}) + treesSIS = append(treesSIS, tree) + } else { + committedMatricesSIS = append(committedMatricesSIS, h.hostMatrix()) + treesSIS = append(treesSIS, tree) + } } } } + // Pre-compute the global index of each GPU SIS matrix in the final + // committedMatrices list (NoSIS first, then SIS). + if len(gpuSISHandles) > 0 { + // SIS section starts at len(committedMatricesNoSIS). + // gpu handles were appended to committedMatricesSIS in the order + // we encountered them, so their SIS-section offsets are + // (len(committedMatricesSIS) - len(gpuSISHandles)) + i, but a + // single sweep is simpler: + nNoSIS := len(committedMatricesNoSIS) + gpuSISMatrixIdx = make([]int, 0, len(gpuSISHandles)) + gpuPtr := 0 + for sisIdx := 0; sisIdx < len(committedMatricesSIS); sisIdx++ { + if len(committedMatricesSIS[sisIdx]) == 0 { + gpuSISMatrixIdx = append(gpuSISMatrixIdx, nNoSIS+sisIdx) + gpuPtr++ + } + } + _ = gpuPtr // assertion: gpuPtr should equal len(gpuSISHandles) + } + // Free original committed columns from run.Columns — their data has been // encoded into the Vortex matrices and is no longer needed in raw form. for round := 0; round <= ctx.MaxCommittedRound; round++ { @@ -342,8 +606,6 @@ func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { // Stack the no SIS matrices and trees before the SIS matrices and trees committedMatrices := append(committedMatricesNoSIS, committedMatricesSIS...) trees := append(treesNoSIS, treesSIS...) - - entryList := run.GetRandomCoinIntegerVec(ctx.Items.Q.Name) proof := vortex.OpeningProof{} // Amend the Vortex proof with the Merkle proofs and registers @@ -362,6 +624,20 @@ func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { merkleProofs := vortex_koalabear.SelectColumnsAndMerkleProofs(&proof, entryList, committedMatrices, trees) + // For GPU SIS rounds, SelectColumnsAndMerkleProofs above produced + // empty placeholder slots in proof.Columns. Replace them with the + // real columns extracted directly from the device buffers. + // This is the small D2H — only |entryList| × nRows × 4 bytes per + // matrix, never the full encoded matrix. + for i, h := range gpuSISHandles { + matrixIdx := gpuSISMatrixIdx[i] + if gpuSISColumns[i] != nil { + proof.Columns[matrixIdx] = gpuSISColumns[i] + } else { + proof.Columns[matrixIdx] = h.extractColumns(entryList) + } + } + packedMProofs := ctx.packMerkleProofs(merkleProofs) for i := range ctx.Items.MerkleProofs { @@ -369,6 +645,37 @@ func (ctx *OpenSelectedColumnsProverAction) Run(run *wizard.ProverRuntime) { } } + // Release GPU buffers now that columns have been extracted. Subsequent + // recursion / self-recursion paths read host data only. + for _, h := range gpuSISHandles { + if h != nil { + h.free() + } + } + + // Evict the per-device GPUVortex pipeline cache for THIS segment's + // device. Production segments commit at multiple Vortex rounds with + // different (nCols, nRows, rate) shapes; each shape gets its own + // cached *GPUVortex holding multi-GiB device buffers (d_work, + // d_encoded_col, d_sis, d_tree, d_leaves). Without eviction the + // cache grows monotonically across rounds within a segment and + // fills the 96 GiB device — we observed allocs failing with CUDA + // errors and wall-clock timing exploding to >2 minutes per call as + // the runtime retried + thrashed. + // + // Evicting after Open is correct: this is the last Vortex action of + // the inner-prove. The recursion stage's Vortex commits will create + // fresh pipelines on demand. Pinned host buffers also released — + // they'll be reallocated on the recursion-stage's first commit if + // needed. + if useGPUVortex() { + dev := gpu.CurrentDevice() + if dev != nil { + id := gpu.CurrentDeviceID() + gpuvortex.EvictPipelineCacheForDevice(id) + gpuvortex.ReleasePinnedCache(id) + } + } selectedCols := proof.Columns // Assign the opened columns diff --git a/prover/reference-benchmarks/README.md b/prover/reference-benchmarks/README.md new file mode 100644 index 00000000000..2f7edcdbb2d --- /dev/null +++ b/prover/reference-benchmarks/README.md @@ -0,0 +1,132 @@ +# Reference Benchmarks + +This directory holds the canonical reference benchmark for the linea-monorepo +prover. It pins the host class, request set, and runtime configuration so that +the GPU compression path can be reproduced and audited. + +## What this branch ships + +- **Compression (data-availability-v2)**: GPU-accelerated through `gpu/plonk2`. + The prover automatically uses the GPU whenever a CUDA device is reachable + (`gpu.HasDevice()`); no environment variable is needed. On the reference host + this brings the per-proof wall-clock time below 2 min 30 s. +- **Aggregation (PI / BW6 / BN254)**: the GPU pipes are wired in (gpu/plonk2 + for the three Plonk phases, gpu/vortex for the public-input wizard + commitments, gpu/quotient for the wizard quotient evaluation), but they are + **disabled by default** behind the master flag + `LINEA_PROVER_GPU_AGGREGATION=1`. Performance of the aggregation GPU path is + not yet at production target — leave the flag off in production today. +- **Controller**: when launched on a GPU host, `cmd/controller` only accepts + compression jobs. Execution / aggregation / invalidity files are ignored + even if the corresponding `Enable*` toggles are on, since the GPU host + would otherwise fall back to a slow CPU prover for those job types. + +## Reference host + +| Property | Value | +| --- | --- | +| Host class | AWS `g7e.8xlarge` | +| GPU | NVIDIA RTX PRO 6000 Blackwell Server Edition, 97887 MiB VRAM | +| NVIDIA driver | 590.48.01 | +| CPU | 32 vCPU, Intel Xeon Platinum 8559C | +| Memory | 249 GiB | +| Kernel | Linux 6.17.0-1013-aws, Ubuntu 24.04 | +| Go | `go1.26.0 linux/amd64` | +| Build command | `make GO_BUILD_TAGS=debug,cuda bin/prover` | +| Config | `reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml` | +| Data | `/home/ubuntu/provertestdata2/prover-compression` | + +## Compression — GPU reference (3-proof batch, 2026-05-08) + +The first three sorted compression requests under +`/home/ubuntu/provertestdata2/prover-compression/requests/` were each proved +in a fresh process (cold-cache for assets is paid once before the batch — the +first proof of a fresh boot is slower until the OS page-cache holds the +~46 GiB of canonical SRS). + +Command shape: + +```sh +GOMEMLIMIT=180GiB GOGC=75 \ + /usr/bin/time -v -o \ + bin/prover prove \ + --config reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml \ + --in \ + --out +``` + +| Run | Block range | Wall time | Setup load | Solver | GPU prover | Max RSS | CPU | +| --- | --- | ---: | ---: | ---: | ---: | ---: | ---: | +| 1 | `30388561-30389025` | 2:10.41 | 16.81s | 33.12s | 1:43.61 | 200.7 GiB | 285% | +| 2 | `30389026-30389504` | 2:10.21 | 16.86s | 33.11s | 1:43.31 | 200.7 GiB | 285% | +| 3 | `30389505-30390023` | 2:09.96 | 16.90s | 33.08s | 1:43.12 | 200.6 GiB | 286% | + +**Average wall time: 2:10.19** (`GPU prover total` average 1:43.35). + +Each run is a single `bin/prover prove` process. The GPU prover sub-phases +(from `gpu/plonk2/bls12377/prove.go` instrumentation) decompose roughly as: +solve 33 s ▸ trace-ready / init GPU instance 19 s ▸ MSM commit L,R,O 4 s ▸ +build Z 5 s ▸ iFFT+commit Z 3 s ▸ quotient GPU 25 s ▸ MSM commit h1,h2,h3 4 s +▸ eval+linearize+open Z 7 s ▸ MSM commit linPol 1.5 s ▸ batch opening 4 s. + +Raw artifacts: + +- `results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/` — + proof responses +- `results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/` — raw prover + logs and per-run `/usr/bin/time -v` output +- `results/2026-05-08-g7e-8xlarge-gpu-compression-final/env.txt` — host / + build environment captured at run time + +## Reproducing the compression run + +1. Build the cuda binary: + `make GO_BUILD_TAGS=debug,cuda bin/prover` +2. Make sure the canonical SRS and the 7.1.0 setup directory are populated + under `prover-assets/7.1.0/data-availability-v2/`. The setup load is the + first ~17 s of each proof. +3. Run any compression request from `provertestdata2`: + ```sh + GOMEMLIMIT=180GiB GOGC=75 bin/prover prove \ + --config reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml \ + --in \ + --out + ``` +4. The GPU is detected automatically. Expect the first compression after a + fresh boot to pay the SRS page-cache fault (~2 min extra); subsequent + runs should track the table above. + +## Aggregation — gated GPU path + +Set `LINEA_PROVER_GPU_AGGREGATION=1` to opt the aggregation pipeline into +GPU dispatch. With the flag set: + +- the PI / BW6 / BN254 PlonK phases use `gpu/plonk2` (per-curve packages); +- the public-input wizard's Vortex MiMC and ring-SIS commitments use + `gpu/vortex` (the keccak-vendored `gpu_mimc_cuda.go` path); +- the wizard quotient evaluation in `protocol/compiler/globalcs/quotient.go` + uses `gpu/quotient`. + +The flag is off by default. We are not benchmarking the aggregation GPU path +in this branch — those numbers belong to a follow-up PR once the path +matches CPU on production hosts. + +## Diagnostic env vars (advanced) + +These are tuning / debugging knobs and should not be set in normal operation: + +| Variable | Purpose | +| --- | --- | +| `LINEA_PROVER_GPU_DEVICE_ID` | Pin the prover process to a specific GPU index (default 0). | +| `GNARK_GPU_PLONK2_MSM_WINDOW_BITS` | Override the auto-selected Pippenger window size for the BLS12-377 MSMs (default 20 for n > 2^26). | +| `GNARK_GPU_PLONK2_LOG_MSM_PHASES` | Log per-phase MSM timings from each `MultiExp` call. | +| `LINEA_PROVER_GPU_VORTEX_RECOMMIT` | Force the wizard Vortex commit prover to recompute the host-side encoded matrix at open time instead of keeping a device snapshot. | +| `LINEA_PROVER_GPU_VORTEX_SNAPSHOT_BUDGET_GIB` | VRAM budget for the cross-round Vortex snapshot (default 48 GiB). | + +The keccak-vendored PI Vortex tuning knobs +(`LINEA_PROVER_GPU_PI_SIS_MIN_ROWS`, `LINEA_PROVER_GPU_PI_SIS_SPLIT_MIN_ROWS`, +etc.) are documented in +`circuits/pi-interconnection/keccak/prover/crypto/vortex/gpu_mimc_cuda.go`. +The master flag `LINEA_PROVER_GPU_AGGREGATION=1` automatically opts in to +PI MiMC, ring-SIS, and quotient-reevaluation when the operator does not +explicitly override the per-knob env vars. diff --git a/prover/reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml b/prover/reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml new file mode 100644 index 00000000000..c58e0a9e5de --- /dev/null +++ b/prover/reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml @@ -0,0 +1,138 @@ +environment = "mainnet" +version = "7.1.0" # TODO @gbotrel hunt all version definitions. +assets_dir = "./prover-assets" +log_level = 4 # TODO @gbotrel will be refactored with new logger. + +[controller] +retry_delays = [0, 1] +spot_instance_reclaim_time_seconds = 120 +termination_grace_period_seconds = 1800 + +[execution] +prover_mode = "limitless" +conflated_traces_dir = "/home/ubuntu/provertestdata2/conflated" +requests_root_dir = "/home/ubuntu/provertestdata2/prover-execution" +limitless_with_debug = false +ignore_compatibility_check = true +keep_traces_until_block = 0 +serialization = false + +[data_availability] +prover_mode = "full" +requests_root_dir = "/home/ubuntu/provertestdata2/prover-compression" +dict_paths = ["lib/compressor/compressor_dict.bin", "lib/compressor/dict/25-04-21.bin"] + +[invalidity] +prover_mode = "full" +requests_root_dir = "/home/ubuntu/provertestdata2/prover-invalidity" +max_rlp_byte_size = 4096 + +[aggregation] +prover_mode = "full" +requests_root_dir = "/home/ubuntu/provertestdata2/prover-aggregation" +num_proofs = [10, 20, 50, 100, 200, 400] +verifier_id = 1 +# allowed_inputs = ["execution", "execution-large", "execution-limitless", "data-availability-v2"] +# Conversion to is_allowed_circuit_id bitmask (MAINNET - no dummy circuits): +# execution-dummy (ID 0, bit 0) = 0 → DISALLOWED +# data-availability-dummy (ID 1, bit 1) = 0 → DISALLOWED +# execution (ID 2, bit 2) = 1 → ALLOWED +# execution-large (ID 3, bit 3) = 1 → ALLOWED +# execution-limitless (ID 4, bit 4) = 1 → ALLOWED +# data-availability-v2 (ID 5, bit 5) = 1 → ALLOWED +# To customize, edit and run: go test -v -run TestCalculateCustomBitmask ./circuits/ +# Binary: 0b111100 = 60 (decimal) +is_allowed_circuit_id = 60 + + +[public_input_interconnection] +max_nb_data_availability = 400 +max_nb_execution = 400 +max_nb_circuits = 400 +execution_max_nb_msg = 16 +l2_msg_merkle_depth = 5 +l2_msg_max_nb_merkle = 200 + +[layer2] +chain_id = 59144 +base_fee = 7 +coin_base = "0x8F81e2E3F8b46467523463835F965fFE476E1c9E" +message_service_contract = "0x508Ca82Df566dCD1B0DE8296e70a96332cD644ec" + +[traces_limits] +modules = [ + {module = "", limit = 131072, limit_large = 262144}, # default module limit + {module = "ADD", limit = 262144, limit_large = 524288}, + {module = "BIN", limit = 262144, limit_large = 524288}, + {module = "BLAKE_MODEXP_DATA", limit = 16384, limit_large = 32768}, + {module = "BLOCK_DATA", limit = 4096, limit_large = 8192}, + {module = "BLOCK_HASH", limit = 2048, limit_large = 4096}, + {module = "EC_DATA", limit = 65536, limit_large = 131072}, + {module = "EUC", limit = 65536, limit_large = 131072}, + {module = "EXP", limit = 65536, limit_large = 131072}, + {module = "EXT", limit = 524288, limit_large = 1048576}, + {module = "GAS", limit = 65536, limit_large = 131072}, + {module = "HUB", limit = 2097152, limit_large = 4194304}, + {module = "HUB×4", limit = 8388608, limit_large = 16777216}, + {module = "LOG_DATA", limit = 65536, limit_large = 131072}, + {module = "LOG_INFO", limit = 4096, limit_large = 8192}, + {module = "MMIO", limit = 2097152, limit_large = 4194304}, + {module = "MMIO×3", limit = 8388608, limit_large = 16777216}, + {module = "MMU", limit = 1048576, limit_large = 2097152}, + {module = "MOD", limit = 131072, limit_large = 262144}, + {module = "MUL", limit = 65536, limit_large = 131072}, + {module = "MXP", limit = 1048576, limit_large = 2097152}, + {module = "OOB", limit = 262144, limit_large = 524288}, + {module = "RLP_ADDR", limit = 4096, limit_large = 8192}, + {module = "RLP_TXN", limit = 131072, limit_large = 262144}, + {module = "RLP_TXN_RCPT", limit = 65536, limit_large = 131072}, + {module = "RLP_AUTH", limit = 16384, limit_large = 32768}, + {module = "ROM", limit = 8388608, limit_large = 8388608}, + {module = "ROM_LEX", limit = 1024, limit_large = 2048}, + {module = "SHAKIRA_DATA", limit = 65536, limit_large = 65536}, + {module = "SHF", limit = 262144, limit_large = 524288}, + {module = "STP", limit = 16384, limit_large = 32768}, + {module = "TRM", limit = 32768, limit_large = 65536}, + {module = "TXN_DATA", limit = 8192, limit_large = 16384}, + {module = "WCP", limit = 262144, limit_large = 524288}, + {module = "PRECOMPILE_ECRECOVER_EFFECTIVE_CALLS", limit = 128, limit_large = 256}, + {module = "PRECOMPILE_SHA2_BLOCKS", limit = 200, limit_large = 400}, + {module = "PRECOMPILE_RIPEMD_BLOCKS", limit = 0, limit_large = 0}, + {module = "PRECOMPILE_MODEXP_EFFECTIVE_CALLS", limit = 32, limit_large = 64}, + {module = "PRECOMPILE_MODEXP_EFFECTIVE_CALLS_4096", limit = 1, limit_large = 1}, + {module = "PRECOMPILE_ECADD_EFFECTIVE_CALLS", limit = 256, limit_large = 512}, + {module = "PRECOMPILE_ECMUL_EFFECTIVE_CALLS", limit = 40, limit_large = 80}, + {module = "PRECOMPILE_ECPAIRING_FINAL_EXPONENTIATIONS", limit = 16, limit_large = 32}, + {module = "PRECOMPILE_ECPAIRING_MILLER_LOOPS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_ECPAIRING_G2_MEMBERSHIP_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLAKE_EFFECTIVE_CALLS", limit = 0, limit_large = 0}, + {module = "PRECOMPILE_BLAKE_ROUNDS", limit = 0, limit_large = 0}, + {module = "BLOCK_KECCAK", limit = 8192, limit_large = 8192}, + {module = "BLOCK_L1_SIZE", limit = 1000000, limit_large = 1000000, is_not_scalable = true}, + {module = "BLOCK_L2_L1_LOGS", limit = 16, limit_large = 16, is_not_scalable = true}, + {module = "BLOCK_TRANSACTIONS", limit = 300, limit_large = 300}, + {module = "BIN_REFERENCE_TABLE", limit = 262144, limit_large = 262144, is_not_scalable = true}, + {module = "SHF_REFERENCE_TABLE", limit = 4096, limit_large = 4096, is_not_scalable = true}, + {module = "INSTRUCTION_DECODER", limit = 512, limit_large = 512, is_not_scalable = true}, + {module = "U20", limit = 4194304, limit_large = 8388608}, + {module = "U32", limit = 4194304, limit_large = 8388608}, + {module = "U36", limit = 4194304, limit_large = 8388608}, + {module = "U64", limit = 4194304, limit_large = 8388608}, + {module = "U128", limit = 4194304, limit_large = 8388608}, + {module = "SHOMEI_MERKLE_PROOFS", limit = 8192, limit_large = 16384}, + {module = "PRECOMPILE_BLS_G1_ADD_EFFECTIVE_CALLS", limit = 256, limit_large = 512}, + {module = "PRECOMPILE_BLS_G2_ADD_EFFECTIVE_CALLS", limit = 16, limit_large = 32}, + {module = "PRECOMPILE_BLS_G1_MSM_EFFECTIVE_CALLS", limit = 32, limit_large = 64}, + {module = "PRECOMPILE_BLS_G2_MSM_EFFECTIVE_CALLS", limit = 16, limit_large = 32}, + {module = "PRECOMPILE_BLS_PAIRING_CHECK_MILLER_LOOPS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_FINAL_EXPONENTIATIONS", limit = 16, limit_large = 32}, + {module = "PRECOMPILE_BLS_G1_MEMBERSHIP_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_G2_MEMBERSHIP_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_MAP_FP_TO_G1_EFFECTIVE_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_MAP_FP2_TO_G2_EFFECTIVE_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_C1_MEMBERSHIP_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_C2_MEMBERSHIP_CALLS", limit = 64, limit_large = 128}, + {module = "PRECOMPILE_BLS_POINT_EVALUATION_EFFECTIVE_CALLS", limit = 16, limit_large = 32}, + {module = "PRECOMPILE_POINT_EVALUATION_FAILURE_EFFECTIVE_CALLS", limit = 4, limit_large = 8}, + {module = "PRECOMPILE_P256_VERIFY_EFFECTIVE_CALLS", limit = 128, limit_large = 256}, +] diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30388561-30389025-response.json b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30388561-30389025-response.json new file mode 100644 index 00000000000..bf22acf1148 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30388561-30389025-response.json @@ -0,0 +1 @@ +{"eip4844Enabled":true,"dataHash":"0x013073c5f584ec10ad221035689b038577aa0a7b7ac8f0a6ea355c8571a27bd0","compressedData":"P/+DR4rvuKHEmsLIrjjTTOIiVEZYCKl5ZW7uf+DuIlo+qeABQAMakAGSMALeEA0gIA+boBNysAWvMBGTgAPOgABEpADA8AAtZADzDAAIRAAVdAA8WAAl7ABDKAA/ZAKaIQABAAACae3fP68nKYl4UUJY9oTOV1+c0g2vbv8ABMoOTZNhpqSdFrtCZRm9/gAN0uN4dPow25nlbR8XyVCpYzatHyPCrAvkEMP8EA09oFT5mJ/AYAJmwYGj6kofeexdBx18MbYmedghHpopnAG7NdAXIICOPdK4H+Vgyqr/MQMjpgIB/noEvF/g0BA/hMDy/isA/rKkuQ43wskAf9g2nyQze2h6JDXa/zwFmtgm2mf58Czm/8fAOGAnVZ8J4Zy+PNavQALcMmXv5EV290PlWgvHgwoP33G9z5MjX7BxTgYqMnwq9sm4MStZ+emfn3WjCz52KZjZM0J9ZyZaltKPngkQ9dHIMLzX0Lb9JAJyCwDq8Z6IRQmu9wKBJYj+uJ04IF811sRG6WxQzM+grRwQjJk9T+4OPAazmO6MOgaQo18/gICzbWwLq+oErRluMHb7kWLvk4EWO8ij0HHsDvPIzjHLliFXsoefzVyHvNq0+Ur4dD3ATNkWlwUU8/t8eYS1PPXzlHB+biCu4nrHe73xuosT7h9aEjy+z7PJkldy9GoExHhQ7/AQBhDOYdb+jpXVHVCZBynR1HEc0oVe8aov9xiUWZ+hR+PeV7xV9kzjwrKwjitoxLB4flHNF4umFu6rHxJwY2x4Ixx7pR5mx+5Vye65n9mQIZ9XmxSMPMCqQngFLoip+2bmbjljXcclVptGmKCi8MZPS+zWk+PEsEJ6NAd6KaCgDxLw0ZeV5gCudgYammumn0tTgNYjiNSwRTMfEbRSAl8aj3PFAj3Q3+AgMxYpdoPz0rNc4dksoVejJxAR0nZhRXKMP0ZsjzCib0YyYBvXTQXVcs1Ol02Lwby4qQ6Zyd2lZEE/wHoMwODHWfrNcfGTLC+ruZO0gN97eGvmM01VnMYEd5u3tpfSUdHExJNJNgiyu7Wh79oL0a/ZMEgRG5h1n3jI5xu0RaTrxeMIDLsBH+OHwRzEOVTNEeTzCC3czu/JUtLxTMjpXUkEEF1VbpjhZ64G0sAQbNmTAJRHF1h4GvINN1HSp0qQnTjdO2Gl/+ABDM4BYOjDWphw2j8gAQ9FGWJqKNtrezQwJzw3X1zBmOASsbwec9F0O5nstQIP8IBK+HpCASNHYkIBI0eUf4IB4nf8lAC1/+QgHrj/DIF7dcH8VhtP+PgBhH+IBCk7GAMxrc+2IrrxEJvWTdMLi8IA/47D6QFE4xLAAAFp7d9HYdEHbsWhzGsae0ViqxDAMh+KBtpVXax1+GopQx+X8VI0q1GKXsuuDNhCEpNuC1Ylug9er4AvkE/wQEELjNd/hQCRTB6EglGG3SSpArkMQDYpcDo40khRyqHn5IC5BGTeXguaAArV3ISVY93yZy/aJ1pWTF3qrC7+dJIBpTcYRnzx0AT/x0F+UP6L/HovM/4+D+dAOAAQAAAQH/RQUhl/GYqzAfx6NMgP+PgWbWAP89AL5IAwf58B9EQH/LQQcM7pvZ/BIbCBw+x7ZAB/PiAAA1H4ZhWxpfGSXzjaLeeuszLWLkVvTd8I2KlMoRVZCxYIosyjqJlFXeAFcfR8TAZHFxqn0wKSEJS4l8QiJnoWg9Fsl+cE/RdnDLb6bnnsPlSX5jb7T1PZJbJd1gQD2wZLAzgw3pdKOeKl4fTqffQILfCqnGQpkKapFXbUHymDEEZ1qy6bPeqjDMPt0K0g+v0N1bD2UdzsesQP/wAKddkHiSHkGZvsbbQ7JAYQCS518MDIPTRYHRdH0XJrsA01iB21hjt/wAKwXS7/D4G+czgm4v4plr1EeZYpI2PYUPBvGELq6X2hzyIQmd0E4jymXoXbGvUgdyYVzg4+aSOpaGgYdiFjU8egrW8So7SC2iIg5srYzYNMLWfOYB0QzwraAxI7qXifcRaWJ4Yzy1lUYX2LIsrB2awALo6mVUWN5URzJbV/agizkl8KjGIIDhO4Yvrqz0rQ71jZ+Z/fDIQpuXYZEuC8qBrdVq5s20TFcocZncET1rf0CWeQF2E7NtY4UhvfEtPbU+BBqX6VUuvk5r/Aomemw1rXK9RN7QWWYqsmAO8K8A+OjIcDtOydyeyC3gAkK/c/TV7882PGouFow1gxWEnM10yycYAXyC/4IB4TlU4hAJxzQf+AwAQOgwIapz/IAY+Z/DJTRC+urRzyAwPovTj/fwAPlKiEex5Kwv1DK55blxYoFvTr7W3+Og9K6kUv4fC7/8fBYgMUH+xG7KYYY2CqBfO+nzJb1Tc2g9KiEcdzEKCLQTYL9f8Nhp7T2ywnoaqdMNbx0RDZ6AKTwaOLAC2TciJDZnqXmJt/gIGDA/hTxDH+Hi2kg/lwqJAYX2/h0C/3Nr+HgV9TAn2HF+wlWyfclfB3yP1oXvMiu5DyA4w+xXPH/cTxy6l/j8F/0xgJhfzug5u+2uweuFBBCpryr7aswdjKgCdZ+9PClfsVMyELhc3fiHWm4F/cPJAJuPOSBAVTvBPY8/KJ1/1a1b84fMqf4ANfhd7c4q5Lu3Xxg4+gjIbfYOO9TXTVzh1FwhQXo4xa0X/1cGl1fxEFrgX8ZAK4H/DYN+09u57gACAAF/qwLQp/jMBXBv4iG8QL/MgbNuARhHvaLE5tUCYiJLIbdOE8pMJEn93Vu1loXD6J2TLGN6/xsArgP8QBF/8agReBcmNRhmi2k3Snv/n6o/gNysO7KtJUEhrE8Pxk+HfJHEJ/40B2oWnt31nAKU80YE4AkTIcBPrWbwgyXB1d2gEymhoSEgMwE78W5dYaHHHJhx+Hai47/e4Jat0lMDK6vzBAcdmmwgF9eED/AYAJHwTE4SnNL2PeuFhtpkffwqW8AWWaQYN4+kODD8U1kQs9QGAAArT27661+QyhYw7usA0oJWEzpASlaWiVGRgmt0IqtFU0SXc31XaF8CVImEWY1wMKPEgCAPc6z19Wt/goDMadhr+AwG2I/gQAEgmGolCfOF4qSvgedFwqXUGzX7ciDARtOoYOXmAhkxCAwP4TAHM/Bgc4PP/wQBaREFat3+qASWAw55oufuy//hUCF/wUHz6sDQfwSBDQYIOan+KA4iqAuQX/CAcRH/E6Mj/lAIPCP4KAqP8+BB0oe5t7G1srp/hYIOn/isop/w0GFbf4mBAgH+KwB/+EzWn+OAR/+HyvAwafQZiLew/jlLRfaYU9jRD/4MCaEeLC7GlCEVnwxkJZDeSSUi+GWT+HhkfEAAABJLMVNIk/j0CnAED+PSE8BgP4rBh/4SUGf4aPzw3xxr0mZD/KwGLN/GIA+FH38PD16B/lYPxw/hMSB/isAf/h8AX/4fCx1rqV3WVn7wlV8yHibwb+QojnZqR/isC//hEqD96n/h0qv7P4fM3wD/GwVU9/DZy2S/4uArIQST6YQDkOHCf8kAVkP8cquX8PISYD/jAHN+GHwvSQ6Q4mb+GzzgHTdC/x8ECsiH+KgJAT+BwQPq/oMEDav8ZA+Vb3cib7KIP/Dwb4t/xICB/wImYMHl/AA4jJYupJttr58aJxDfERy96G2XqDyfDdX5udGDsw1wS/4sCJpf8ECAIGsefwQD/8Z/AiZgvcW22izBmWowCC4NsA0e94aHWHkBcgcJ/hIA14f40ZCv4fEMALgAf49Rz/4LVIMB/xN2Xrla1MuN4xUAlU00AACXCAIABgcCBAEIAwn+NUbn+H0nD+HAFaJChv/hxmKiTkwnP49AH4K7/jwAfPF8c/jwAfP0To/h0AH4dQ/h8j5IqcxC+h4BBKTxUdxLlrYIP8BB37Ca2ri0YMuJCeSRyMHTb/1ZFtWB0SWuuv6gWbqbAv5tgiLTk1ryIqSm8Qs6nE/SZC2xTwl4OlRcq14Qmci9zeq/4CEO/4NM+44hTFalK+hRXzJE+r0lY69HXMS/Lbl+w0Omgb3egolC9km3C5V86ZKIQ+/h8mZe85rN9Xgbeu0okawqVTvC+XO5Y7KAc48lVuY4UNy6iHfoeLeA9RjI+/eBWVI04SCr/AQW5Q1rW967X3o820OFq4zjpRnhohfYnnxxMvZO8n7p2ViDkt45IuRoiq6bkPB76u5h0zkf3YyYwiEA4ikj6OD+cmUovHFj/AB6qoJln7vO+5XcIYfwKuQMm9gCxEBSaSDf5Dlz+aeJwfmmlX1PW+QCaq3ZhcvX5Hd5R/ABYQiQP9TwOTmCzRQlVTHyqm32nSfwOxYFbp/WzFh/mAPhiv4OdW0Ng4J6YdQ9aezivgKZq8NRDOGeF/jxyaSBktMD+BHohqSSQw1w8EzkNXGA30pa+wnoxjbGU/uHYyIo8YhuF6MJwypan5WPklCx+M8gXqSuZyTnZ/FP+B2LBbED/JAHoICf5QA9BP4YMVj9YAaA/MitqGRudf8D2nQT8bMSbJuEDnxP+fDnaTzXzPGNOP49Br/4hY0P4DYzgEB/kEF3/5SY0P5gXzv5fY0P4YKsDAIqzN9KZj+DlQCFV8rjf4KOtQVo/hgtwM9ND93hjyrj+DWPCj7Bh1/gpJ9/gADjbZ0/h9KgOqUhxcLOi+Va3cKFm/lDZ5ZO+KF7ElJfwBNTCL8U0mmysa3wT0u1FuEtgi1P0aC99NdMrBT0lCrCxLYxxXpFhZbDgU/AHPBZYTgGmgqhO3Yp0RuzIJyz+mEVQAERbkScexeaYtHwXYwTjYKR57V8GIICbUuLeTXgYdNfarJPaL19w8yaoi22we/Io4kg768oGQbqx45SdULzSDZlpbUPbDP1ChmfxxO8RgwfS+6G/Ej1qLQ8M7TMq5CSyVicJlMv1B/h9OAHe24WRe4rSufw9hLOneHGlak8sbNR7Z1XNZlp33DF3NQcIN/MsY59t3+btvzFCnXeiWzxDN9CosiTyMV6o4JvoC7m/At6a19LU48CVnfCLDk83GH5BjRBOcCcq2N8GD6JTCQfvwk3JVZ7pbvbnUwDfDGQ0rEymxou51Uy2m+/gAUN2QUTRKB1zU9kTB23jDLMT6jhCt4+Bm/8DPXCDkSNG0d7TBwz5Rps0gBp+P4oG0LxaFCUzZ9t0Bx1FZqEDyG8HKGbbA/xoBSrBuvtQgISAd7/AGACY0EtHkpzaBgIjq1FrVOTkdehmV6z0pkNhEOA3KBzpsbrFAtcsMidYVkJTvxJYlDZJ0n2OyxVPol7A6Mve0KCBiJfv/ArvU3ULA1EwO7uOKC1sKzC17MnilyBTKmCI5VyHexqbX769ml1Xg3uvB7PRlarrDbU+ug8ERN8v/AilbgP8EJfsjBgR4A/5OBhseGYk2XPElOLS48FujyCgEFjLOA/x0D8k4y6gfwK9LOkA9LQuLn4W8y0OPjLs5TKMzd1oaMRKoLjScCdByQ0YLf8Ks4wR/Aj0sqnMIBMf8AA1cXfwGACGsGEc0p/kAOHv/xoEW6sZcsoA+EcDxCQGyOeylEn2IPe7O93IYxN8rTVWM9maDzP8OOS0Di6/8fs4N/nYDJtael4DwSXWoTGJGv7Bb+LnEKZN46CoVOckvSDy8j5I4dwvVYr/49lEAJg/jyRwAYRkvxbbXGX6aRmCSMc1Pwqdp4/HiVdS/ArsYu1p9VIRYYb/h4Ff2P59BX3wqaYJRVLuNg28UxoJD6jXgce0dVQm7cBr/ThfanXLhYQv/5+fdiwsLUZIxupEsFKkPotPmTO8gl2BbSdsPSJFSOpLmYz648Saf8BBMzKWKdUGYFX1dpMZ/ABPJktGRMnoUvS5T5khzRIOy2FAVQ5xefaSH+lgjGcDjE8NE7cQS6C1w+6qb7Zch2uMGmgIVBE7vswMsCEald9gRa6O8Ryw9FBemD8+FqYboWBZYCEvbgIPRzhTRV+qyFwgWXXJynzoaMGiwAtPQLQFTPSXEMgEAWq2ngSan8SSWCC1P++NBnw592RTPP/2BobB9ySKGDzNEu4iQxr0v7RvAsl/oStoHMPb1/SgIoKiIHemWZwVxLBPuBkjPygcjMUCUIF7v71I5LXZmTvEBSk77hNfiYKFtv7prhm6uKDTGi5Uup6SUOh0vxS/hJhmDC3eYg+paYGPI7RVyhUXefhFKhF6IepZuCJtMTnAi60SHt2U4CzkPujt1cb2t/YhaaZnBq/JaodIAhyaFcvQi8c9yYDdIvMCUhCf3kjArRz2E/pMaKPDFHxzvJljuUweK+qfi4tsiyHdtHS7QXCrk7iDdld9bPfiBPQDaiahUe0GVc7t57Op0NyKWglD7fJRG+KjGJfJibfcb2kVbZe13BUUjVxVUeTSKLe7xRctphkTr+3kfgcmOowBDYMZoBJmTHJ1lyg7BsSG5xGfwrvVQAs0uJyWtDP7HN1B812WD4bnnD3aON1wTOEUsKX+D+p9X0lj4GVhUpUYnooQcPhmqq81QLAh9QE4AABnY5jC6PLFwmOE6sy+dGwYcTQct0Eh0o1XXOFcEa4OLLz/yx6ZC8GYL+ScJRF5JDkePoCcLxTX65C2YPhqPmKL56KX4DktN5snYHvsZQPAMrW2MASdoj0hRYx385vZJgKVH+AguHZXwaxStM2NEFDXyOWhIoXi48skV08TgEv/NO7qy9MLI42HtsajtX6AiFDgyB56F3SjIu0jKiKJvddE3omwPZP6OvVuNMQzYV84ZDx0qSBkX+Q9jRMW37hjpZGuCDAxPasvalZxg0QCo1QQxcJBentDulHiUkdCxfWl3Eo5zBiovcPqAZcxh0u9LMVynO/cPLflSDG6C7b/SX5y7LgMyBWDjGVh+4mqbnQgaBYzSpVsB6Ru1o0/K5Dxipwgy+SisMWm+IOhBfSVsTgkzBNvfZMOr1R7IvZIT4/ke6e56FgVRh7tY0MZat/woEm6dwhFwxSD9ibT1/8BAkKl69D6bD68CH/vxWW6zPFAIsAEX8OgPvYkoLM977vfvdR5InGiXgyGWgdYdzwfppzCWKq4JsdBrRpg9kcFDPgIfjnNxOF5komFsp9hiiKf/L4OJbJyE340hH1aun+9tTh18MEQB1K0P2Cd7hkxxFUsd823HPaLlDnsNOyLGA9F+GmB5F8tetLHEFN6306HOjC3pDDcgA9I3D8Jodsc18aUZA2oyLbtqOaUX8JCYNOczC8TB2/OHXPy8J2B/QQleZB5oMhDQLFwrUkO5LHGhlB4u6VwFQswfNDyVay3Adtx90sn6wXqD0HksYX2TIeJ+pDu+DMWqVdfKYq15wblfXPAWxW2S+eRk57ZMXgHKoL5wOGKvqugdTK+3NaO+5KJLC1MwgZTOp9+6CokHbXlXmnAz3N92Lk96a+FOewWaJfJRF4ld96idBYnMimAYcLlmcQfTSGUTL1jisu0e19e8bP9bKSj0P7yjTXjEoICkecaVBo5YRTFYZi3eN+c9SnpUT77xzdZZaA/uIfIkQJCNn7Al3Qzq0uhyRJHXv3xDJeJjwnQkrD/QkA4PAL0WB1Sp5A72DN+MJ/8U3JueUwQdzPkaspot3KvVnSSB8WDQdwXLNySboWSO8NPMiswiOMeCIqA+mvjXdhv/WVvRSJXot5UKahn6jZdeeWeZVrkxMzQk6rsycOUqOMHQX5w1/As7NADiV/wAefnZyykxFqhmitcBAmWbaIeSOcBiVvoulPj6MfBAwep4KbUYxt1kzJm95mNoPdvwWGrh/BCCLdf3sggtDo/wBByZtO+Iskcqdk/D1qGcuanLBN/HlHsFcDT/AjR7QuAO/WEIbTOXySB4T1nRgYU+9UFx+xJY6Oe2IKkEDXzh/8Jlt/+DAN2MBFIQDQQr+BAAQdgwPb3JTbOjkpIaSgfFXpY2RKJGF8h4woeCAuMR9UWjR/h6a05/4djhPjof4cXu81lACgfbm64pscGvuCJVkUwIckDL4Xgqz21pM25JbnqAKxSxnPYvOx6/TNcP8BCIKD+fy36oCYLVlX0ly/cQ0KNpqjK5P+PgswYb/gWokjT2/Sf8BCNPpuVOOSRsPdQTpjgPt6qvOq2VKSGGNsUryDvWx/ioMYPb4MEqN8/gV5kAbAUlGgwNRiONnD9od7yp6pXQt6ijtXzgLkAUQtUBLH/j6icwP4riXJDopbiB0hH0GNd7qJeHiLYlRCR/wwIGOavAl0ayEa9+t6XVr8fCmkO+D7n8GLlTh6m1JupS1sAPFNw3YPISM0J9/gID8t/iuLc/xsG8s/ws9Sc8POng88fwSxPyKWsMhwF5S+5+LAHfA6ShjoP0f+Agyix0hPLh+Mu+JoE9h1Y/hw05OoQCZTKE/gMAEi4MI5sDPjs05/v4vZ/mA05ZTfBVQUGuWUFRUgXE/4AVnBMQqqU/gCYRhwYcgDVI8i6QJuJdjZ/+HjpnZ/n06ZfwUOYLrBAmTGuxuGV0Dlh81MqH36cBsyPW9KlwWBXBZFKb+fzTJF96s+FMn/Mb8RYx45/BrLfcOWOViCZuHACPyXe32fErDeE7A/bcTLhvOd8QxPiQNsVr0daQAZEc3yhiq1MNwD5L9mNJj6mcmxM97Ojg2BiS1PH8pBms7JPyUw8auOKfHBHZcI+H5FMUhOyOkG9T+d9h+LS3sgGWqSUZqq7Zz2Q74E9KVcSG4iQGWhKW7oZlxmTLjEySywMU+axE/SptoDjQEfm2/wEHjCYctmZOq4kB0jTo/tSUVuXrhhldei58JOqfsWYEbS2cB1s0cB4ddIpGlswU8z/q722b5pMlLHURV43HbAuhOkjeYhyqJVgwKW3D9lkxLinobp/Lt5KwvRC2NXCUglZJd4d9EjvSRqhgQwJ7q2iumy8KEWYA2/mrEfj6c3AosPYWaKTT0fcc2SDdePvTPpG106hUc8aJ/iMzjnbR8W5c080BmELjhaaMfEhUMoB1ymCeW2xwYxugSxbw5KyusYujQiMVsxM7zRGiexnxuH5FSF4OG3HymvhfPPh4n645gAatzG1sAnDKDd2IMST72b6xTZmhv1ZIHhA7zYX4KaBodJmssaZkTeSM3BJLgpRT9ApcwMUlgMn59ShD6xS3QXY1nIfw13KLmKbyarozYLF4BahVOzy62/4CAl82norXOvX5JCylfL6Orx/YHzq+NlEIFka3CDH1TBAB6ePEc/lFbbxlLPsXBp1OwD/2LuXguE/sy9eX98g2/AXBN5FhclVurUpJju+qPmAbg9mybad5i7KwIDWF4L1KPc+muXHkx6UxqiAV7MNJxme6vgXxJZZQUymg22sNAk0qn3Ob8bnU2QoRCGDzvLZTnx2dC92Y7QzsGMszD5CTDzhWRSPyrXtgxjmhI4k8016NVfsCsdPGceyYAbgNOJxwBd4g/U20x5lsuX1G7I3gmpP5okc/WnFWG8GBhGPp4xozavMYix0SOfPCaDJehwlKxrVJnyf2UUQ8B85iCnjIXxVxlw1kvuyoDemXYII5gYAi3AfwcVJwLVwc/3PpIL6uNCypFM+K0VgZ0uyVEaKIcbydOulsg69k1y8N+2n7LQYfBGQT1HXhgqfBAwtC00nCyAhp/mU80RbQqnHKAKgCWDPzl1MrObdieViuaMKhRZELiK+RNuQcL+AAqcut+E2uK4yJ1KyTQGJkX35wMiWucm56q+QSZZqM9yZNc02eNjwEMDJ0PS/qQ0vucpiZ20sAIgiGLxHAYmBu2zCgmJL/6R3FJoAfT1DUywHlVsV2rmsF1Qi8YwrHyXuMKtAmRklUJGNA0UGGQBsqeHhtrU0z6YK2EmY7o3+AAyJzy8LmFDgHhl3/GGkmsiTWeXeXdfT1L3FSxARpY5WnJwQ74NB0nCGl7ZsRh0FxLBy64hVqdWc/SgahZ8G3wurBdv61M/yUBXCooaoRgNEjCorfuCqbSL/BByBPWD6o4xFA1IBIKwEKO9SroBWYXt7c6tO4I7NX3YdUay7raaZ62B7w32M6vTCt8tycNkY/wEJOhR93g8FMPNijfMZeSjxACWNRXMJ9PSGMbf8ANrqkIVS5/ADbEQuOXYtt4GyU+fz09HigtSEhAdU1n0j6v5FDiquu/QSAdl9iYH/vzi5gDv9wUEzKHwzaO/z/AAgHHy9GbccpFK/2gDQKIP2eEmm+tTHVNtf0FDDQhO0yHt8gEdbK5Jp7FXmoow7f9DYHxf8ACGv18wUosTkRCcO0kQ20KTsYKBXZXdtz3xbehw4so97NDh69RwmqwMOcA+GZKpzlsc8nh3bhFeYbT5zp7lQgvGLQFgp8NRUs6M0nThP3KQZPV7ArlDgKzR5w/qcLJDPD+1u2caglG9GcPknfLQqa1wK9Xy9Az+zUNRpHPDskOrKrt8vzszDLfHUMVUKUV0gsXODwlDb51Qw+pbK8y2f7SugbUVQWNsoYuR/ArffKwhShm1Fh4BXyngss3tABBjRyQE8ul8EP/Jp5+vKCEMVyGYn43R4ImQjpBf9G/03546H0/v8dsBBYLCAVUz08GBu3j/HJdAB738PF0FT/jgPtp68IA0A6KUEXRuOdqMgjkukcBYHEiQZgdU0QSJTLx+L2YU36BMMs547/gAXGxIxDEKSWnukLyb7xK9HJIZ1QgecbXxg5dT+Hytj/BQmQTpNkAAdkO8bykrmC5NZuvPi3w8M4F8gPPggMgnDJwnhAJX42j+AwARxgwJjWpRtesXSMmbJ/gAYO6FkY7h3AFv+AHddt5TGngLkBpEVe5Kr+PtCSA/jzpziTcx/5e6vv4cMJwGtP3b+XNSQk3P4/M5wDAAARp7d+Gxua+mPOKXu+bsur6zehtpMUTMBmsr1fiB4hmlmMML0n+S3bhvxs7Tusx7iwmNBXgIU4KAIAvj+BMuNiTYSPDQYAIUBQd12AIMBOICUF4ngBDYjKC1dzH8hPXA8bYuvuME/hJnMoif+AIAFQCWGoZ5fG6DWUNOZx02lF/xYD8ywex6p47QqPoAABgfw2BEJ38bARDc6NNJfwDEEd7Baoqw0ESm6tnyl38LUqilmBEcQ5i4AAH+KgliX1/wQEel9GK/wYBECLhKX0kJhncuWawKS63eE1GZbENcMZxDyJpvAihQABAfw4Ala38HgJQ4JsvDBFd2sR9cuPD+BsOiQV3iZkO9ZPpaXRAAQH8C8wU97qWQWcHaTP8eUvyrozkCQEM+EuhmZ7t9IIX5g/Gizaj/Doe8Uf1SHvZH8/B7z5/LwHvZH8gh738B9ydFsjDNNtdH1geXbw7otfV0CW6Wo/NhJ4JprqvLeH8AF+TzcTDWc2ZDzy3XloKw0sDXteKa4SO8Hf8D9yY2lfzT3JqP9/9yf9r9yYj7bqCGuPqU/w8pcrr/P/cmlMKqLFktNIcWYuxr4FNgJFkHl62Pnba5iyRQdoeOPbbw4Hv1Xg/DeQk8NaVIDde4zPfDaiwj1pqQ8gKnl24wJYbfiNF/gyi40dkLx16x2R0enwUkb7ldrqxUscuYyxAqM3dwss/GpLy9Uk4pKZHElUM/buNEgm2O2AFBf6wSbXkcWY/LEdBr2douF5Gf1fd1ZvBNoupHkjqrH0FwO7TBimTN3oIenJz1sm97ewhETcnRBKKo0sRZno33Mz1bkb0xwV/D+zG95bkNpgwivGXsVxLWdOVkkBIzYfrC4J9Wx1FyHhb6zOtEuVEWdXKr3A1+gQj/mzixfKajtSHm36qHPq3jWRDxo+1nHJq+R0I79faTomsR3CyUn3SYkmmkC4o1n8OcaWEgQmXCgr4nnx81EnjhDghDnjiUfawTrf4X+80GgOd2TroYnd9kKle8ss1oljrROVDvSAKa0cBpeprsDVmjoXiLNQ9wHASMmyZVQ9xsMbb3uh8mBWMI0Zjr5Tdy/Jb6Rfafd/AvcnTB4+NbQ8cD+rddKEs9I9nxAW/wBQl28eqhFJ/gII/YQFed9RCNB2YgSNiruJ6SocMbDY9MvQ/mZpRAP8HBWtq3Q/xLPZEn8hT2QFoQ2qlIkPrlDuuu39a6ZIekeajDT0ezThfFG0xRREgKh/H09l/A8HXgaB/Hk9j/EAjf/f89kAAP8Wz2T4jcX+FAmzUc1cGjf4f6E1tASnAGhYneKd1IgjxecIMOpgY/gcJd9Pq14DurV38TTue3VccaFdbS0HyIcb9ZxnSJ/C2KLju5y6rj0f46MhLD/0G5Q3HgRmIPLRQ74MOStFy6gFQxyfgzRgEO8+gCEQTDc7nE9DOJGz/jUe3aKl1cf39YHESPOiD+48hgsYI96Jaf+QIlJ6tE1a51lunTd9Ieo9u/KWLq0NMA8FsTCDF8BZeSNdA1laTJ5ymarTMCkpwc8Eao3HNE6dBTaADWq//h8XPBijMUTKXZqqR0Y/ooSm/zXpV97aEjQ3p0hEeohAVQzVTWV/47Y9sRhvnwFtnqCy5GQ4XdAN8wydgHThTIgtbElUV6N242o+Fe4PivA1Q2YiM7nZGdPtJybnxj1w5Nb796KU6gAIuFLJ/pjtt/TWdijdtPY9QV0YwTaeLJU3vK0Ys2zsOQz6v2dttadmCRSlYscqL4+2DSXvHGqG+DnuXNRieC4B78AejI+CagGkXHvnwrk9TBW4PF4ZfdZNx2oozjZ/gbpzqpf/AQi87KQK59p7o1wz/DkvIKEEX3WxvMAEo/CkXM61va5l/Dky4d/ySyRf38vhfwwTLjNGnijHTEmKWA8vv8OZW047VGNQ6SmjngvOyWIyPwKXET/D095tfz7PeQLFzhTbvyJpESPVwp7E4+pxiXYdl2m2FX47RYmspodqfIH/P7JES1MtbMtYKR2S3ge2UhGNliQFdiwWclWei1sBqbZyyL6P+YtnCccgZVGoP5p5OZc6Szuc5qJg/3Q/MX/AGCbyAGB3DgXGNLExg6HjkMscOIgFI7oKQGeMgD07HCPhLpcbOra5gpuvc9iGGyMzSQH47TcfrqE93LBqciSf52hvo8IDKgZAVc8lukvdY2GDOBCdovI9P6xVG134alFS5WrriLd8sK5X+J+1pP9S6qsefdIT7HPpiwu36Fu7fWsZQ1L/ADJ0CYvnFDkfSc9g3CWA+QdG1ffBkjo7WfCqkgcFvgprX5WWWupOEaw5mlSxCCF7EyFHKF2M46exVGnvJpWsT4y9oyvs5YsDEzT4CbIJfNw064ncrePDKVDFzIwvemd+jJMRepTJwJD/F6qD4xAQr5Z3Ka1MfKyBPhcyRRmC9c/TMG1pZ+DX+kY33NsqDgKdx77NuyDeBE/nBTFKHrn/gIQG0sNQRDapQuPMdOgh7JHEekWHxd9Gih0nHZ1NnyyRN59hkTnyirhXyDIIdSjGLkb844d9w6bm4WQstZDPuMl/gHjmd6suDbvmzjP5A5v7tyIR9BgMLw2AeWpRykYDdAJgRv91q+upL3pG/dAydST1Yr3eIvCEBshuNIrVndRnylFRiLE2q2wBOfpqxxiUFodz2xZ34/JkXJSpEiuiF3kPVr1Zm5/4Bl0vzggwMaaEsL5/FWzGwyK7YG321iOiStWtI5E7hBhl2AX8ZT0CqSsLVlVKUmEqL+Qgh3BRfIirlbqQWujdvpo9y3TCYQ5ryIvc3J0cC6wkP8BB6ITxkUoMdBU+dMVNKs6DKLMWJyXP2IVihPSp0vof4LELrWv4TYMsn4whw1PLSlepNm8p4JpTky6hIV1YUf6hwLaEB3xercx4D2+sbmygztNu2Dbl1wkM7ODksVdt/YUKHVo6NynmSQG4jWqMBp4WffaoP9J8JgWpvAAlCPpKguHM864kZtPvsEM86EOoDh6wbeEDM8W9xeosgCDURWBe5ZqpZD9PczxhwtgrZbgDsgPZFiTxpukAoFiAaemQgzqxbvr+jbXRIWMu4D6ELJ8JdkEIL0j/JSlapzic/lq0+PIE7iq9HSE8SNmK8ZMd/4ewFb0I5sMNEPw32hoEWxIticBmC3vLzGXwW/He79oa250P0SCpsT6WSfVtljajBmMzREDR6k3fb2Gda81O/V7jW+koKakx/L6m/1dCGzkSTcWJ49JwStSuj9IpsPyB45lyHpIxBqOrq2XSokCQuL8x038vGzsKu4LF80AqLEV/YI7G0ioCZNm53y0a8l7Nu0xmprZOEuyQWRh0IKxRebuiafwBJ9Ed4+6HUWpZakoiQUY4H1uC3hnXkEhFF+VSL4B4c6wD4ExUFRo/zG2yAiuUsi64JwksSiLZB1VkE0asISYcWS4TPAvF67kd+crxkUUMAIqcbL3b+LngK5vbsE9um/77MDJez7tTBa9qP4ySISBaSpGnjE/INwI1UXtWzY3uZ/gCE2E8MmAr4tBaiol/AlvRqCql6JWcfQaWjaRP3ksu7DlyZsWq0Nol/eR/51Al8kRu/xRXyJCdnZBKnwQaa1bBJBThStvjsr7PEz2GcqFYWh3Yr8E/ajVXDClT5tNHHIoObYCQvshZ00nsT3svxs7TiWt+EX2PsL3pgkss/SfApiNBNPDjtAFnJwUswYUySt+gN1DYSEUrsTl68YrwtYCTPnba1M1wj7bbWYDPUBPS1wimxwohKVFawJ3pGOmUCaIKxmWdZsigJLF/0WIIBtsBmR9zNyFRT6IZQYb4wOXc+zJopA5sbfJWDP/AQnZf8KR8B89aOcb+66f1Z20dUuPJJ+7eH8+quKoAuti7FXx10eP1JJcQ+4G5WQ4Bp5wBi3kPwTEouM3wlVnH95F7exciXQGKI5/Au1FdYFCgsQZdRiWdrWq+qBv0t3acwYQpCIERa0tjnwGoP+AhSbck+Rw4J1l5TWpR9lMZnJNOnrwLSn/CAJ6nH/3ttRcM+nQ80v0FAEsnkBnRippCNlQrk/x4w/LSpRfwLtRZXZ4U4mzaWSM8/XLBurpZT+wJZJ5bkCzMlan2zbCdUmDwO4lrpGIDARAhQGCuo8a+Hdve4aXyBh0Ebkn8CylpLP6cp+lP1ukYPbodZEVGOq845s+vnwHPAXIFyf4CEvM9Bj/j71N/wwBtb9OsdsIsm5QBLKrhgNyR6X7KUaJX/xXJ2p8jNl7QDW6WAEBn3Y+e6PXi14p/BgC+HaA7MHrLQ6xJoyhdhNAytMO6hWZ/hgS9c5dIDMmjwU8votlq70A22b6bTm/4MxTeJVYOnPKgRzJzVNGrcjPA2w2H+DIhkJZm6vZQ3FLOM6EsbzS3XiZfSOf4MaRakoS6FzsQZa60YBNREXaSK/cq5tP4rmDWZn8Re5NvF6pi0P0yKK0NsEaYXm/gyijt8wagVbR+PbSzzQHg+MPo9L+/8MB/U9JtU8qRFsS0xOWRJzfsvY0KP14fn+GBBLSY6vxcbi43dhWl54yz+HODWdU/wwGZsLuOgAeBU6EdbqFiHatgX0+rGpv+GAz8tzBF/hCHx8XGZRsxmTPYCCwwYOFT+DJxrNVuBw4lkT1jGHYhIYJ8hkdoc/wwIY9ClBMd7ZgMB9WhOlF6tMtTDow0n+EgHGkx4anW7ryf6n7R2s9ukLGUl2ebP4MAvzxyb/oJegT1WqJkJ4+NZ+kp1FF/gx0Un590tprOK0A436n3w+gkHp6bZ3+DJHn6REI26QWt3TqvPk3MGk1kH4CPs/8FCGwT/igHCjgwSrff8FAjwvxN/wrQo3GJ/IVCh/FjmR/D4Rhfxi5kDnR/BM+hpCsGvRQ554RIOIG+ecoAt9C9wVAkPz9oofoga9QeevQx/wrqFBX+CgTtE8hAJRvtP+AwATYoMHAMT+FtQoK/wgIMs/x0IrBX8PapS6cTR5Tze+7O9Bj+FBP0DaGjm5S+Zbp8MfNv5RENXGLOl/D2qVofz7qlDKqyBJG8IDwA4Rm1gZQG3438XLE9GZPDWq/OZOJJpVwLVy/w8B/rw3wUrsSfefyKnV294e/okUKxLaNdiBVqhzfhPg1sAETX8PAf+j/PoH+z2m087stvWZyctR3rqHh1AVIoU0wB0xSa5eWMrL6KSn5DH8PA/6+10av5jzwqPCINC607LFfOO65PQOSLbi/NaKKhQNsgQj/DwP/pfz6D/jWSry0bjd8L/V7k1mu9BURmGzVubm5fdNuz1ApvN9dvHGlfw8F/sAEYrtPcaI5X7MG0wpserK+2kY+g/XVwZ6Ive5qTHDvx/8PBf+p/PoX+3lqtvdRmB0VpQh3AcJsGOjD6Cplz+lx2BvufWgUJ1uhmV/8PB/7DsU6/ZDEDW5QRCEO1R60cpXwL8lLpLgRb/lFvJXiwWBN/Dwf/q/z6H/skUK+iTTGuGT2xnD4vcaNsvz01Ysjsl/4CDJp/h8tBGRdU2UfT+HhP9is+FueywFpt9YDTU3EtCgHLcmBFcRMzOK3J1Tkwoi9qg/h4T/1v4/E/3aJicy/1/Yd97Kkst60jzfnQnmy5YsPrTE8twKEKAdMf4dE/3SDsSDstDfRL9BSN8fhF5b1F2uUQNrzDnElnkwd+/G6tW4/v/tK/v/rqP7/usP7/Wjf4F/utU94t/Ly3iO8bN3zW4m0v4AuomxQx03CHSwXkppmYFIJ1Od9EUAbhfsISpshFP/AQZ4WnpAq7sZhQZn/A6EaUu/97bdRwN7t2vw/8AAGS3Lza9b8EDo9WIvxva/x5SGLLbJ/wLw9bPdQqnmhryU+mSCIWz1E5tS5WmwBUkdLBB77oLvN3dS/XHZKp3exfqjH72uLfWzPRTpVNf/v+DglbMjooQCf2eA/gQAE/wQFElN/HFpQ2KgfDA5bptb77BeDaegpRQFyBshlf4yb/roE3kACD+Hwdf/j6KNA/49CDwH+PavwBhv4fCj2xsCWD5g7xzlHOKToIQIGvIu7ABRkIwAX+GdYgBa8xB6QAAC7kC/J4Wg0OIZ1iLnnOAzRS5k3+DNVz+HwB//JwlCw5Vfci2s8aJXwU3KQEg4jolf5N2oI+mJYB8V4TW1tId1h6B91/sGCTMg9R6fGsWziXPVFxcx7halocTD52FFZGhxtL4/meGNTYHhXXCoRQ+hpD0YmfBCx6GImPUxVCNUIiMzmLAkqHu5027ofkFAOyNSRMoiEgbZYGl5Bh5nWvqP1yOZ+UD4xdHxUO67rFpfmwgCORknHi3UryH6U0B38bcSLzFD+zkfdwDGTGf4yG1T5ghbctZLi1pRMk++F153kTkza+LMHX8AYakiZ4r1Rnqy2aY2GP+AhO3+03XRFJPeEK5QtCHcNsEyXuNJCGmF90tzlxzp3+drvtlchJBsXf8BAjBbKCcPKcLZWKDdlqjKyzZ1JOKH7a5J8jsbOJe2SMddwiOzVQ0w4m2JJYL2E5Z2a6nH7X+Agqm+2Yo0xGI/7bzRROarDw8B/S5HD8O+nL0ZME2EzKPQReBrjbOs1owwT8nNTL//AQFRYDAG2WJ//AviVxpoBPgYfeSnIKDMIgc6KAr121C4sDeCrvgpItt7ljCAcfpnt94fizdnuwpMTfwCJB4nqoEg1zv8D0DqPO/7SBLAAN7H6I76yIkAWatYdRt10oPgoqIK8EDISFRTlCTXFDpHr/XwJYAAgICA//+BLAP9zAlgE/ioErAkxeK+/4KAPNP9/hgA//yEHjDADx6wkrffw/FRva3sYiB/vrqtGb83AyuWIXI5haDaiS+XZXa1YaGgmOP1IzLotLTQbaGgz1An+Aggv8SJcsIc7xZ7fcQGP4BD0Dp1mBOBrILEZkAPnu8L8rTRI0VOBphSrd9v562oYU1hhGAoGdaiY7hKtEfNW636vD0+CjFflbIB5MEL1UPs2UIvIPkYUBc/QV4OdbV4MQATjKmseoIWma33k3Q/muOgU5HGdUyTISOD9V2K5Y9w1vSGIemP4tpqlS3TNsUCZArg4/SjBrhMv4fjjWIVOazXJu0/HRQOq8hzFpDoBlmRqIAUfoF8yc985kL1UllNYbZwbfzggiZnvbA5w2TQkQvEM793YVvk0FN0o0rknbdI1OqWsuKMckmaScFL8njU7yeyyofQgAgFeYqQKnwLBDsiaJ0yjYj3piEqwhowIYHZfcCrgMU3J1RzgoHrAyddEBcZ1m5UnMCwbGwx693kqBxcA4Pu7SB+2SXuvWm4Wff7HYpuTvd2rxn0xJ7zsWEGDQhxiJ4ZfXueQOqryD5cAABmnt3+cspYbdECeWaEygLxdtF9ex3y00s4PzU3ZZ+g+j69vyGPjEyFDgVur6CC6b184bxurgp4dzH/BQqzTAy38bYIX8J4j/8K1ZYYWBo98TgiQgAAYBFnoTEmJVECdiR0ZjKOxjTp9luQ/wNiqAlP/MABEUt5k6JBcjVJ/D2Kon/xtiqXDlG2ZKUIt5FHuBn98jghaHc75/CyMeAkzwSWgPo8H8PY7qh/G2O6TUG77B7g1shO6s/TkkZAgAX3lf+Kg/M0I6Do+j0gD71gAAMBSOI21h39WG7rzvkauuuXbe43kgH5BI6CBT3/BQXTKmJaASk97I0Yvm9y7cr6pAqY0mTSjzTwm/wGN0iJRK/4YCkAArMGjEsoiizR+LNhO48z73zuytxP5euJ/4f15Y0BLhO+BpNEpHLqV27C+VCHhnsxD0bSQ6SAsXgbODSQt0O/gDDR/8HCcVoAu0UKq5jPyx2bLVQbJFSySQvcuZznVsGAJqE7Wo2lJxoYMhwwmzKYMLIMBZ1eDW+rbBfkpySDxgBM9DHKP8PGWjT27/N/mYFfKDozjHbSOXkRhnSTZ2t/EfIqNNuCBbKyXNgx6rAiRMX+G9fKfwD/+P7mYE/j6/nAD+PS/kC/h+8mQRTjaeErsQTQsxuCxWST/gA44LPgJqJcZT3reZs9nQBZ5rAPOw7poRYnBQv1yTKRFygwo8tmQaaIK7trJ8pcDOFn0PYAf49viEEMYeKVNF5GjxwVcdEj4AdNDh0SClmF36JF3FVAVqBcY5oAAhrowGxXlaLjWFKz6nfTFPdSCjF+vuckTDRYP9WiP+PwH+N4HJP/wEJk0vbie1NfzFABT00LI3boQCF4ejTbh9Q1AxYioyp+nXvTurggmoHKBVJpT/oeKGCwpaI/wB9/CXb6R9qPwiZ/H2BDoIbgCR3mFWUFyBZ1Xnh4prLPCZ5Xe9E1KTot0v+AgEe2X5mtXW8LgxZrPKxHrItVU5hxzJKZCKTM4LHeeWpDh8jB4qz/yQLNw9xx856xv+NGHQvv8yEBLEuyf7U38gGnQQaAfw+JSAZJ/UglIRvF3OrXqZ2tmRZMhy11ZpWtIaloF3g3+59SazwNQmR//AwlIC/wIGpJLXRBJIKkrECQy7u1aXRaR+39LswBhqR0NTB1vm0m0asxmYmZkyGTCxP81CUg7AvQBto5p/gIRAgChLNxzhZ1g4CqUNMYOiIP06sK2M/hy8cKttQAP5XLTv9GAjgaCgfrp4nqXXUKrfMgQIxHgFSorGpp0qg/XeZ2xUdaTyI+4rV6lnEQa8LzGUC5hV8kLDUWvzWNWSu/vSPzFF+LkHp/H4bJYyrItOZxApQ98VN8SixmUi5tseWbqlBsSJlLuxyuA4QrTenNCu+A7mPsGuzg+VCOqv5G41Bqtm3c46w+IK6kVQA/5IFqRvOA+VHt2IIXkxR3HUrbM+99xImG9Tu0/gAHTQfweE130AoX712algf+HLeA9hAJlNjb+AwAQ9gwjmPP47t4P7+jID+YLeBn/IPA3IzTXiAg9eYr1j6W50hLHMxNqVmxE5CXfHDAyb4/h6BS2/59gUoPHb2yMRn6XpCYLtPrJ5Fr8N0sqVBpC0W1yt4GDuoqZqxs/n+3gG7QIDA0jqyW5tXy5/yMNQkiBau3GJTGkxNolfxTI73LE++qrrKFyRpN0TCRD1iWC+Dgn9OJdrai72Rijy6/gBcAF2kuxg1QRJRvEAuNM8HpbiLH3LXUVpsbTq1UIh/1s62BqIvWweBPD55xZjBSv8aDP3FZB71slGaxFZiA8YAjTvm0MNORhH/SQlAy9S18rxaUhxiUfjZHjUWlurYl3bKVbwYQHtLiUFNoE8KkDne2eqNMk4xCHyjX1p24LsoADx6gRGEfiVJhI9EB/3cuszA1xNFD8plMc6jB6xcD1tHPB9BXw4kxKNpwLePF55+Lp2ZnMWADckGA90jfPO11EcEXMzIogO2kywCBM2q1C7PHJd6NHsC/Y5RYw7mpwIWsZFVFUI3cBzXoA9hQiGkyHgj7P+D9ql4HmB6aTDENk47mDihPwmQAPgDYfeHT7NQZFXFHYa/e7/2m+IAVZZOPnPtjv4AGZuLhpytdA2LrPLq7jvCvldXad+/k9PpWAmdjdFfUcqr96OWPiYXQKbA/L7b+uCf0ntz4YF1GXpv5zYARas7rXe6LAnjCkAKCCtwbgu3ZgGNpCxooc5leP9eqweZaw+Y4L9sIZ8AAIF+jMlJkbf4BZhdyywgFUkDKQpGuhvdy1goIgaYfF8EiAqM4P0nrIQhdhxVOpQGFo/esSrzxGOCp+7V/yFkgHqKBrQ7M08m2/co+COupkNbFE/AGlfuY9DMZcpsjfeH6IXJ/4AY5MfgZtszTw/FCISoPEBJs8EM2kOtWhlhcYbsRnCj8MW68I+QZmnjZmwOO0pr3hvp1/uOC4bxGopbDXkI/+AB2c2lRpBgjuU13xGZhXZVy1Xk00dpHPR9UiHNm0TRYgQxY7uRFeUChfywtWtBSr6xCTdmhMrQEBxBG9RMR37fsJSvXYQHmOpQyujCspembdU+5jMLqkUCJllKs5ICeZgg43Kn0w0jl+g7KHkWdCHQbREfeOefnrarZpVNqQ9KIk/Dv8BAinI2KajF9LT3wyBe9LvUKfDqOfj9mx+Yx/vyr/wUc/wwqZkAJHlOX9GDypgoXbeYzbw8uP4MJLbJPVlT4/hWf1Ng0oM/CpfvVyUBLYQEC/pt5XYw4Kw1o4sKPcv8eZKY8UDlyTFt2TzIsmLmnaetk/zQX3ojcCSxAwwDpb/4bzfo4M9mWzt3sFKWipfznyk5J8zDpR9CvdQMNIxAYVJSU8DVYI4Mmxs8sfD78RaAR35H4l9ZtEaBVASX8k6UcM08YMrwOsT8nf3lH8ANNBlnNk1CV6oEXW9viXeqC682XNEVgBwheHMwZD8AHqhyWrFv0HOxtHi5ZiLLpQKP8ANS+Z3BXkAnehAO1D0B6oYqeOM2Sgf5Kw8kbsK8pUkGh28tgdA6p1t5XGgS5nCYmOBP5Py9Jc4h4wMfzABIu8bEsczyyMOl1QgcOTYCgCJlaSxeXqCe0vvpKZzOH3Bv/gIGM8Pf8BAiJpqRTw4iDFRmBR/qEasDCgosh4pIjFLPdU4900TOxya6grX0whEmsyyBWIcSvEIXn2bHy8ZWy9mfTsEZ8xFAiMHOQDizssdrtYYnO18c4XpDT/AATGvYZykgjnf4CBHx9oBIikmcKxh+fb6fHB5tguWrHgJBLhi3prnLvqT3mAKGHeURy5qwPBAvJKggIQdXHTFTQISlgLjvVCV1hIHOfbc2tNr574eAWOzRgd39fBUdfyeEbRWCuTSqZ6WCCZJqapboXeU3CC9aJgH7N1G920L+AAoGN1l0Cg9iHxUZQ1M1NK+ex3jvNgv68k5xNW9zsWtaTvyh4ZYHHgL0YjYZRVPsF3zi296MU5sNLU+fxT53w40IEc27pQFaw7P4QtJdpNq47ACtt/RobAoO7lLISjI+Gsg5KtMw7IJP80+LTbFi/E3BETxQM1/tvbp/duG16ZDJTyqiK5nn/wUCH3+I134LhwbmOc/wA4pS84lobQCLRq2N3YK+4pneiMyz9YPkMaulhQA7mFHBO2SokyKbgrZ25EJAPAXyBf4KCQf5X/Bgdm+lUQBBpLKiykoQi/YCLFRVAQzT0pakADeofxE4MBcgQIdTX0j/68CBfv4/MY/4MO5NkD1UdNJ8ZlxC4n+GiCa11SZ61/l1xi/g5AJ79ge7DIuagiLRhC7D2Zx1MsMeNI/msB//xILkoWf8VgP9qyoVzjBFh/4CFnVxDZIMirD8qrzReYwA2EB8Hr54SLhZD3uM8Cd6t70cOv/8HAqe+lCAaBPcH8BgAnTBgfYp/50BU8eP/h02+/Ab/HwJKsuDSxFHY7FEC3w+Knmwoi4iOWiFzzeiqDjjhgAoAOIl2HvtfmOTlFPz8G+0gjtCk7IrI+3B47sEXCbf6TZgnxlj/kAFT3/xcCUqg0IBVAQdQYHt7s/nQEpV3/8eAq+2L/D4H9wGCfs98b1Jz2o6MZ1CrfEaCXC5WOVBKfHKmHZLznnmUc6hsm8zdX8R4O5mBaT3tfsN3ZcxbJoBsm2CA3DgldZb/D3vP/goIo6ereWq0vbqvQ2u/7w08OqHhuHA2Y/wUDEHBkIBMCiD/wGACCH+CA3aA/xMEJkf5UFfrv8bBj0n8C92P1aIXo+OhgmqxA9eJx9MAb1j9NOFLWqvOJwCyrTAjkRDAQjTB5DsTP393f7sewqXXV1UcaD1/hAGmEEP62iUzdNeGDE8vYE3gGRKY3TkxwPM6E8/4rEYHIw5ON1wYJlRckEWoQJkpDPB+EN/jsSIAQrlJT+BfUH+AESHh8Gmp8jD0hgBwv4OAtHoqNuoQcM1XeYhb8PBDyHLStD/FQJkiBef+CgTJEphAJW6w/+AwAEGIMC4if/FgJkiBYn+hATJGg/jxF1JNzJ/h4Afyv5HUdUB/z8CZozX+vATNHJ/z8CR5f08Bv5X+iASdLgBuQBM87Hn7WuN52oA13nz4bcy7ZX4xnTC4IFcyHzbR0PvzFQLaCXwYJVv3+DAW0FV1v8XAtoIJ/IWXh/FgZP/D+iJ/w/qJ/xiyRgyB/D4A//BGZhwBmJBK9jg9H73W9Sc2oRqyf/AE8PX91ZBQM5no04SwTl5gg0SIhBjZe00DNd3C4Q8oltXIlDm4/6gXw6wXOEDMIBNozqfwGACbcGCXIZCiV9X/gANsuUtUrW1pJsMgoIjlKyww4Y3/54AFwiACkCfj/BhBpna+v4/wAsd2Kr3dXp+U9nYD/HDusz/kIEqgQJp7eAYwUN1KKUgQjd0l3YXXImk65SqJLvRDp9oz6EADmM0w7X/xUJW8A38CPPcBVQgJf/RZ/AYAIZwYE4FX8iwTKzBkT/SAVBf/HAfb/D87CBQg2eqqz/DsZskyv8ewWsA0H8exy3OEfx8vDkv+egihgBAP47MYzsc1ln+KzKD/x8FFAfwmHVZWfE1xs+7uKtP2xeit3rragBhGm56dEKL8mv3+AgYETH2rq3wnTGZuQLJ+CqpSY9HvBLCAcROycuQS0YNEAEQyi/gCs1exGvN2DIS8DqnxnJaOGwjpYNJrRo0Tz2skRsclNflr8LBiS/N1WOLCbtvPIbaS69Xa4aZ6Liu+hvFOsoLEmKyie2FwD26f+xZZlozzl81nVaZQapBSRMBWEguq0ti74TNjk69kyF2OjwYZ/4AZ5wVX7TIPcFuh7iyU/Sj88DJnmjLxuKL/jpnCxP5iCv+D+fgr8Jv89A/aAD/HAjGBY3Ps19uI/w1szfd8V41gLf5YEZMwewTaTGrVyiweD/YfRCcnRXTHx/xsFKwf4eEYbP4TGqLbZwXXG+rg7hWZYJMHkwJY5+x0o370M8jDKWzJPBCv0MPwxwVMbGwKBLl3xLCtxZ3Iyp8sF2b6Gaa+18IWKybAtsTeS4elxAETtlE2Npi3q+RwK+S29VlhdlNcoSVrkjNP+ewv//FgtwiFGf+CgcLTbhAOjpr7+AwATHgwne2JQh1S8YVBH3gEiinUntxKAanGo3JqAuQoECiuPNv8kCPSn/HNsb/BgvNDjd8GdbHr2WgbBIlSId0v4H5iNn8VgDP/lwHp9/woMpCfxnS+mnn072J1fw69w9xT/LmE6OvoMfx9eWsH+egTqsE4P4+xXf4/rfP4frvf4QQ3VK+u5cGhdHYxOuKzOD9A1K/UrZ0/4+BY/45AW/5gA/wLEc/xEVP+PcfECewIoAOCe9tPC9g17kAjXTKsfwPiZyVfIGf4Qwe8/9Hxb9AnIZTP+AIZ4fSQdSTTBAYkKN4ADUwADZ+AA4ugA5I2+5cVqokvPUh97JLCRCO518YEaDFth4TjdSGFKI1+JUFQTB5TCBeDaKk1IKFPyCBr3BwdGsQDiA9dUYZHOWPoxwAAOBp7eFFwBX3qyzA2XTn+ADoHX0GBgMAyMSissI+AH/HQRPU7Un8fIRIp/D4XIAgHWDAYB3gYBxIIAeIBAf4oBCWAcK/h4uBAZQFACD/DAo/V/C5WMLW8onAm0ed2BgN8CgDABgO+AAQUF/w8AueP+CA6KTfvqgQgtahwft/gCNIFQfDoG4nNdQO98L/BAdHWN/5KDo6zj+AUBG/wsHR1/wHOh84B1/4KDo63EBgAGABwAgAbUBuwAAAYAIAG7Ab4AAAgAIAHoAfEAAAcAIAISAhv+BDHACLwCOP4EAGgJZAmX+BQBFlAm7+Lr5r+EwqP+fRH8BpJu0DcY/6sE2W/wmHf/xWyXfxiPRgH8fUiwAQ9FNo6iU57IL3nRp/DPqCw+muzOXQfEgr12W9o7YFbt/i4TZJ/kPHyP5fAe0Gc5MZUQ5QglbTAnCm7ICHgNsOYPZHKXN6pFCO1iBEfVAgnaEdSuXveD+18I3EkXskMKjUxrqISVJnbczfBPrIH4b/h7Le3IhT767hxjwayggOEoHk4nUApTEYgy8fQwbwURJfxjk/gVD6IGi04OkbDhISriVxD/GilcRE8/07RlcFDYy105vrT1wZSn5/E7rr62+pVthBcxVBx4E6VzK/8HAsDa1cf7QBYG1kBQuEEONWAcPkERyblsZ+SfpcfrIYX+/13CV+X5rFheLa/18CwN/x6FngLB/f+mbP6aBYG3qrmP4KLgRB+pnH+H861xeIkgHEzY3IdDLfjYqkR3lx7XTt87YzcnlLevESLwjkEHPEoLX+Mc10T701Aa/g6b/WhdQ/zyM0NGKtaeQ8wdUGjKELPrzm/gARzvaPgGHLRau7vRyDNwJDjhc1ReFb3gKmv441REcdnL+bAgof+iNRKoSC09v2NlCPwkOw59+T7PAydQ0bBuS8qHP/wBCpGrA8n6tC1lDbwSLtKlG/gAv5VZoSaELvdEx5foKEo4KtjFm1MvLv/UwJcLi9k/RZnQ3Seo8uNd/D+kaQjEX3RZr3MjZz+mFsVz9yzb2o+SAp9MH7XrKu+3YdCC7wxLYuZFkQoJT4NBS5DdSDPPDxGrowVpzjfJklUXnpG7JLuGAu/tamP9OXxzglMjaQfz2seqlXpFpg6N2UAgIOgU3BHPKMNNijFq/WlJTqYjbStaQ7PbZmgUFRvd6NDZyMxvnLzuNDI282bDNx0QHzWEyw9SDK8Z6tXd8xqsOkYI6E9jx/qib2FPF44eHTA901LVOdoRVa4a3hwA3kmaTQFgC2eOcS+ZFdRI0okrKOVdE2ohQNZfFi8QgEybLBBgo6BEp/iYRxJhh42sF/wp7iEiaKBtv4rZC/4TAbf4hGznWLf4krAeAlOp30A5/u3E9W9SO6HYAN+xsCFGoRKkgCv82jXE8li8XbRO26F98A/4ANRTE0GMqbrWdgLTSt8v4ITTBH//YE+zU0ol54i99cDaX98/2hMUAQVheJH/HhDFjOmJfwKz/l5QazlkT9e+KLp+tnzfJj0INVL02Bo2qW7gvwrN/MHGlEW72o3Q9OFTKaGk0sUzufZQ/wDJ7Fnx5wQHXsIBYYy2wYCr83+jA1Dj+OW6pBucHTP+D74z9WsLfMib6wtZeum2iqW73+zh5VyE+h3wLaBPRhezMqjNQi8Ai4QbniZo/gGYwVFrzViV5/gAknIGsCiG2bG/4ktwP4A2RAzv8FXizmccwjDom/NjYEIEi7kakFtuRBthAR7uFBoeKcNBFhnCvBAWjS80lF5WAnKlRgSpv4a3yEAofF34DAQWG/ifZtkF2T0gsciKAP+8M9SHfws3HJTir+FikuzLHP7/QwnjD+Bbmd0yFKZl08j3oL3QJkE8R+h6DzepvIsW7+HhCwMNKEwr5H+MRCz+EwG3+ISvt2NP4lBbTeTU/ZVoExHmB116lagU5IR2V6rS9TcagayADotJGgNNxGhWABvxsgm6lMYtC/wEHblzNcfxvV6f4KEP8ENJ/wQGiDkKAlQL5f4KD/rASIYAt1xEqJNaxSYuKcpXSFlO+MhsK3OAEZACAwLHG3Fk/IueTFW9rPD4ZfDyxwwsi/giGmQ7/9gIf1lp41qNJzfAruGBTnb95Xh8QWWZH8eQ0xj8IJ/AlBvwHZUrSuZ/gIaEg9WFL5qU8w8IiMo5Ck+e//YsX48c7St3C3AIfzGnrV/ENWZInnHsiQ3+AE1IcVY0Hpz/BwzSaBF8rCAdzWUP8BgAlVBgkrwf4sFH0QPEgq1WyP+fXFj+DWRQmQQ8xoTcHaSsx0GiZvMuEgelY6f4+OKc+H1M0E/wcMoxf4qED/uK12e9UAv4e0s9/+HyuGdG0WRjos2bho2M25wYP9TH40aOwm5CLLfx/dEiU0mP8MC2TKl8KORBDEDSnYeskZOmIzNsXnh3X8fFoIaLMDjGyMrQtZda+1zfX8dGBzlp++4mt573u4I85YCEkg6b/t2r/zxgz4d5VxAxkBaL/w8DP/w+OfsGiOHLPRoPG/RdZhPUwFVgtZBOfE19a3eQkMKKls4ejfwYHqD8u7513MfwAtiEOqCupG6leTfU32C3r/IAM+lsH8CAz+ftc319/xfGz/wJ0eHyFJ0EFg8IBW4hwQYdljkpy1EoI4/eKgQkewnYPM7U7CunoUnAXIUSAMnc/n8Vviv8WuGteJ8w9Efw+c+oL/DoN5zmX+VhPsL+EvmykX9muLlzRdU7R6ANxkHuBhw2oEjbSRXcq2S8wWFXb4k+D66vbv5LTbv4TqgP4dNgMEOv4+QAhdiZmNiNWFlMDkzZDllNzJmZTQxMWFkOP8qA0HP/pIO/F/lHkh/g32G2LcAztPkhsPE/DH8DDs1kOGhS1+/isAf/h975/lILj4P4+U7qQzLXRr/z4E9HfxYr5GA3XffsoBlcKhJqjINxUZr31xN/jYYSGcAy8PP4oSZs/HwdFT/D4A//D4H3/FrjlXUeyxJcB/HwN//j4EDFBNDNyGl/j5MG4P4+EIwH+OQj/+EQSP/EwxG7/JZbz/iYLMB/4dSxMJ+/kvHBwQ7pAx8qC4R2aoa1Wo2Q81eRD7T8j5VG/h8qIxLo/IWX+DAQNyWYk69828n9gGxrph0i1YB1W/h6XBAbv+SweP+MH7oGzO7+PAcMAAUD+Xv4lTAZ3CTOfewAB/xcG6R/wqFsfwoEGhdxXUhBszwH/Py66AIMnCGBx/fTNMjm9fZvBH3z8TLgE0xZKEVAZ+MLuPGOVdQAIGlfJeVsNv78zaXd0uiMTAqR2++E7CNnfuURYWhDkIOfxH/OxXJhTkvU7909IiBjGbU+m/XQcJZ8dsEBwkIATmXxv8e5qv8QaKip/j4IlMKrQNikKB0SsvEbYLDKFDTBHnKht5HAGWgiiLxyLCSWNPxzNGP4Ab8p5b8dKC3wnDyPBObzrwZrhQP4AiiLvuOTMIp5Ap/P8kA+1e8kErRE7fSijPfh1IJs8z2OEiJrvw2QPLCATBqVUGE1phKAE0Bb4cWqEQRJt4i8io5oRxShwZwFwiSxSgvH8GEtutNBnJOVzKDj7uVtPYE59II2JYfw+6H/wTaSHw1Al77dQQdc+F2DyTF9D4QzM40dTc0pP2ePSC/TOpFawb6HdXDZTHRJFpqNqpdljYhhohT/xZQMP/KyXD/CYDd/IqXD/Ad78IS7kHCXWEeCKF+f7aAhmOlaKLKYM/c79V+JxNyfCoUMCuhRlBToY3r1/oSJjPeGbn5ssopdgXyC/4IE10RqB/wlBacvJ/oQLTlbP49sXAP+HuQAEAAH+PHxcCQP9eA8qDgfx7V+AX8/uIv8MC44udZILO3Kz/jgHsx70HVDv4XEgcp271CY58Slv4cQo04U/hhHjgY1WTao8Tf8cAhiA8QjI+f7+B50XiklRrHkpVdoRzzm61I5SjEi4eyRbxQyiE5HWXBwUclP4AhcrNGa671xhsmTcSDV2bMjywEetZg8Nj6b8f9n3Fh8laZJLTl93K02463Ty7X5JX41wQkQ2ajQjb6w617nlRRD43ZABtN4qlGhi2CZZ2ZoKb7krQBL8ACw68sG6ipMDyivALnBgTPKCHEHA+1mQkKTim/fruwk9IkLQCfANmvUZQS8edVK+WUvhTHxcbndIdNzXK1u/Gwp4DsroKjyNdxeL+H8FwGCCGqbjzBsOThHU0gS8G+KmM0l/m68pnbKhSwz83a6wWJHYUjaIf1Y3H0Lnu6LAXioSmtETOscQGG42ELDmDEdrH+Uq2/DP2DvjkjUie3tIaiJl/cBh5GLt5sTI4W7IoOhPpHRbGfwZaXPUEA06hYK/MBC1aSgEEtzs8SVdhzeNavnniQeVZ6Hgh5eQ2tm25csf4CEK9moi3t+Dr8pM3NwOpvxm3iZ4BLhe8Xqu1DpntTa+/DChHwNXAa8KeJIKQVHZjtxtc2H/gXTORm5ToJ7cNWMAUSLgpS4ncztdr9Bqcf88eEY4po8CHzK/x4EBZFU/1IISRy/9+BCSNn/L29/l/yLtvpnVJz92NRzarJ+stxsFO+daUhc4OrokAmwVHjLgPWZyGNNi/wBKWWRkxjYkLTTnBasMMc1s0h22AXyAlf+CAg4EtZ4QC809ghA1pOg/gwnrEZRBxLlesjekM2tUEuyl8ScWHWd7cIC4/QH+EzZ4/xMGHG/xghsAvE8f5iFqVNEMjs86xe22W9gRYTT+GAvA/gXYqUGpyv+leP7ZIgioxjEePBIWCQQ2b1k6omQ4ZjxD+jX6gNO2ciYANPtG1hMlMBnnzC9tGq2Y/wUNkABbCCcdQyEINM3ngQYCTaUosyLbUEulpdg38PGuOREEN7MZ9mVD1u1MEzbV69wiJJSGfwYXpfwmAr5C2ynr27IdQMnnN3kPqwbjQMLv/APgztIAHygcuxi+5PKWA6qIZR2/SRxtpcraCMeIG8SNPrYV/BHssW/1s33/5+DDth1wIRuuaBwn6M28mYdoNo1O8xEHP489lpIBp/4F6ZVQwHE2TDarOmm2iDHnI4ADih7deHd6cN/+AKKeCPGyIMuMmjskK1SZVrzLVQVgulC5tQ30dUDfuvjtggQ6/0ICBqgCI/w8a/sX8OBwHrwgDmqV7jsBoF36OVIzVcwlYe0/Of0jC9CMb9HOeZmxe/zbGcY3juL0kTwxJ2g+BdW7J9KFagLvM6E9lLDZtoqpLeP+SAQNUAA2nt4FniVclOGAIhzdJCeWybSgsZFpH6Br3MtHbpPR0Wmbi25f/Fwz3b/AZgcti7/AYJaZ/wGACIP+kBIE7+VptWkNlBHId6rcb3IXjAErFitGyN3YfeqDKdk4/1zOnil4uihYf2nv4eQe/4MKJOBWkK6Npxk2io5PGHPT7zjiZ+8Hv45Quw1gS1Fu/f4HOU1TzL9n6lPYgNqTVBxGUKjCjDX2LviygYD+HjocXS0Q9JzG/jE6H/hMBu/iKWWx/4gllv4dJESv+qSRHM/nOJEbv+Xv/HM/kH/x/gMRiXQ2lRZP8qa8uBDlnifyvggW6pwKdobw5rzvZwQY0uyJ0v4cs2Ln+abNgCKZIxvhyj/PiUjpvpMSE95wDxh9lWs4D5tkyLPOyewK//mCzYAQB/jq+q/v+zY/1AEJ6T9J/BSEIOuYue/w+dpWe8NcykYEW9ddF0GyQy2UTom/Cr3J8ZSuNAHoHl8saI2bejwq2hQx3HEKY6rhkU1IMAkj+auuV20amPsH75Dr6MP+Ahj4P8rG6jmTvDyctaQJtWwtZ50fU5hQu7Iio+hp5cCsVOEEWOo8bzmfBLqh3me7aKAUh0RYmOJSugolhm5cl3wZ9iQygdap3jHGwbueSoHKA0itTAZioStGfcviCwBJQ9h3CAATWPWLUwa0tmaU6V15z/AQZTv5xrI+EA0Xzaq71UMMv8Pn2T9Fq3vLnyWWWK9cK3yONq1FKXnjp1p83J/KLjGoGAfTIqR85grpfFN3wewxqX8sy8K049qcNGTIO8Y9LNewCrH7F2G45/AHFai0K7BdVZzk04qkWON0vXm0tOk58wooN/AGSCWgU+PvcvvSS0JU3qmgYbJqVPChKVmMCQnA6xpJa0xE8bq9O2iada2rql31JEQrxPuFBlBf2Q5APLu7CRRYNPao1cl8wEaZ4Cdv3y+V7V78kFP1oIn6wdUAkNIUe9hnP0+z2KdgMNN5PhBqki+RtnWCQu+jWxRk7wh/4IC/QlP+BTXYC809u/xkF+hDysBnNZDwGA6Edbd5FdLnT15rgztP8AZfJ1Ras+NV/8EF5Zx/e092WiSVFpI1K7Yk0EuxgBI0kmLnyzl/ngKKhvzeT/gXMYY6q9PEdTImgqCqWpXZRpQnar/2b5A5mFJ2V9eF4cX8LB/8bBJ8Go/5sBhfMd3Tmp3P6aJ5T4n+fHn+Ddl17L+F+2canf8AQorhJBmZ/jwJPjUv42AiPzkozArgHB5sMdfCRkh3fhc+FExv4W145r3gmYj/FT+BzyBnwZkBQdGwofx8WfenEmkO7Dbm2D8Ex438cdu3dLZa55H/hWjzATP+BaPOWhADsCM4/gMAEQIMCXjGUixcpzRCJ43fV66lKTP0g4tS5ibHgLn+HZ+8AAYWr2nlHQP4/mT9P+APOPqbamRLToAEBea1aQF4gHMOcRC2yPcfhnBcHO5uv/DA1vuu/wB6Hu0Hp41KvRUMsOOhliXlmhL0Wf8fzJ4qlsDjFStLdZ/8LQ/fgVKrkmskr+BfB0LXpTTv3m6S9G2x/ggMMvgdGEAnLYD/+AwARIgnsMlA/KUZS6pZo2KoNQMdnEollw7/4A0zpo/wcOHsQH8C3kLX3+TwrDGm4N3+Ah0KSfoDREN+wwB4T+IUC8a6IKHVGdXOnpQiAKkXfNchDVGIIotr/AQWC/d8uofNGBfFt/AYDbCwgEkfwBvFhX8BgAgn/KLUz/AmA7/x+1M/wLgKN+q8R+fnTcqLAfN/APCTMutpY4zDYw/wiPNDKXqp4mmR90vwuFwxXP4mK02mCzskgJXNKYBoL8BfISL/4KEsAv4HFuoMII1uU/hPqh/wcJJmXIRyZrcCfv/PuvAEMH+vAQFDg/ysIgr/0j3oAGJTXsZ/DTO8GiOp6CgAf58BDNP4f3JCEtVDDXaIC4B96+yFlUvAYQBABY0fkfp970Jeo2Xln/gAcazQH+RoKf/TAIGgDhtr63kYf8GBA0G0B9gEg/icJb9caCUBUoDN/QtWkD+fi1oiMxHb14ADUyADZ+ADhyADjk6kJH7SFe3Z87thdonmizFMNrf6Sl3ByFlppo4ZiflXQS7C/BpyygCcs33pKM0N+KYjBtATwDoBFhtDx84LEO7G/8FAgaExkDK4pI8wFzvwASR/AwJT/Cc/uBAHUBgHF2z5tjrCROBIg3YkKhmEVjY4LwXIBAHUBAIIBgHEEgXjWAICQuFBBGICAlAEAgn+RAQARMgUAQAUAAP8oDEVa5/ggEBrfQMBgAUAAAUAIP8HDBFXKgYBFAoBAAwGAzP+HhiKv+FozrIGZ3mgTS7kgMBrIFAKADAc/+BAIzAAwHP/xcCBPAPSCgHB/kQECeBv4FAEYFAKD/BgIE8A/4IBoYf4A+DQEE/wULCvDACQ/4KBAiclASj+AyCkgASwBMgDqBwAgAVMBXP8EAgTwDBgMr/AL9RTYDVgIIDABAA1YDXf4KBAwtcB4P4FAI+AB6f4FAUD9Agb/BQIGETgRn/goEDUMwI8/jyQx//8CBof6GDb/j+Ew7//LAREB/CQ+7/r4EDQ/h8Xf/1sCBofx+A97n8AJXFLzsU8htv886c6GwggALBlosrGdfa4RYRDPoGi6X8Aqbu5SLhTMv1dmjnWfeFTqOzukFtMm9Qcnf+Ahw/VozFC3YoRocIJjRdozM/YItTRb1RaxGw8b5E6F/jadoQDBvJ0oMPBhWUyNDbWRIpK2JU1kp5QNr1l/1sgLWAuLMAgGUOO8PElEL+3OVE1cm8uLY1tY6npRoCcIlNv4TtSPyFQh4znf4GByRT+ExWr+KQEf+Bgg9RjNhOACBAJxAm8Im8hD+CiUzj2k6J/wUFcKkUey9XdbExAHLF4r8XILbk391xY/4KFCpAKEAlULvP4DABMWC+gD/Jw3LoKuVhC5ifPnRQ/qVcsB6BUQjAYp//kAFnFlMiyxd7zloS4DGI+O1Rjx0DYtq2/wYFinAVdCAR0VbX8BgAnDBge33fxy1AIDWfw9QAwn8P0ANlnZbAC6QR/ACAFiGThqISJKL8sdwSCldbk0Ezvof8AF0Lj5WWy0/SY1MgSPcVL3QOdHMRvB80LkNbTkT7He/qnG3+AgxP0eVf4fagf4Fdw3YCgLYUa9AYkEqDbm1DaOLc3JHC07YTRoWsK+LxT035AhyZWDxGoTotjbsBTTrTsYL+KJ3Mz/CAT++pf7SDbc3H/XwT+PgH+eBnMr//wT+//r4J/fJmP+f5g0VL2LvxJ2Sf9q4kbc6BA/iF44zl6pQgxPxneds7NvSDLunCLtGZEct0wpy3Da7wp8dv676t6aWjrm6PBaFvX8l2wB+fBGsmxQei1G31chjpLkhdnp2CgXg3o8WUUwSkHQCfA47l4CpzEn/I/wX/VIDmjv9dEQmEWM2mzb3bwagJj9IxmfSTWCTx2VidldQ+HlSddrRtkzVMhotC13y28CdlHDH7WZjgCAZ7zBMUJqxYZuvrLyj7c+9q6hoOnh2EOmP4fwzAZG6Cjnr8XOVByzuemxki2FGvmWYe09gOLLLL8aPnCKhAc8aOm6DNOxjfwRkTB3cgL58jHdBYc1qOPa9S7Sn23vgqvPht6CjXBWCLmqjNcqt91MMRPmk8RAIlD0eBZ2w/MBNAxIUgtevGcmddd1E0Vh4l1j5uQUtQ5D22zONB/Vtiw+tav1xXzGJu+Vy0ZhfHxq22S4gnebz8M1oA8eSl6jnRWZGcIAmluEamY/b17NbbYvuGefPMSem/WPatocv1ME3A/h3wQJeEA3Pfv/4FTTASM8f6o8QMqU/e3No1GfVT3atnl7Q8qKlagSv568QH6FBaMZrMuA/x8Dh6fxB0gbtCk8XGIDFETp2qWUuiZISzvxlw7iQXlNTyNOoK5Ha1fwY/6z5ggPp8gUXxXBNxuVdAXCdKzVp/ICigLOLvGyv8CqJ/3/B1h4g5v+X8oa3SBdXPnhhOQ0EFU7Cqr/PfvAAAMLTghNb3j/D3iB/j4EwuYIlgU9NEX0HMTFOZ+qO4tsWt3tFAztEczYYlgaDJ2fKLf4MLytrMsQCJ6IRKooYVmSgfNuTGD1E/kEGfS3F3/gP3iutfa+v4wmnv4DwVnq7UFw+AkLhRaQlx9VW+2xmvURgxAHYNYavamJgGQNzkv+Hahgs86pqGM3+fqhjF/l6oYzf5FqGH6OQWWUeGkezEaOlo9UyBYM40YrYpFsVPj6qeryxpY4sX/GASfM//wG5OM//FAJPm/zIF37/w8qXDwp/BOchBLiQmfYQYieT9QFAxU6tQ4YZqYUN5MNZ6l7sMBmG9nwEQMbGeZtQHWZ4yqCeX7gRBL3u3+Agjnr/Bg9j6Fx5P8wgXHlAFBKQRKDLtu+2t9ElHRDmh1BjLae7ctLTjw1vDd2x8TH8DMtUaRd6LpjuYVpRE6ptJMc+UXrt+lc1WY+hla+4nPCHJQ1H/jANbw6yEA4pfSP+YBreDxwJ6l1Zzh/DjAN9jyN/lANXs4JlnWBpDQwUgTJhY7utOCmrmMhZu3QBVKFQRBdfmz2ZNSX+ERmh/gQ9cAf4KfW+0olP4TSLP4SPXF4PjnM3yJVdISSyOhcp0r4hLPh/xkG2sDMUdBMLc/n8C3yU7RtmZ/gIT1DtNgYXLtZ+X0zKRW18T55+QkgXOcEM2blWCHXj5cFzw39Bmdae7dsHlZwOW/WONPv4HOVBin/mk5Ud/5gOVAHz0gMb9/3+cqfykcqek7/P5yox4irFC4aJfIpWwzLFg9OnUAtq/oOgSQosmKGWva3QN/J3hyaKR3gHXOq+9BihsCbeKqVIONnveEc5SImP8A2pJT80HZ+6tMl8oxWGaOxhJeOXFWcjQxAutS7eURX0UckOwKauLLQ2F3ld5S221QOR3w8Opm3vYN7XE07Y9snzhIwpFbfIYrFje0GtOS1EQLUl4Ipugu0udSlqHn5VZ9Ru3gdXRBzd7gZ0SmMM7RfC1Wb3efWFhU/FAJVXdEPedYvd1i9/w/oxH+3oYBf8TZG+E8STGj4IS4xTjARLwa7KXrY4E3d1otGsqLDu99p1g6ZtGHZPv5wMeVR7xnfYa0N8MV4OiXv/YOCye3BiCY9CCd+HhvKcN7sWstoHb1lLvue2cJkk4dgCjuZahxGiCSLvNaVwe8WDsVv1igNJ+6Tupdh5rYPVKtGxP8PghgWdclwXkMCepujH/V7FbAgIVmA+SKWzDSW/e0eM3VfylZLBDgQLDToKQVK0YWyMizkc9yYTOxflLe5JyB/wLoxULpP4CHWqYjDG+oE8whfj4gMAmUB0KSLUGK/eVWAYkar2sziv4VfHANw/gVTY/gh8cAzC3lP4TFVYC5A0RM3+xu/xdA1X/+QAZi04/h2Ik4u/+HYkTgl/4rGPf4fYcYbJUPOADKOR2W+hE6lu4j7cfFF7lL/c/zkJiK/x8yY/wMNu8+cBdfoXprYMKgod/ywLzzfxfMGosx00NcY8qv9Mf+LASOoJ2H+PVSIAAP8+CSXn8Pp4I3Z3ZGW1sQ0GE/k8XWNnjFkx7G2/GAHnfTmZh/E5vv/AeKhSyBPGzuQTs/M3MTjWRG3OxvNKu/KLA2ZxVI9WaVy4YItf4qFQ+Qv+BJqyd7D+CKIdqgyA9r/8WCeRYWiclTZB38NNjk7kmsi3fxOqqQVgf5GGT9P9ZAh34Tien30/58B99QlA/l3iz/w0E27WIERrctEBPzwnfgJI4ZFR8TEN7/DOSgL6L7U+zk4z+Vg55L/Ewmrb/HsST/BskT/CYTX/D0mTQf3+L5/3+LOf9/i+f+fAl2fk/kGVpA5kC/gGNuwP4TEe/4Thlf4TseXgANTo/gOVp3cgA3zPJOBcmER/Dj2SjIyXiqov10nwj7thyAY8UDL4+9QkjWanm3ihPtnDsnv30kwW6UnMOFHX2K/eC5ThcDo2mT28f4GladDoRZjq+ADI/gYLF/xMIJHwEAcwGAb3ywT4Rrj2cQGhkZ1s8+cU3kqlwJ+g/e5SAgIABAC4BAII4mV2//H8KZTHgAgHwwfAEAgJCBgJKDAEAk4GAnYGAnwEALgMBgKABgEV/h4XhM/xgIr5/50Bkqn+H/V4BAVwG/gSWvIAQBYACA/xMO4ff5CBmqQIDBQAQP8TAt6MIMdI58Zfvum9sklxHfY66HpsDsAQEVAQEfAKAQH8CTUYEt/wJNRgTEBAR9/EM1GBNgKAcAKAYHigUGtAIFLAoCAfwGAmlgbPgIFRgoCQf4KGvISX+JpnXoRhM4gMCugUBQAMC2/4DAWgFgAwLb/xgDOIeIKAwH8QzOCBh38CgIwKAoH8DzONB/AbXh9fwNTHfwJM4838BzOKEAK5/AbFUgAPAA94D3g4AQARIBFX8CzGSkBLf8CGaAEj4ExAR8AABABMwE2ARV/AYBHGgX8CVPBABTQFR/wGANAABU4FbgR8DABABW4FdfwLM9b4F0f4KBnWOgC8f4ErowDBQMO/gSawAy8DO/4FAETsDRP5DeMSD+Kxv//MwR3TZkrOcgHLkcyeMA1q3WzmpBXA3UrjGF5X/ITKXkfd4gPsUOmseFGY6PV7tPVoTixq1bAGCzK/4CHBkyUqCRV2Ya8Yhcc/1oCXDkmgdtzx1HCB+Ernn/b6CkxfyhrTl7ghtGiKtDMkXuoKtXKvMB6AOmXI5tbj/wBlVfhockd/hoCafwBlZn2oKJjmVK2IkLX+QBY37/FALp01PsIBScKdQFlcv+Lh4pv/OQLpyAAA/91Mh094Kdf4KHRjHWcaw90A4b9pe0iHittDnjT60Z/wgFhGfv97aI4CUERph5nB7hAl3O8PpqGnNw12lP8eUBYe1Pv8C4q9T0HBHcqFsRydQx5MJYrJy4q1HZ656B6oMjdddSM1qcIB/iYF2w/gRhXgCEA8cWC/4DABBP/BAp/I2nK9eMzZVjTwIvmHMx1J6EJCWkNk035czrRAfwLpZWLLvbwuezpRQ0UYdqsTZs8RopV3HCvsqYmhb2JSfGlJVHIuAFqQhJwvtwc0iQethh8QF9lYv+DB7jMPZf8FDQiZHhowBBgMNQfxPpY7wcl1SsCk6+h9hmSnFkFQfhnD7/Cpp8BggUm6WzfFPIAAGA8asePI6GaInyBfD4qU6IwynoXHN/goOLoAoQDMtfL/gMABNSDAtYY/ifU35em1Xprti7SoqdL2xouaO9jiien+FsCGMF1P/D5r4P4H0rrgX2ragSChpZRy49YRbEIhIj7ZcmeU4ODN1hSMa/3SHxH/FQPgxXwYJV/wJkBM8GBCPn/guB8GIT+QmyLg/ysEqt/5+DXWv8fBYVn+ZAr2kE/h4AHkz+HsjYU/h6SipP4en8oT+BG+L/HAUnm8f3+J6fyMIegCapi1/AuyZkBvQuVYb5iAXVXhK5Wmjs9ADllc8ofiQMJpbv6hSWTsEyW4r1r6hA4CW0exxKA0dUKWdBpR/BB1mWH9/jQn8jDQgEOuMd/Au+5mdjqqoxOZT0ziW+nCkIoTmhCwFkp2jO+SIzRqMbgni1G3kThLjQMsCrQOAcHSXWqxXQMu3Nf4MHAJomEAmY77P4DABPWDASI6/haymZD+DyJ//EwsL3/DdGceI1y25GoYUNOqpHpPYezP01WsQOVKDsGCHbaf4KE5wrLTPP4F4qdR9lrPDtK8QQCepkMQWMMeJZRzY+8VPRUfApXa5mXbIDnyeX+l8nMvKEw8r9OSQmgf2ZPBh//BwN1kr1UIBz5+AfwGACBf+vAxs5aAcH7vdP83HgWRc6Yxnfs9j+AeusHSpIZuTDe9Z8+9HQCybF/2AGNnfx7Xvf/+Busv9OA3WT9sW/+FAy7ALjczS/IfrfsXn/JWUFWe8Vf4BleEA4ES5ONLmiVT6h+bYuw/o7BdEvQHy9hgH+Vp+GWc0jB+/5PWyoxh56rHHLnbshyBjHnt0rLkYEvSOyqQIrtKvaqV45JCZD0W7hCwo1UA+2BjaMPXnaidEz0+RCfItSAcIcwVLexLfgjAO4UVc5XV8XOfSINqE/LOaGeznu7k2/xKnzhCVAjWYaVJWzXK6zASohtt3lXgcePDyKJ3Adfr3GSJRcy4jGZiM+6i3EwlElHQieyC97v+H7F4OnFkbH4A+2dDUEFlTY5gK1qYQg26kAY+VtrEQArO41pC17TbAMKJIcoX1YWC1e3+AJ780auRVD0zGIbbj5/Ljc+ZR0nFki4ts/rVINqk9PFA9BB1/fG/HK+5vZKNZRSsUC9kwu1WIvTCQ9bri6aLhALUAASjwO/B84hUasYTJGUmPNQB+myXi+UirQaOSMsyza3sv1zS+nCm0jEXsEzLJGnp1BzA3WZlnsfN0fawaUdV6Dk3cHFqvS95MRR7y+ZgNzEJ4P4F+8drMz/hwd30gkRs4OqH0GMe7BM5V9DGK9TDNBm2Jyw0Gaqnv9O8xMBndrH1u3VjjgRkywGHyv+CDXGt/v81x/kY1xAyR1F8D/HARkMI/3LHbcmlSE5O7bb39HuLp5jsN24r272Afx5tztruV/wL8oHBOCSvFNn+AhlcuYFd0VVc4I6lZtJnYouoRy/RAe1RNt3HIiVkB8KfUx50GcxC/83G7hTFczFx+/ggL59f72EGs82r3PvmOU8Uv5Qzopy9vUw5RLUP484d2tbNP4Fr83npd4vKOiN94lPuej1KNt2HYXROUUkqx+LmSy0qKzSjKXaEk1x8h1AU0nQkpDZxDaOWonZJP+CAx9u/vZH7WRDP4vmWFuY/PSioIH7utaU86bB/jzoHnHb6/gW2N++xo6DCcU0nRXTVeZvry5rsLnQp19kNZVyR5GB8BoxIIJhy5K1Jy6a9xzoOI+rkvkUGDcw2P4IGGZ7/9gKc/xGeMEDINsB4JW9HOLn8AX2cyqPmgIAP8edy7Re/P8C3jv8AIgk0XzjoPnMoUKtkYsCpkZoV+oONxuEsWgiuz+kpPBvx3R/DgY+3//sB1le+UFlsFYRA6VJUxFDlS7e9cBMNB/I79JwX9wVv+BMK3hA3lZm8i1IuXaFAUApXq7cbucM+U2O1DKscah6IFz/fpMf4TXK+x+hALWJNr+A7Wr+E1wOBzG0rumnYHWA/gi3a4Q99mX5yXpVJr0PnCSfRCNJ8t3FNbUsDIXF9QwPEzprqwdll+ZrENVua/8BEKRSTXTt14AAXif+Ah47X+B65pFmv9oB+WS0BBI0r+C1W3tS8g10y/60Fjuab8oh1zF6DxF5fwDYtHX0+8n+xgsbEA//8H5GX/EA/LJ7GDsZ/gr0fxTGo/zoFjY/4CEZ4mJWDv4fNkBUh9vXcEQW/wEEbMA9LRlXTXpju9HLMcLPVwWLlNTEy1l1i47qlTQ1qfTF1dLORuV9QJK+WmujOxMpY8LnMyRAAW6Ci66QLM7VIObDx4Qsrun9VmQY+/SkH/tvSfI0gg1Z3EuIFDPTOMGYemPqXJfJOVvv7BGXgemfLgC0K8DsS/oB5NsK0vyxNgse4wch+evEhraYIIKT3UHWjfPobRRDDJSh0anp10gpWftzO6+MgXQylTAaJCniGbtNBP+xrMWMbyRj/xBXNSZDIhxb8lE3BBTr7Cz4U1taZF2C7N0QSyWIHUFNtz8A2VeYj6NK4xhh0xv+mDdgrTpfzaZKKqUJwtqgEOxbLqhL8b46iXoHzhIkp18dD4d3j4bCzNFH28BO/T526leHSgtgh+vwrHxhyK4t8LYhIzlcRTu7ooMEy2Ol8mSeDNKSFNz2Y/LObkNIgd8u9qCvdXttMWSY8bGC1Wb25c2w+xkd3s6Ng1WwcU8++Y4y8XmC5+PQ0jBGTAfMvKktKm/xqF/An1/wif68WpFBGmlQ9l7F0aWwQcrthqzKa//AG26MBllMUiTw/Fafn4L/ADX5vjkMad6c60TPvVUkyoRO95z8gadBByR/4wKrpWQvcIW9d8Q7ITEoHMlIJoVi7b2Au9A3rBisk2dR+mMr/gMEk/2UFV08Il/tyCq6d1x/6AFV0qAh1ZmWEQ6MxY2FuNfTITB8EIGHgV1DAz+/ztXvpMc/kL9kjmu4n0I/9QlDuRi9lDiY6XvD74f3790Sn6JE3Tq/x+/hjbe2st4KGlIrWp5A9t/l01vVh9wMIwzpmgv9R0AeyaVCMNJ87qyYkPxloDqsv7NhjXmH8AdDQa/EJtN9DJOEp9QtND/kgKrp8uPcuK6/MGv2j0d47QspJ3vfRT4AvkK/wQORjhQtCAVoqwv8BgAkbBhhDzUp4F8eC89Zv5D8VZ+qmYh4hOkqpdUBchVImbgIX/xNk1YIh5H+cgkg9tCWu1vk/+EpI0f82hPKMZ1ONTQcsIJzT/MQjDR/FYB+Afx6hFgASif4eEqJBgW6iFQ506zBiKTjgJ5GHMaXen1/5LBY/4fAX/4xl0Qhk/w8J6y+s7QCha1C2UY1KzCtP14m9D7W0sfx6Dzmv8P/HtBrlXYu9kTkt3beAb8sbh/GAjKC/x7NqgTB/Fdjr/CQD9/D2xv/g+ITMgGEMDbWpv5rCR/4v1h2nt4jf/Pgk69/H+z3/ClAHgIAAMn8J0cv+2gkjQBpP8PConf8YgP4OBf6EGTZv4O6dnE9EimFv4rEa/8uCVJX+ehFvD+IgmT+IQqP+OBgSRP5BB3xmi/0ML+k/x1XcslTs7ab/HwyX/O4A9/iAdnAf4xAuycQ/l54xQP8UD1aH8Ihy/+CAlC/+TgxuENO6QMr/KwteL/Fo+GIAZKYVen/Hwj//mwKQC/gNOrFqoDnhK4HOZ+EcY3RybJXqRcEZmuIIixyi9XTaLN2MYdP8YBHwGff+MC+AzV6vDFmUuAnMewOE2RUsFMbwf4AGzp00IHqBI3xsEUNqNRB/gY50/2IF8B8Il/t4L4Dwlb/4AXwB6skY8p6uxvDGfwBJTDpVviRSce7qCf4CD73gDB34JIGd28lXWEEhH2gDntwsta3ojDCnEegvGgk4us9zjKQbEnD42x7x/P8/BfAZsbOpOl96EZleJrPMgi5d+5z/ZyBzeOMUlOuFHZKTIfquTcchykwvbL8fjr+aYtJMjCBb8GASkS+Dr9SjgmVdsv8fnmghWyAMKpJ2TYecyNe/IDiYku3LICLaJczWdoC5MlDodS0M0BD/ABN+3n3aPXlC/LgwD15lQoqmGHy6pB70t4YlclmM/x+A/0eiU1e7gPgF3MFTR1y8uD9ZVamo9ydPRdWo3Qpjg+j+HkNhxKn6rvvfUBMdixi++ZtlVs9gsnE+uDdxdcq2Vgj/5KIk5QRp7eESgS16t12et4XNoBmdugcUi+gSNLuM0693lnjlCeW3+FLWm2sALHFLDd9yHLQPJCSUxV3qWR/wUCIqg0IBMj2q88BgAi9BZPr/C9pJ/kYZZX/hcClRXuPtN90sADA/hQCBPkEUYLnCAeEAn9nef4DABIODBrtr/xYJ8XghL/l4T4L/4TAzv4fAp/4+SKxNvbmVJbmNoVjb/NApO0BoH8bAU//EARb/EAG+GWGiJCPyW/5yEphh1DaqYYA/8kCjEX8TBvkBBH+HhYAdMPMyYwBEDvnK8/Snh0kVMmNGm4/4rBEP+H+g/+HwF//GAnOB/GgMHmv1RVQKz/HyTB/D9GP/guEQQ6EAAIsAAE8CoP4fAn+5jweUB/xMG/0fwmFYIIDzwobaAn+Ohw0n+Bg21h/xQKCOPuX1f4wB6WPzAL0T+AA+WQOOQeGQahV+zy+nT9VL/AQQKB3/wKJ2qn+FAaJ/kdf/Jwijr/iYGjG/h9aRwFldEZmi18aafgx1h9HOxAlCVLnS/4HyEv8IAskv+CAkOD/PhCU3/kALLs4SJsl1NgLcX7lz97sCRIzLPVbXJi0OkOSRgSxScGrkyOcH/FQwA8V0FoEn8Bq+sZ/Alit/vgSBYAf9bCQLAUC/4IEgWAgAf90CQLEUAqn8OXI0UMN5H/HgA+N2V1/HgA+QB+R/DoAPktR/DgG+R8iB/DoA/2fw+eiAjKS0LfW0fnPwzD2k0tDZuOBncVKus1GiiTYTNDxWkjGObpJVpy3RrY6WGnwA3kuOdDWc3LFZbCBunk8Jy4Y58gLKKcMYEpPn6YUdaIIqlHM4JR/8Uyp6SqIyyuhrI6/O/oLcb0ghLQRJe9xq0lN9cDYw1uKlLW1px44E2maNbPY238PmBJ6xBxAnW96SqRH4atriLVhAr6lluax9VTCQKWjy9SsxpeA+gehv/uKSF+dRymZdnLpz92ELE+8U8HnMoCj3o0P6JGbBO+ZS8wyH47OTeDVYTpRw/9p315lC8lguC9G89mZ4VF/x30QCA/i+z3OZrilADQjLV6IvWYZ8YzdD/tw1I3+EAlufhJKiy7bDKSf/AREYrs/MzyVwUVtwTinbC4/CHoipuwOVw+oZQfwoKN+EFzhCr/kQG26BUu3l9MAm4DADmjYm44hRgCmurudimqQHSb60hV7IGGhAKZof8EEeWAwO0BYOdaJkIPbety5Txq25EYDNUM6+sBr73P/BUWMXFHRUB3VYMmq74vxCDiFa4oiIrKGirewfQQaMABq87H5Qan8433GC4SCX2iTZY+2cfR+C/J/wIBWDP+Ggag1eCP8BBk9ZLOFbrJoPomok9g64n1EJxw4GF7vsz9cFAfwLHMmXbjKa8FAYVHXmmch4XHOMYkIxfHQ0gvzOxSIWLiThbk/+MA5gdgf/hApGw7YfwSEgmrBGbLoP4X+AhS0dBLE+pBa9oiOcOa9vkrbOHa6MpJ40fY2KNxyHPJAqp5z4mLFL9YCV8jomJ/G/wYIhkAe5CAaCEz38BgAlP/nwF95Lf4d9sv1Av8fA8/aHHSVGr7vkkyAKW/wBnTDjRA4T1zg4lixLcdDjMx5ct0m2ViwewvoS9NqimQAMsCq4ujM09YAVNEJUQ7SWzSpyK5v/D42mN4H8J9XwF8/4MH3LwJHcS3+AwAQxoMMs0eUiI1B312TpZ2ieFYrIde6AP4AYNdiZ04C5AsE/wcMukf50CQgqDSClQtZnwFe9XGEqav0m+gT++Yr/0uLTL3oBAHG3nOv8/BSKMB/HxocKfwmCTfxU6VwH+vAxwD/t4Af/iAwKFC9jaGFybXMvcGFzc3BvcnQucG5n/jBVTDs3w62MHLnGtgH5/wEKB1WUxqSXtDS0N/AJUDQ2Ep8XxvxU1YGC2IpLrxKrTBsQWZeJjkQoScayXKf4QECSEa/2kJ76x/+whAmH+PKHH//wMUD/HEJ4DOlyLlaWk/OOBJMRZZqyb/38DFArtwLDkPvLz0jsOHvMcjrWqC9wEondefzfpA5WOMX6bFvCIYjyuqoCKMPqYgm3UyJCTlXgdzxZz0j0j9i31AtqzoBlv4d62fsKbeWLzn8HGwFZGvBaXiusiJ9JWH6zlZIWrs7VBRbaHDAycio2Y0n1aiiwELsb3DsL1CFs2hJKVE0C7BaLnJ4aP0ZH1+k3TV60Z9eDsJHHmsw6+o71UQV3JPf4CGJkHOzKzfp2bifRpV8T9ILRmHQLo4r6W9/QflOhNlvf4fcMAZkMq2Jl4pcd1ZG98UMKTyhvKwg8PAeicJXIYRcnKXxE1YKdtrWxov5a1ViyIq9napMvJCFZ//gAalo3ZKBtm1d+k0Uh9JR4cMJWstNDBcBnTujGf7DK6kHtSixjWBAY7z77pk1DSMDy2B1FoX21VQLIUn3j8IbS31sbKp0PG/19Is7PAjIDAHUzNbbw61A36dSNQUka5Nw3uP9ORWsiYegHVQLtBkXIeJqlIsN0OXq66kIpoB8kmp4WawTcEfAOhhG+fb9P4FtyD/JpIEu9+8YqFx6V50X8O18nzZWjH86ZphsvMmwd+X1vH/HQIaQlv/UhDSHO/30IaQoAN/rwQ0hzv9ECEoXCjm85e8DWhFGfR53MmoYC4mIgiDvkjimt6stRY0e2SQvc/44Ekj8+hAJnniz+AwAQzgwjmaf87CTH///A0KH/YAkx+0CjdWRiEoim9WByMLXYWPuBcAf8AR8iiPp+T6SFeiz5GRHTkfw8Cv7n8fgr9QzMCLGdItYA4qHVRYEM22Uo1BDu8/AEjPxGUMFsB+DaSHs/w6CvwxFAuIjLCmxmvj1wuFcyktmcYMG4ccilwa28lIUO6gk0Sn/v4RkFEbfk/wEJkMFFKY40Z2xvBcXoHbVjbHwyTwgqNSx0FVz8g4AjJzfoY/MeAFLbpDqNDu2ZWAuc+bLnbdraCHnY5eBcOoEHK/uyHSJyE7S3R5KVqxcSHJkJnSL4BBm2Lmn0Ezx64oF63EFeBgdScNv8JwoqEvobxBtdSYEvlx3Dcqe2x8GoMA9bMMudb0e8s/mhp/H0WLwSvXpC6u1guI1T48uhl+ATOAKxpfTwX70kvZcjdtE3dVVGiPGBelKEEJYahW8IBIpV9PHWNJr7GWsSVwp97QOdk1XwPNX2uO2stPc8qanPoxMbkE2ByW+4x2YWwgM8WsZorASNt+O/Pr/wCL+1nKfiBEHBjbRf7MwPKwS7FW1KxLmULx6JoFFFj6YisNY1TtYIOYBS4QMezvm9HKzU/LntGJ75Arjx5LKnGBmnsETdpiK0QOvH9r7hSpdSpVEuLVPvowKBUnxYwdWQ9eC1Q56szkUR54pBThNKnIDmH+AgogzXMjCmT4M4TSnqH6zokYpyApF5iALuBVYhAVDBLD0itEobv4AmwI83qnz4a0ArF/nC05XVaotICmhVE98by17Mm5muzomZEYul1z0dEeaYgADT0KRDyMvRX1FmYu/ZjfErLyHmBiKoWFGxVAyIN87Jg/G7Qk3fl2Ab7LulXUPfZzl1mpZe0wxcqMnTdBwkwAKrc631EDlagBelXuSwU4FijIT44fwgiEXRWlgj2g0sGqmiB8uyZt2BCDqNAyK9Zv1UsrY1HLGe+i+8N4nWNuS+BPHCsHE0u2VQDHjYzaxfHG0mm5k2EbnsfL4uktHvEX5YY8eRMBeKmABX1IPX4yZ8xUHihmgBkJcejpsgNUMUYeoieeYandSpnwrxp+H7IG0U/wEDevKEdOFgDiObA5TsiWBX95xsuGKfioYgTMgwdhyhp/wBPsoKhsxmw+54MtUW8+UrOhq8KqPdzrfecPRbpqboPbQxVmpzee1zAxyFdGxyjkc/ro90HzdcolNpYoOlaLmRTSiFpPvT6dXpxDr3BSPD10G/YnEgd/2ZuB3SwLWkQCZ/s0t7klm4qTFd1v/AEQhA9vxlQrtiMr3fOxk4QaeEr7Ufk4d0Ud9i+JTOSYHCxDCW1AqANIaXPW9kgGQPIFD8tFMtnDE070Paka1IYdAtthMCDaxGEDKhctG3nJsmtmDC/e8qrKmxHkr01TpnEvXYfwCBklwlZZ6ZGQ+p6cjTQtoDyh5/yKjhUfVMDm3gytUxMuPPP4z1hrDOBPQUH+6jNIu06CbGBtgofZvQ1sO9Qsh5UTTHKbWev5AGUGzpiGy7IhVqGMnEp8y3b3Ywcw53171Waylw/VeXsO7uov/wAg2xfL4jxSNLHR3G1+ZyNfMuTN/GPJF5yO05CejjEgak1ECvfaiXWodanqiFoVPzgev8AITvJQkPCPGQfF51V/gILXEzYyFtrgiGA//AQPMVgEGNe8ZwWLErc4WJ9MfY9R0XXhdfRVZ3r81Ep7mR35b9cFB/Uo2247He/HUYsuOUEnZ5iYowoIW09zD/wBEyCqF1Ei4ljIu6e2h6tsGI2x/gIkkqGxgn4pBUBiuIAyRWoxA7BdPoVCvV3acew5RSDXHYf4ApwUIgfTqCbpjixn4QMO0ZwlvKKu8kxVBHuzHf9c6pmikefo9WTX2h3FATCAH2YNurW9I0KOintjHKYI6MXtPY32idIUuQCEzec57IFnP8BA/GR2ayY5Cohlrt1cMobvQVAJLjdyFSg25CtVnNyDghNc2N7cmo3W/M2XXLv0XghPU7VhaWjGS3arFPY842OG8aVc3pmoABBgzGl5xZ5wW12afH4nzZi9xM0FtNFCp0T5zHK+QBVZfLc4YLBWhsWC68nbNZ6A/d5kFAX5NSGyCBB8+28DHvoInpyMPdwtTMX3W1Fn18tgzuomIgzGPexbNr98YOgNqP8C8VKWuqKey6izhs0RuRPEIHoLW0vwZpSez2CM4kQBcZc30jX/jgU829f726OSK+MC5wNowLPchbiZ9FDSQx03ANP88Ag8b9KQn8C9HKqfGV+tGu7cr1QNxZ+VMvISZTtQtKaJM0azSC+/SSxygdZmlMMTEE+KWv0vi0JSdrsQ+zcPf/gwE7qBniEAn6NhoSvnEjIIMBInD/Jwcoz49h1Ofnsms6T8Ngk0hoWqDRaC/wsCBqtItXYWfensf4MCSNOFY6zjXKPjyHb0ZMGMX2/Af3sJTJMqJLbaPXO/DycAvE0Q/xwKfyV/80o90p/sIGYbAAAAf7/R7v5yR7kSuhNjpo3RE/g1Xszchc/wkEOWiCd9PE0fz+j3WKMikwMNKQ+tdlI3sBPwfwAvHYT/60nLTnMzcK/xmEm+WmjOaFNjttYWxlOMLE67NneEO6MyIs0kFic2sbyagDHcDY0F/Ivh4kverfETq95tqYYJXthzTtJOJogVGSTRlMt8xP93wvDVorcb1qu/BSJASSYFdVlAfQUHtfwAv7kLtehfcbfufr9Vv72FEJc/6VvKZNROTj80vQQfJLAftTiNoEWNxiaIqu1oC5jY64wrdSB8Kmp7przMFqMck+U3fmTJhwcgU/+QBD8BBGNU8i6504JDpG79XoUWAZcN6PzTTyJ891fduUhSNkEPtI/NiKZgpohnAfTT7xVwFsjXHfNJJ+9A0kVj8tArzv4M6D/AQsFjB9xebCz7GfnW6ycfOa4JkR+DxkIzsJuq2qwj35s+Pq+OhDmLpSkHtRIleL/gILZgSmN/I+dOWnxHO9WAGlWsdrstL1AqBwmxLzWuPcOVUMbdl/pX2sl6J1rI9bT4V2sGAdPs0zy1Hgqu5i/QvNIPOwyrK5nZChC3VmakZKbsCsFxbl9dg3kHTV8xrbAOjV15BYlL+CBufi//YCsejRp6hRfYj7k0bSJuuPMDYGhUZpR/x4y0d3K/fwL31Lh9cxZ3p2RF1dNvFcKdsmy96jPfBxs1xvqgdg3mttXixOUhc6+H8IG7KzAHhlukRv4+1pK2Pw2wQFDQgE0RR/BgTHm/4+GGYv8bCw8h/jAQbBG61w/gDPLXv8GD93+VkeT/QUtT0QHhPTOjD04bfwA2hlVKZfwLHdJEIBLk24/wGACPX+VgVc312OIJGAhco31VB9N+xyZ10qrdA/+FrFTfbcZ2fzGs/ge6kSA/l/TQGRNq/gDRnfVPEJZWXw5G4P/CQG9eQwJP8CayBxscJv8cBagCf//4Pj7/0cHx9h+NzfwLgaMgLeKFw0opHKmqzlzp6mRVA0SgXDXW1diXWN+WyLeYc9PjejBFXS02obnDuFB808+eV0S8GP8DPjoVcf4UBqRQKSk/46fHFw/h58ddP8fBtdwcNfNVQ+ypgv3aCImrbUjwRnn+AiEC5i1CpSWbYg6sYaXRmL6eKUxxkw7rPMkqvAHkZq0+OxbQi+38GxouEWdCRdER/yED9L/wHixNBnVVrkOGP+AgEtPSd60urhbDE05JvXhw5ZDYp3eWJwIRzN1/h1dFLv4qV0XP/n5dFFv5eXRc/+RV0VtkkvNO3FBNPJ4MZQ6cQytlJxyxI5I57WuCdMPjjp8n2d/xkGtawX8C0RX/Vg1rF/zIITjf58DWtRwg/gmVdc5zrps8OFwu9nB6AKSEeVYfTZiJFUSsvXtJQozhgocnxOOSTQf62OUEJ08YjfHWejSV/UoT+eCA4uEBP4AiNM//BAyzP/CYBBAfwVLfwvBhN6fDVIYRkX1NzszyWAfoa9NxfBGmwCF3ix2Ra6dvP+IoUCPkf8FCgR+5CAdyIL38BgAjPBZEx/iwN3y/xACvCiRsGPX2ucg91uFK6G+ud1A/sbWv/w3wviOipjlwCMl9Gy8qhvXDifKxNI/1M0TW8aY9drcPg6+p9InX0WB/AmIiRLx10GZowmg7cwokJihocD//AHe2P0YK/6un1bvMjhmfr8Ukf44EeVKL/aAjyotgK/4jqxTHki3952EKVgCiMBCkJZ4eV6fTBR7Ur7sflRA3+xgxjj//wjyp/xIMY4gRWG37+Ch9yuF/4fmugZQLOLO2JJmDqnpvS5qvulp7RhHcHyzLa94gKJmwZ1PnICbf8BBT46MTOb2Osje3xOgKNDkZyr1yXYMcbqFIuyA058R79RvTW8nu2AXtYRvD3zQRpQgf8AuCZEe706PLi1P5rrdGqFlEUl8gCrDS94ZLZTx4awnYER1z9PrMdTCU2JlolEz2aLPX14Hhc47ag3hKEqKFULQhxDdd34IbQ+YfWT63+X57DsOPO1JZa2UF9BkGNtWBv57TvqApEvg/8FXAeytFGH8PnSaR3V5XuHh+OAiZxMRFfPiRe2uvLX8Af1Hor8V9efrW3dfFtIVc8cA8BZTykbLFQPNAkx5bw5Lay3+AhxexdI3xCoXosePwYtIBo/fZfxazfz2LjfSKmsqGdubEhEIpOr+lQTlzUPEK7xr/GkrLLonlly1tNJi6SXYT6TVq2WdaEkLplnoGMI4APSooW5N75ZJgl2IES/vzb3shm9lYnA5qDdLm0ELOQZcPMsfOirkCc6WlmRfw8cGRAE7tR9hLBQm+lKb7WQJ7Az4AntNzvujElSlcrjlor+X3Qs7rNgZr/BwuMekQgGjEKx/wGACNn8fUsRr/D1LFNfw/SxMDIj/wAcqzRmIr+Sgv+AhabkvKxuH5dqY+0tFF22Ll+Zlg7dxMTiS7yB1x7hyFvwTNyZLdqan9OpIZo2v+XARuzUNAo6pP4gpYv8cBDRCc//sCl7TlDbiLsQl6T61xRNjibBIVMFCMQ/zwJ5gYyjWH+Ogw0dQ/vZEFYJxBRw9uOv8BDcgXUz38lcY3HhvUPtv8eVN0WgG/8C/Pca9q9LHxjSQXk+EaDXWnDOo1OxBsJ/gIlxeNZjVlRUSmzga2P8TBtE3yBF8GC3FFCAUnqppBgUwMf5CF0Dv44nhiEF/ysG1Z0/8AJLbMdsj6vBEEdfhqyvuolC8aEgf/FYA//D4C+hf48FfLP8sAiCZ3PolXMTvP/wyAP+F9YHxFZfxWCP/4iGWX6Ep/x4HVH08/D34fIfx9aLAH8E4ssiPW4hXjkpCT0VEGb3YkW+saHxD+uYYnY3wKD3OtJsutz/Ds2UX/1TNlaH8/TZRF/L02FaH8izZUxJQ4hDQqc1IZtDs4rpfgoCuNs/ex57Jets03axi30Dbj4Je3NPHi0+PntBo+IdRRKt/b9Bf9gDu3+SQEg4/gzh1n2wiONTZQk0MP8h1pST7IaunC/85CZPpcOrsp3Sh/A153PY4/ej26GVJf2ve0sr1bV/wAvUrIUeMGvXH6dcGy+cwizo5j/ChQd4fwIUGpMIBI/b/gIjCIAhH7geDAqLZlO3Xy8nEdUfQtVLVvCvnYTX0nBWx/wYQbMH8CbD9Ffg/paU0y61h/zZhKk/Wcd1BE4vm7qtYgWY08eAugsNd/CYuWBfIDwQXOECMIENHsofwGACK8GB6EhzwoM/Q8AKLXJPaq1cgNb/g4hjR/kcYl/h8HAQv8rCyqF/AwFN/DJzGG7YgSdWp/FYwL/j4OH5/xIDk7w/8djEs/kIE6h/iEYlHcDeAEH+A//rAK40SZVhmhKdG3grVqIRztQMAYUGAvkD/gPdsfoQCv3Uw/gMAEOYMCchH/FgurGgeJ/14XVt/hMDy/3sLq2gb+zVv8vWxgjvia5+bEsegHB4v5c5V187UFq1g/PDyRdo285LqUQvTtYUMe7yK+JSKkubZ7tp9C9K/bLEgSHOLa5jGHNQsq8MCLeY11iQ20Fodh65d5k3pfJew6Sr/gISdrcYKdX+H2b9S/qtxRfYo3V3aScV1WOXi2tK9pNtZd5IvB6dQe+24WmLLQ6jRd2SLSMpxCJ/cwcnJjZMqprjl4oPEyI6cgvO/gG8VPvNPvOh/wEQTMCOZgVEEC+ptE5nXeYC94yOUv29/oMmemHgAg0/I0RMDV4ovRreANoNOHp21WmAd9/XtomOzLSmonVSHvdkdeyDUb/rMZKowEfaaFs5JmSX/qVidmrrmRMk/XJJ7l35o/ME8Nls114eL0js99tahhQfreE7lvGAToF88p2I6Ce4Ba8CF97eeL4opyutcByFoVLt6tXOwjWKP6whwLb6lsRWakLgwDDV/gIF1Qh8yfQgKed3ukoW+5nmVPLnQzajb7n2z0TTbbztA4/RaBh37SRxS1fy8MD37jD981z2AtxbacW2Zym2/YVU4sxldOi5h11VAmxs4gHi7KhyFwkbBPLTbrdYC/fNolG/wEKvHXwY4Nn2nzSejmonICN5hZVF+WqV0zYWoUuthP6AD9p3SdHuJB6suFRdK87TcpwkHLWeSXO8DB5OfjMrLQm0hH28KQlOALL8dZnmUKJznzft71vd/SrmjKuCqOUdAUdlnXQWRqdYN+X5nNKAHHIZbBe1k0pea/f8C9llRLtUcyegiHXsui8sSAZatoLSU0ve7v15D68NEg3321ev/Cg0j8gWl/AiFClQgEm3C1/AGACMcGCiiN/i4PgepDB/OP8MBm+n8Jjc38faJuB/FaO3dKTsFNFc4ndbC8t9z7e6ElEEvp3/FYA//qQPGs/ywAqUX+fgUTr/HwfD9/j4EaM5O60Ya0eTvi473GjNpTMxhK8FHE/r8C//gX48rHdmdCFCAc+ncR/4GLT4ZSsQ5bD+KbJsUT7CEOh06D8P/8UBhwblA80IBYVaYQUm9/4qAuRp7D4nH/wEC9MoP+IguRr+EwE5Edd+CTLfC2e9y3yg6RznLBVeQz9XQnk+dkBc7tIyi+X+Cf1a4hSMWtUTb9M+vnWcNeCIzORa6Ed9H5Xf6oPycBsgBBcX/hUO9/wUQ9vYQlwgElVlH/AYAJWQYHl5P8Kh3txycn8ARJ64X/yzB3lB/Ffe1/j4Sd1/jF/yQkD+BaOu7PcAsZJVAACCYEFGkdXpywOo9BIKQ6m+3BzLIgMhyJZ+/xgLPm9kIBn2Yh3++hZskAof8aCzckZmRan/jgK5dB65QDf89Cz5vbCAF/gIY+JAP+ZCz5r0o+vWtjP+PBGUjR/ygLPm8OHvVPHx7YoQKtLKYUBEiSEJoIbi+ULvW32n4v6N9nctOTiKgJHKdaTAWcDadvF7YG7iuoawH/A9Yysp0IBmFBz/wGAAnl/rwWJNWwL6P/HcyUA6G4oRyLZGh/rmYAlp8y/rJfBhFn6tKkj0hj/YQsSaAf/+Blmv+KAyzUGt77n/BUCP0mQs/50EXTHTkWXdU/h+o5M4H3OatyZX8YGhxhz9BBByWcXp1PZHjx6rMctSlxBLHlg1a9k79Wgrgvy1Etra3iKP8BvSwVFVWiz+AKMnhcyjbgLqIJC/jvZuJcTZAxcgcC3w+xKUhDCS82F1eBMBeAALylRlDg22r6RVwUfel9TgmM118sCDXOT3aWnba40FI7LJWl9VnTl+xMOw9UMIMRPU/dsSP8CX4awRRfZ8UcC6NVy18v4H14NRDp7PokbUEpjdNlZ0CigTvKZYDwPkNZNqW0nb+H6xkxZSvEjtSikSX/wEOtZfiRqVl0uE2kMCw0KOKdksJpiirlaN7Vfe9NMjX6+YpaDJK8WDkDPziUMO1mDfExnaEfoCslLQIE6moxYB6PKngCjRdbf8AsDu0QHCouiGs1qDzwGfHZSbTe4w2nre6ATjB074bgZ0tizgHLVE5QLvllLlrnqXc6ZEDbJJifBgmBQ+M5WLK41VamWXqZDYdWOKlyGz2YBOFjwII2Xdk9drWeWYCOf0hlCDFhRH7a3QFl+C6H1yHdQz38CzoRjnnkXDcG5ctbz1uE89xtcB70yZ8CsJnwUD7CKPxJP4CCzsYH/HAvX8fwgE2WOR/AYAJnwYENW3+dhev72/w8C8r0FSKf8f8y/+QBd0b/HgvR91cxCk+gAvRSPawqSy/coue3Fk9n5OG2bP/yiV0BoEr7/vwXr+9/8eBtVjB/A8Cvqi65vSglb5Z5jIAjchXBtqRFBuqYb8/ZbYEt0Xoq/NcJn8fgv79jviRglBoqKGKUMLmakNA+Q8+05SOm8i+mXBWvZn4O0JBLuc6IooBziq1VzlzeF5pzzJ8sHdmzMSCVmzRpL/D+ASAoANvbksKDgOzwv+AigyX75a7q9DwQ6jV6oxh+cxcvGFaSVGx/1cL1/fxEFrgf+ghev7fYPyDlHuSAAEAA/9WC9fwH/qwXr+A/8zC9f3+RCVQ7+MwFHAf4gCL/41ArwIzyML4JC/NXKpocjouUo/pXnD4QxJ4ddLnXjBp0uF+oP+HaWHRbysC2PMjni6C0i653RB87hgG3omPi8+yY93NXxKzKMaGbls/gDAKMHSJj5o0FlUe3BfsuZnmcn+BGExX/wgDPwyf+MgTgBKrImALEH/BgJqk6uhkn4d+BbEO1QKDM+AQ4NJCnavVss//AB2kjNysmYh1Wxd/gIZv0UxBCJa/6n1meb5a0LJ6UjIaY0v4IxW3HPvbFbfgWTrM2AIKKzKaCcE9frs8ey4wr+PIb8eMSX+AXnT5bfDVkfryBoekzD7l6QFfbc9qrvxIxMFkG4CJlWAGPZvP4ZrXYP+ka13/Pwrqr/GKcd/kIV01/wgGP4cPRF8PP+keZR1gFb7hlfuFMD+CIWsEePje7x6GcFxXVDDshO/Qa7iSy8TL7s/KUHw4aflIBtL/A0uGCzH+kBX+/tz4dIdf4ASXQP/Hwr/eeOHGGMbXCMlap6EN/AABa7eaVA8xV3zVT2gOuyGgllTNjtom1+x+qaYf0Npx6Sr9U+NUth4BAj1DdfqGyyEWMLrbsH8QS4f+LCcGky/wQGbTl2AfwUI6W9N/konBq/3MIPbgDr/j4QIlZf45WCkgsU2RAMD+HwB/ApP3MVEK7WtadnbVM3h1B32Hl5pz5DRSCDRUhAKXZUeDBX28/xQRA/Q3+CiIl+5DORzbqwLMIwmOXqS+WNUI0WEyUroe6Hw++m0P4ACJ6W3m3iw++lIO6f+S2Oh/hMCx/xoH6Ag2/GZaB//4OJy/ysIDWf5+Dicv5N7Ez/CBEmN/rYOJyhA6K5n/+S1TL/LgcSkC6Q7dfxR4pkiFFEYIB31ZOCdoSufjoEjFoFq3/xWAP/w/5efxbFphIJPipzAH+ug4lIIZACjLmz/PwKmmAf58EBKgEJDP48CwgDBwgDCv87CX/H8JyNj1F/xQApf4oIfCAaG/if7LCvyE5OsDBv4qA/wIFAwb+O1Hv+FJWyB/ijEA4fAnPoECAAE/jhT74XAG9N/oqiv4BYnkl62Y1HD2b+96gf9EBLF0/iQbJD/i5aaAf4+zgAKg/z0E3nf4+Dd0wNg/z0OxjAgQf56IAmQSg/ysIHk/42IIyP42bUv8/DAJBI4UEEY/5Og5HL+HQL8nF+rWuSAN0jWq1GjJgJ6kAP8BC/tNY02sf1TafSH5JFbHMfxWB//xkBNkH8ek4YAQfx9TL/xgC/z8fjs/8HhH6qkZBvVg0Z6tRYSPO+EQVJJF+kB9tuouZAwY7Kg8YvsX8vBP5Ajo0+Q8QTqHwtev6ZR9LXf8fjSA6+pDAUaordvZw835P/HQN//EZBH/D/fZ/kAfLt/h8ET6twWQxckRPHrW1SJRbxKv8CCAiseaA0+s/4MBe5S8yP4lu+OwbG5t07ihJiCxHKyn9SMxYy+hTabRRCzZWNyskmS6iU7+Hfn0w/qn590f5++fUL+Xvn3R/kH59/gOz8zANcP4Boq1Jpah5DLwZ4d88MFQtH47e0AWsjEvEhgxMKXms/xsP2Eq9L/BCo/HwYTaJ3+ah+wnpm0tkZvTk/j4i+cD+PZnUDYP9dA+YP8CCcBwbFyo/3KJQRSZyJJ3y4CYI/lhC+//Ay/V9zXh0T8wPACNXwSh3faXjWHYVvEyIKjQSBmK7ATsLcN/BlaMkvofSuGCUQYUdUeAywVSleRkhX+fhBqZBQzlYA2o7NnpjUlmNW+nuV/k+qRBRUgRUFyvM4UOXLDQAFMe0NqiYljpb7RICf8BAjC5RQtIKV6lCBaLNYO0EasAqcg3/MIJ//D4H//DgK//D/pD/FYL//GULMoNZZlYaHSYtFcAN3TkbDViAz2tDvq91Z1pt+VlmiT8AbwdEpCqBO4JyQ+lqpmoLyFo15lcyZz5/5gZR99NxBNwASpvQ/zWCv/5sG0hP48Bf/4dDvlSi/l7kl/j8iAQHcRLRS/fqY7N2CGVhAiuGqc07U5cYEdfKmZypOgPaswkdD20oSs9s9GfkJprttCJjyOraPyyMsPd3/wEH72a0Izr5ef5ECi7zjqy4Ft9kAn88pTZzXvpoNc2nDX/wYMM+Kd/jQXbFO3tRed0t/h0UeGlf+Po/v+H0uXpAf8TBVTJNeVyJs8gk5LNbvJpXpl892bZ86f8v7mLMtkzHocqpLZ7T6qi56WE3waUCTpqHu28JztOMJceTbP8cYN4vwLEY5PGjfwva7wmTj7ihI1pZrUMtmwywOnTjqqP18dsEDB9CARxWDUGBu2//HTHp/DneI09v+AgVYX+H2PTf/AW9PswCzjGrCNXa0Adabw+oogN4s2QM7lrqVmikbgUNPKQv1EvlrLvkIUbOzHS8ufIS1pL5wdZXVrwS1+Pd7/JQcQL/wH30e1ri3uXgznsst8Ab4Ii7E6yfS0JKSQKMXKHOHOW5hOp4/8OevCj/f/3F/I33EATq9/Az1l/ABiVNvLgMVBJC7xYxSND7QwDK9hMUexRr84gdYsB6ruV9ER/igQFNnCCDr6EAqWtgYMCnCiUFTR3mJpjGysoIyiR0IvImBAq+oh/wYNeTXCDCUd3/kAi4KYmNfYjgzenNnbjALAIAIf4NAAf4IcYuICFdQldjc2z0Ivr+vpnhSE1vThpMTidLmh5fNMMznF9a3/hNle+O2CDRaEAnGJrv8+C4VHB/A84sjf+PgTGK6a+ostI3eCz3Awgvs9LV9YCTCp6+1+ciZbfesf8BC5oYp7I1GdDC6LkeE+0eaeTqDEBcEfTbXi7qufsnxUHOxHT1KfxIGySV3l+zKkIvNNz1qrYDg3uyHeg4L/4KEfeQSEAmESof4DABKqDASb2/hS+Bhm72aF8T6T+Eb4D+EwFYGTx1q+dc5tDfJ138dvzSv57t8KqrrxVqLCgY6gIjQzvA/h31HMf6p9R9L+fvUdF/l71H0v5EPUfiBZfH1cj48ZPId7gJiWDJ31OTeHGjlw7GXqppKRQ+fCIdv8cAlf27CAfxCG/8BgAiBBaOT/k4KKE1bK87qirMsr8VIgf36MSryqfVj+HF1YGLhmQu7ba76xyCipxjGrGsd5nNYuQKDL4PQSOVmRw1TaWWH/BQuotxBO3cQZId5unFpQcVRFBXZRAc00Qo6DtK+z5Ufx7DJ2nT6qndPRdguQr8gsumkftyfZKRjGIHWXf4QCUuv8UA4+TsTHp4LLTAf5EF0UzL+qUW3T/n5FtS/5eRbdP+RQx4bi7g+wouh0EDLLvZR72TZm2D4lcS9OnWTht6HEWoNqd0bMTyfiLdWOklsQeL5Nktl1Q88jm/wYNjwBcIBZGNJfwGACTcBgT4Yf+QGx4P4TA8v98DY8ACTkv+vhseCMU8wtnUJA0pH7aGakxkyG0PutBlpxF9qzRUyt+x0rPMIzCsKmlcVz4BkqsyVTtR/f7Mjdgpey2ghNfjt0d72w2aqYI33VT/LKWUPhcIJHUOzFm2Gg6yBmgmSMuRLtjnqgdkszNoExEn1OYkzmfxWHYTurkJnl4vQZgJlG+dPfZJoQ+ERrmLKSXMge/gMSYytNIUiPpd564gHee+3YmzeiOeEmrHJ0ljJioamajCPlWwNZAPERMqHCOvx+SlhWKKqMpxx50Py5k3p1SpAOcs3etbFYDdMYt5E7iFNDMxkq33PgztTTs3ug59EuiTwwMY+9PBv8bIMb8y8VJ1za/ICB285yy9fP0KLtwBxdPBW0MBKnDJcgMjpqXepSDdgHO1DJSqEJ82W8waXrD3iUhlF5kmsReQRbdG5hUscNhC+SL8LkX3/WLzZC6JiAfw6hlzTXTYE2CXbLHBQEPGlFe3s2Mi6XNMyEAfGMhlfS/O+F6be7KYCgyx2Jngjdndq29MlCYiR8QErx4zheY8vHYWU9a2jf5dN9yNJ4Ee/FYjgbM4diHkrCbkPjpnC+mfsvuSDtdXfgM4uuynteRC6p53pHWDJGZtsNHdHcCUkbf8BArptdtpVMFnCSrgOF33fBI6djAR3x2VQmAhJXEgDJO82FwLUvkoS/JyfVpcRiSk0gUv5eyMyxlNUT96UHV+nOb7BGQiHZpCIOeDh/wAvneVlLv2KYyOX7iiI7W4Cf6AGx4P4ELLCHflF/n0K+qxLsYfpqBXJqlX4i0Z03WFeRstgy7HMTVIVrdTf3LhhrLJLYKjza5u0mpsvBe0C+Qf/BBFz6MUcIBh58QwgHD3vFBh5DVSiC91jl6/OAgWfag48IfFQzPYmHtfLmC/wcObQA5JtHeyOf58CQMtA/l5CB4P49wTQb+LOXv/MgQVL/CmvuFx9N8VC4YnhxfzAnwgo//hojwsBQ0C6V6l00URAdWfl6ubjDlH7PJGBpcsZC2eTfgPQnccGDdCMKh3HZVeD8wmt6H+0HE2cH0WB2AV6gA6Rkh2JFY/Z9AANO5sVBwfNY3kytr7oEVLWRZgDjMQa02pYRKeB5C8bZyBZhuYjdxmZVZLDpzD24FwJP38kQjm8x5zDSjTEtsrDmIABAv6CxusQDYb2fhozPkeLP6DB+3s15S7+piQD6IWh2NUTYy+ABeFo+zFYQR3UhZthPVcJDpoICFLmAtaZ54cBCegAFIlgtrZT0WJrIv2K+V1K667ktaRH+PV9Njjz7vdGfKqnGrRiFBD4/fe3gEAH+CwO5vMfC2l9/4AsvjkBpLoxlE1JSeYBBoAiaOT6gSUP+8fuk1PlsfeUNfZp1t+KCEYoyeoiAMUJQI/HnArJFrH/twRJkePYNhli0rnxWoOKvceiO3ecwFXQAHLJwkijqWIGDM0pY9oWiWKqdAw51luPcfJFT4e6G1mnUPraylwgV++utr/8isGWmXvRDfVnzlUPc7ErB4IxVb3NAAkD7LugWUu50BXJseMJwvXfVLOhacSBV4HzSjL5beI6+YSJs+UZXI1C2dgqfjIzsGnDyK01o0H0MTwKqzYkSr3HyAQoepae6vmPR4uqjdiC5uYza9ZxId5lV766fPmpJj0AVvtAiO1adQnK8teuh9cOaZbkp7/pVTPB3FsUeigHUYycGIADLF2Krt/7Kv2usr9/ccd/PYBD6VXlLzipYNUlxMoD0DFjMVdudpkYsFW6flDK6WTdYCp3/wRB7m7g37NEnEfvwKAEN3fbsJogGa/0UyHQfpm1cmlB9pi8hy5RzYvK2xwm5wxPEseAREowk7RSZK08pslLCx/UWLg5nDHdBSmnnjCCN2eAA4QU3C89zZwDeWu/gC5hCUnocmVZ/wECdIMxVlFTW24qQdFBop4a8UG880T00I3xisJBG9IwH0g6DJ5r4v5aJWrPIF1jRh2AB/0goOC6/dzLn3CTTSkSA1rrp5nt109Gdk9UF0ROgF8jxXH9JIcnBxPbFrnaffDL9sx4ZavOQTuO+1tKF6XQ8RnwCIRorGLJOp9tjP6Ca0QBxQit/AF3X9XZy8g3FyyHZh+GMcGDSGULOEuLY/G9DJzcs048WbipxqjObZhI35iEUsPpkjBkANPbxD3+Uh1P5DA/KQv8JDr/IiOum4AABOIVQUUf4CGyloPhBR3oswXFMxzG62KedIBAFUASeUGU3Vfv4AYq2Wge+LyCKyMSzfxDIAk0MEyhfuZjyr/gUWLdf4GsOwJZ/wcSyafwOBJP8EATYK8dn8DE9/SG98g7jlpjU92kbpcLb7py5k69Do3L7cIVeKjsBoJH12ujLChdfdzv2U3cQHM1qCZzl0pTReP5cfJMcgOd9suiru8Fj5OwiNTEPrN418/GTJyxcbTBUaV9v6kTQ2GTu4AzZnKNAh/gij4qbAmWdovVODx4Ew93hmLK4e0OcnWLM8QkbhtuV55qcwjGeu7Oh7eb3nrTytbMY5TISg2rq5a5cvAaYgyn0Y4O1LVz+SsNc3pemO0ARiNjD7N9rejMv29+P1/4CDI5oFIY3w/JGPaUOh7Jsgh1/oI3f+AMKEFKSOxJMyhp0Pqa7MoCL1051H6oteHszbImFy2mWRqHx7WE/KsyV9GrFAwnanEOj/PwSZmPYH+DAWVthP4Ewav/dA/p6Z/jQi52obP/BArH5/AZ5nk1k0LN8y7Zmy0wPNzE7N6gvhpjdDJvE32APH/4SCj7eIpWBspogZXaKoJQ1Gs0w7vj5g8iAMR3C3tgMgU5OZrqLR/IXUBxeEAm9ciIMD2+/+OlAfh/h7BhOP4fUBym+GBQzccw9tpUupf8P7rEPRYRfAGZH6RTsnCxVszALnmSgWSSD2qssWUXGn+AIel6J7PWcoWIVUZA0rTD1YPO8tXE/yQDCWH8CIyBaWeAgeguxO8iZyc303jmFxHCQWFHANlQ80fVaX3lKdMvXZ/cNaJr+15RvLTtHi3j4n0JwBv8EaR8if3+u2fyOu2Zy2A/wKmQGTG1rqgLNgwNSA8aKgpo2PXFQwipFgx9HopabykRSX20JFvx8LWzCgKisF8TxuRvfgKL/D7vw5wQXcQgFU4ZNBgRGn/5aE1MP4zeKv4JDZzaiFg8i2dA452lDT5/+6GGasuTdTfN3FxqBr3cMoR4zmrPh2QvM/6pkL9T+fpC9Y/l6Qv1P5FV4DoMiYksLdMAzU/326VAG1JaZG+pelDyDG4qMF7QpRI0H/jla+koEtsCAfAyTNYV7v6m0fBjH/BQKMwBUn+EAkNGK/wQRlH2/SqPxwzdwcycpzYqMu6jcUnLN+QEJIzCBW/wKsmHwZqGRUxYn2NazIW/3Yfj6zu1+jFZmYRrC22vSVrCduT3z2+GDZtElGEkhuZyQBd01SKOjAHw5QOLCATh5cf81hlX8BtE0hIGOgAOL0aLz+cGJNEryObVdkwFRvrc/TruzlPwyEB3k6qd0tolUyqDKoJoJIoTOlT4e6N5/ggVU/NP/GCqh/2QftOga/rYKxFxWHjKIBCKeXwpQGAJe7kuR7Q5bOqNC09vEf/7KFVP+I+/28Kqf4jV/8AVU/pE+9Cka68rbMmfaQpXvPn5sJ9hqp/ZxBhRYLV76EIOAe6DmBPZteB1SIxPYxOok7Rm7zjdB6iZw2gKTkEjV4O5/z8Kqf65NGPA9X3LIMPo58ZlvY0fnBebkDwgYsysV9w2BDTupMW/182ZpaP9RqFfdvdXOK6NR+5rIwvSmliQGsZ4dZB8fx/HwDQShYy0nW6K1LkgxArrQ+A2HyHrfL27EuSeKz8Fho2DAG+3+DpVYnVAss54JarZVVh+iNxA3D2Ymq8vaQwsdANv8fgP/S7m7a4As/uAL7jwixot/AzuDQwXcx+n+mOryNjPBek7CMNkwwS3UoF3BeK2mWqCopJoEBGH09fA2sGOdHDGZiH+SBUdrtjsMoX+EhUAQk3cERGLiZf+NJBPkA06CC2r/xgqp+qlZt1YwWAbnHDYKf5tXxT63qwmwFBHRWKPMBVjlYH4N/AwlJ/sQVU/4j7/bwqp/vyX/QBVT+vIs/Td4D3lYsKgsA6ST/AQPVRYGZpjPBOeh8BRH1MYssBw4URgpuKmJw+wwoBZSh70SisZ53aPdJlaS5al3r8cRjyH8fh8gppFmLBX3oo43HqBTlWwjQe5c1nn3y3nV+1PA6FmgxzyharuHntKygNq9WuAgaR3YAnsJ09Tq7nhO8K5EGhdZHf+SAXoP+EzuP/CBU9GwtVN/AYAJXwYClsfzsAUf04kZQ2YWCLdUOP38sMLU4yw5BDtfQTxS78UO31Lb6XQYDDD+F41AY/kM9I4v4e/qmD+Hz0gsXp9vyIkV8z8DFSg627D/YXVunfCJf8BCQ+NYslIpH6Mo6Nx8MG+AapBKpIfHj5Lv9g3H7p7yuyZX0AupcDGtrlRRfxRNa8Em4cvAvQCIGfBPi5eu1LtybARNu64sAfxmKsPJQ1TTsiFRDj4UCn/AGMsS+/JqyYmhIFdSs4aFJPw1nMIBQNGj0GAl4H/k4JxH/xgOx8/wlZZbujATXfGfwBkFnf8EykAooUIcKlblCpbWHhT6YeV1eMtmgzOS0UkQk8TirZ1/gCJqxbX+EwKb5ApA6/gUCnBX0I/xYP02ATR/x4Jpk/hDBcP4fBKP8ZApbSrEWrMAeP+ugmmQB/hTkl7wAA2QAAJ3/CBBjZ/CbR1mtcwAFPNzcDmdZHYfAV9KCGhL/4wFEPZjHVLhqtbAogKg/h8Dt/38E1ZfwkNRN6L5fxFGMmnzVOkPq/z9pDsP8vaQ+r/IExxoGzTNPViVshr4KBBhU2Ix4u4T3Z/BFfde/3tX3G/gTtHtEtjSXC/nRvU08GNCYxtOfx5FKlNvM/wLAKqrSQ7O6c8GIaUTs5YjaT5gTKEhb+BuaqcVFlA2S2t5BpP+LBbNr+BFhFHnv9oDIFC4AyOkV4YkNBRCb1+DYLZ2R/ztAHrHBevzomrlRmFgPDW/8+xhCzP//wyBR/xITgMdV6//woL7+BgT8/5AC66G8/8S/JRSiq0krU9CcPI8yVhjZy9Qhswo+2Pw4QoBiH0hYpESOOrI/wECqdiXMXvxYapHinmu61v5yqWGR7xqeTzjo/Lice8ypTyc39Oy6VkJwT0jud63h5y22igMgOv2scHTucz00EdfZr2fcweW3q4uw/cb27RFqu8Ur24jEDxtn8GNKXb9/hzWrPNJPwwn9Zbuo0uGK+4zuhHdD7MfauO0UcTcxbRaQvH9fBpSwhssZTX0xKjYklb+B9CIV9SPQiAQPfw+BvjVvxSRa87f1ULWQfCFFk2BmgZrvlP3VIekjwNG7rcXZB7zBLkiPcZM15Zf4vZkMGvcCN7avP5BnEGc4q5r7bSbhZskVNaLXUX7mKz75tu9WrVX3xjEevVFA0TiZUtXrQ8n+RquEcEZzSDXgN5T+pJv7Yy6rUuBS+wWcrO1+ydhpstI246hmcuBcWAyCC5lEYSmz7yhzycKiS/wAHfNKkMj/481cBVzj4U1aAlnDybyOXe4e/wjwA6j1G2RS+YY4rbTeMS/gJZpqMz93ILe8ZJpeEQPWTR4HNED/AIsVrX+RAT3cGTi6gV2dJgMAlK1RcWc/xWKO413g1Wws6i2FjdL5Ci2Bq4QCPAw2gwxnUv8WEQhAUCf4OIhCP4h/piPELRosGxv0hS2X4d3iCf30RP5lv5dktQbA/z0CLO/+GgRLI/xueA/hsAP/1EJR1gCDoE5cGLqrTh3QwWLIw1ylOwn3uz7SicxXIpoahSf2Vtaz42pIHfrPX2PUfwAImMsbIgg3x8XlAojg3RB/ACJ+zl1DYMa6yfx7x7/wbDOn8JhFX8fJOP8GC0p5LR/8GcI6F7n/B/C4LeDLa4aUbp4n+JAoihqppK/MP8FAvt38HDAX+YiXsz+H0oTm8PzPQIiwAALD6zH+EBANGqjZwEAGpzc5o6BfTH2MLrEPUC3H/Xglzfgf4+FIwv4NHRgQu/k4L//1wKs9gzBYYDd8og3/wQpva5ZyLBnBlU/A66r8XxvGu8fN6v8VkIv8ZCR//BlMr/iYqej/hy4oCQT+HFvY5ZjbAO51u/gcTEARSPWUcxjV6I62qH4WbKAbn5sH45P5rA3/8yAxmP+QATHwgP89CCLrB/JZQL/qILhJAWD/XgJPx/Hgt//DYe+DKZwl6XWy29/HrYsBf5YChbP4yIPwH+NwN/+ZjD2y/wwS1PDa5srn/iAeizMTA3MC42NjM0Mv8TEtepgbHJcZnJiYZibkX+GiMmk0NTgwNjExNjg0MzI2Mjg5MDMi/wsPPrhkbsRkZszKYJTI8JSQlqyedGZiYmBmcmxmxuyMlnLe5CP/h4egcNzcxOTc2MzJ9/xYJ+IcS6eNexYtZivlooQMc2OZp+MNFhN6hWpNslspEagW3R1HJ/jIVo/hf+RAsFT/D8+F/qIIOJHbD+Hppa8/gT4E/hfuiGf4DnmuD/PgsD68f8Pfg7v/D8804IUH7XElANaeUuQAX02e96wkBT2I1bFDzCg++vxjRZmGW2VnEOd/gIoA1pVbW2wyAeFnikivithHbFZ5Ei35GpcTVf4lnmv4D4XWE0+ym8d7ILfOUEMZnI+wU5vCfYg2B+XaueG5J2/P3kprX6mr+AHERGN088j69OY2sJGqBSlzSiQL5CniC5wgD/woOsthYHSf4qCuIYBsNwlSzpuuQpF/y8Osxfxafj/wKAzf6eJaCP8uAKU+wf5WBOXP8yAnPn8WQXtBcNMjjEMWbYOjOpRdPWGcGwN1pH+NBCVHJAvCHqP453UwQTIlqyxv89ERdIQCC8iSsf/HpEX/D44WDQCnesNaaHK+tlyRDBG17Md7IL6CyK2Tmfk0c+Suq7Hjvf8lmxf8fkhf8Jhvv+Oh0r9zz/9cgVLx/kAQfK/z4Ung/4kIkn/9ZAkMf+fhOlv/TAeDn/BIHB/5+BZqP8JFbYP+aA8Xf+EgKP+HxMf/MAq+kIN8zP58E5W/4f3ZeT/QQnJ2CG38lgof8YgkeTif88E5O38AZ3JPBb0AAL/BRC0b/CYOL/lghaN/xMR2b/wqEN6zKQwMnAv8iMXH8Piv/8Piz/8Phff8WD0f8CQEXZJ/wwReNNSJcR1/LT/HqcXw/w5mxNPcCvn8fMuH8eIJadgf44CMrrehAO/GzX+AwAQ+/yoEddfxfi0AQQlOlNaBfOZQ9zBcrAU+SIEWMVFfqs4gEo6u6JYn72nRlm2n+CgQ7tlD2NvUa4xSkzFitoScdBR58vAlQ0Lsny2lgMd/b/AgR2MM3+KhX2QIOv4FAzt/+CQM6DDAi6/xUI79GmmGUQCGAFwhAJ/wIR36B6D+XZ7oEoP9YCulf8DAo38euhwHSf6EDNvBIAuMBQP8nCPW38Jv8XAAanAAbPwAb2QAcHKijXkAMYodoHmpfrAbeqDmbvjcrIt32qJA1nHWZxTUZwUyKWiER2KysDl1o1MB+aA0fMA5pEi8xOyIYU1g+0ff+ChXSL45LgAgw3YPgAyP4GBJv8VCOK/OBgG//woPaVAQYDAIKAwDfBAEFAID+FFjD/IQrp/UAoAAf8EFeC/4DAywHPAUAQAQB0QCA/xMNCCfwIBVQH+RhXergAhGC7QMCfQUAMADAp7/Ige5hWAoBwAYFPf4wFd5tYFAQD/IgjqP5/wIgI3+CBXd3/BQjqOQf4cJG0pQBlAcAIAHJAf4EMvkACRECYAJgBwAgAnQCev8ECu/AE9AT7/goR0aowKs/wUK7PGaBa3+ChXfjqAvP/BQe0+KAZB/goPaxIAMp/kGC4/8/CvBf+hgg8v+P5IL/MgkX1/oAR06/nqLy/kAB7/x4CTdhr/VPIXrfz9yFtP8vchet/IvIW1PN+xKnMX79DDXlGEaxoMEPbU1mM+h9Cj/NcaIo5aJVPr7VeImeynugxY4+L2Wi3mVEuF5Z9ZD/joMARNGJ07Ls+/gr7qcBGXUhNwd1HRIc0R7BLxA5QxrtJe3DNXwhUz6d0f0WQBv8cB0vP7f3tSf1z8gl16vXIr167bgxBHiuMYU5PP/ngPNm0gYaP4F9wXgsCcgIi9zOFI6dMiFAzdnf9rc7gBTshhTt/gAzkrlEVB+NdcXySCa3tzL3dOqtH9fAWcE66JyB/gekP1aJ/mmk/AjAtQ5LgvxcE07WbSHjJlFeJ8r3olncP+gPxiUlFJkFkP9hDctACAAH9/0n//EhYUzm+f8KCQzRFCCx/D6UKDG41nsGLZBoreU3eqt4/dHeP0JHjWDv5peXMCM4d9u2nLt77U5UZjz0OWmRKALjcJSv5YeW79qxbG0lZuRTZ+jOZrrCriKX0WrI0xcTy9Ly6IYULdGdcNMV9QwekUnZDi4kPxrbHBrNmlyDF3yYKhDi7DFJ6PPXXAtRBxmC9BL6g8SxjHrlIYY2tLV7KSwtkNrLoVgkMbvu9qQIM1o5DBwmkEBkzyyKCZA5tnNyk6yYGUK2HQLo8MVFghczPhlZ9Av/D5wYDOzzsyzYOeEoNrrKzQrQ7FWUCAuSO7nZ51SsWxGJRm0/JGJwYQBqUandiWSJclh5H3Etz3U+bR4axrIYgSWKZX/gIYK6AvE/9lgobIMmB+VsWfS09CnqdjYIssFZgfGx7y8MrcYSQeLUhn2tq/ULJJdqERG+Bcdq76xef6Fn+z1O3B6tBVxpBCxsTYXdxaJttGz3bH8C5Si9ao4pr6YGzbVLz2RAIjtpjXQVzAGivJHgds4/XT9X1OsKxB0zpbzmEOGslXT+Bf5F+z0rCJyWeZa9gS50FRF3w2SY3RqFzzAiit0vBXMFI34n/xwL7bCX/7AsLXwfs6pdoJaEC/iMPCZUFlMlW8KmP+PGZ11t43+Bd1GO5RZHZYD3w3rc42swept1d2mf6q0T3MLDn3lsLYFIVNkUKI7TOQi6697ExKLQXU9+i/HOP/AQakH/A4Me0//+Bc07/SQTeD/gwEJFksYS6sPtXWosTzk/z66aaOG02JGxKRcaOOTgVYOhfy6Z94dG6/02/2aaZuVq5fibS7iYisB+DtggHdhAI2sYb/PgoNfm/w9xw1X+Ph2Hw08uXCQug2EPCXjj69KmnAts4KVH13fukK4Kci+wKjoxj2tafEIPix8SQDeEDu28+axbA/aZJwHSRkrvapnBou7+JeOH+A4vOldfTUjXwa75m/SGVOEK6tF3bsCyQCE8yVXswk/wKWUm6sUQXlbRzIt6l9a5symCKcOfQVXx/wUIzDsUIBLD1R/8BgAlxBZWrKf4mIdaP8mFMU/+QCKJz+E68L4bIGshACOIYPgwFlfv8nCUsX+PiDij/EAvOr0y4N/BMhXM1JEV9gG1CFupj0Vj9RUCv2WlTJo7c9VE2So4YEBOwYs/4whAlKG/gXoS/8sIEpfylI1/68ECUh4U/yQFQa/w7xDGN/VPENr/z9xDPf8vcQ2v/ogG0O4p2muQLvkKkcAjLdkcbmXYcvTOjGkBZbQlpKpAFvYkAWdj+FWjUQc/4FbBOAhAOnC0X+AwAROgwjoCP8WC8zQgif7eF5mqfQh+zZY4NSlgTMxefISrZvBU6Lf4rAH/4S9L/4eeww+g/jmzXHQxpn0Lbv4ddK/Cp/1sIUckdVhv3++heZoLIP+vC8zX+DAhL0/jkFj/jgBb/mED/Cf6i2O/8fB1loR/Fmp3/B4NH/xEDf/yBsdzd8Ad+LmrLLuX77ZCLLtEKFbOEa2M4wUzR8K5fFatXiFDSKa1SpSnIF3A6d/A1RYKyQAV6XV5FBE1U0tQF61j4Dn+lBoNH+MWpL+LaK8h4jm3/gzarc01BP4cBf6mW/hWhvAlb+ZtjvnSf89A9in8/7HYD/F2x2ASt/JwX//HjKP/AYp//Rep3/j4cAj/yALI1/wnIY/1Vinf8YogP826nf8XvqX8Pjz5GQx3N/Xep3/FZD//KzCTEv8F6nbK2MP+IikDQ1OTguNDYyMjb+EdTj+BABo1MTkNjn+DtTc1OTgzMTY3NTD+DNS4zOWIyZGY4NWJtOUZvTI1RjphZjNjNTBjOTVBWkF4N1Bz/hTUuNzIxfQD+/4cO/sWHO/jE3j/kGHOfVhSp2Eq92OhDvhGaikokqjphbISSVx7NOjkUcrgFVv/luHO/l4B7QSoBXSBus6hZK7GTCVwnO+u9asSRJU1Q58bhnUdAx/gqVdTeJIog8lxWtZOJwBS7VDs4U6jK0saOfNuq52KLEZUv8fA85/32YOqZiAQq53vxnCFkNwyxRUIoCXGVIgLzu4X2jOVfuB/iwF9KjT/CAL6VpieJGCCrK/AEXJhKcCnCmOrrnw0Mm+JMkNcPfwVAv0JnjcUMKAjIkp/GmaOz5EyldSgbsD2JGtrGvU7gnU+EzYeebLOMLxd52aGEQTpThUNyrgaw/DgC/44CgceGrM3SgeAA/xEDki/wmAmCxbJJVafsXO4SJ/W5xzqJxDqeVPERFOnS7U+CCcFgoG/wTCF0/vApa3osmv8orYkZm7so9WX/BebSUvT2y4C6j441UGA38KLDvxA1sIBFS4xUGEXgF/iwoqv0P8IFFV/+JCd/j+E2msE/gNYOBDvIniaAUeuAAH7ZulEAhv5Fc+02sKyBlOTC/fTWv4Dz0M+F/LAD/x8Wv2/4mJWVP4rQBf4JiP6pz88gWSS5YL7NhutYl2Ep6E8zZkT6A/RqnIFKlJAb0cb/HAh1zXf/+EcCv/SQjhW7/wLdMWGUNkTo7bzt4Fxib9r4Bje9sOG5fMcXRRab+YYYb1gORNIEs6djIELavJAfL5DwPHvrVb7rzXBAcXCASV/gIlY1grY5lIVreZNF6yD3TB4MXsCexBvLOIHjICE/BI1u/4EkNf4VZpo+C5whjhAJDv0v+AwARUIMINEX/VBFp6Rw3+IC7Q3+E2jf+BZUG6kyBKzFwg2Wzk3kV45FQuH8eTwT9BSmZBPOBH6S+EIFOILwgEWGj7QM5cQxA0RhsUN0D+vj+QQvAYQCtEqQgwKPWP8VEpJ4EU7m1h0pUXIIAG+MKLX+vA4S7/Hwyseif5eMARpiZ9i7Dxa3C/8BEp0MM7gRgTYj8kYQx/HGniAALMGNrz8/w9wPEl/nwP3XRKO4v7r+GTAguDV6hGq//SwzdKhf5iIWLAK/jro9Nh0bAAT/XRCE4E6P+fBlJP+DDW7+Ewzj+KwBz/zEQgecTP/Dy9iO/yvLH/5AK7E/4E/br5JxR8jzb2OKMoaes91Jsh477es1ZeBH5z4o4X4++LZXb+EyS7/cQIzK/42KF/fJT/gMeNYP4EABIMBhwz+Fx43EDUm3ESywRVfXDiL2O8vTo00avRiilur7PiW8XR3zAr3XV7H2f5dDLv+HfYtqs6O9/x8DRh+LfoTlhXgpa0je+jqvY0Imeoe9F/Vka4Bvo3wPfs4SkqxC2iQkCThcEoIfC81mqc4zDJX4AFeMIygDzI5N/4CEbhWb+u4r0CAGY7/gWxjxPbpLr/AQYTzkckfK4OKhAdFWz+zX0yWvwuhbAEKWh6O3VoHf4sDGsYz/JwY1jxZBR9By8wf4CBPs0ep0+yl/wEFZgJB/gIgXTkKWKXX1x0pmtFgSU4W1+dtw3jrSt4L5E62P9eA3N6SBcMhv/wQHY4qX8BwZCv/AYAIy/4yCfHfKyIteaiyA/gW0xy3jXjCvQN+GWRxKhCoYrpyWdIHbTj5likoPZflHeeGRhv4csy/f+/yov/SQkOEqfwLySZ8Yb4aJoCI9zi6q+/clZgbnjFKPjD/mrbEEFdXubpLiGw/BGM3qHTOmw2pKVEnYXYiFCmz8Px2wQOQwgE3rqFBge3k/ydA7M8c/h6Ub4/4flGwvJLyd8G4g3FGa4xK/i9xY+z/MYRapiuTWZCPfDxLQXjog7FEoAOIqv0E9tc4eckeXM4hTMUEsi6NWE00IAA3Ev4klG/4cpPhP+/567+SZ66u/wFA8WmnnYwf/gITMrySYRzfzXTTQXxITaMGOTDlmYBlJzkMcTFvkmf4dzZTf+qc2XY/n7Nlj/5ezZdj/XQLgzc397VlOtjIXe4IN0Rf4CJPYtD9H+O0nZQHCHH+PN5akQ3u/xgCuh10IBHtqLQYXHU///gV0P/BhTAJU/rk6Y4vPwuAH/yvCDk2rjq808oMy2wgK/ACH30uKzJrBgMVUCsFRbkik12C2PVFgWnBct5CvIBNGBo8TnIZBCVBmE5AfTaRVPhKH761f9OBXQwQeLkpf7vD+CN38EjSWwhPqdwPOGwW8rZ6PM/hVx+2P5nhB3P/ywK6H/jwFTDSLkSsND8k4L//1kNbft3uzEoXWwX/VgV0P/KxVpQB81lLprSMyj/5YFdD/iGOH/n2IH/00MNp/86BXQ/4jD3yswFOT+t4IH/iYS//l/pIp/8hAroZuZHJcYnJoaG3+JDDZg3MjgsjM1MTM2OP4OiCTcyODIxMjQyN/4MiCTU0ZGQ5YmNmYRvRk8xcFFCOjUyN2VkODAzMUFsQWRyMVH+FYgk2M30AP4hcbdf7A0pu2DUFRwjizNenBYVPJk4DTnHb2nV/T/M7i3zqf8dBYf7l/Wzjb/n4aF2LTDlYY8XMs8aaPz6cABjsBeE3Ow/jeGbdeH2InIJ/wYGWkcW2s6NUNIJotUrQKoTwDxsFjhbpce9e/tSItbdOsLFYuPN/jgZYtUv7/d83/SQmdJFfwL3W+Il8SF4i0KXd12pWjnrL/4CCdR65FhNzdOdRBKxqw++b1irk/xwL3ml397PxvSJDth7X9U/WNzafTwQROhgNgTvv8eSXdEts38C+nvoEQQk+oP12rEc+LlXVVX+OfSY5u0YA5364HUiYjZ9WPD/ioQ2hArj/BAUzB/4SIv4AIOwf8WGJBdhN0kogDNyBQv+uAWYT+BgGG/z4GP9h38CUQv+cAWYX+DSKj+E5Bf+KwB//Lwo+p/ggEIRcdqUcyVDZgi1ouD/gD1gGtl7Ms1YDf4ZBP3k/vimlAT+OZ7A3W+XtrBX/PQLMIAfwhKXfwmE+u6+dCGK0LqytdfRUnv1ChCQzwceQz4aSEhITCYaObL/jgZXFWf4Bdub6wERlsgZMuAVdtP8BBEosy16EdBPJcpt4Dj/qNgBiDmKybkH+OCH/lAhAJmaSn+AwAQwgwjmd/87EP/P/PwPhZ/2AQ/8tNAN4oWlZLBt0J/QuifmpipVmBJBJ18onKPxO8uKn2ian8PAr+7/voewOiZf3ZA+qN0UJUR22QA6iNHEHRoZI1YPhCRYT5oaoXwt7JT/fwfNyQwjZnTrrJHi1pCdNy+HHsaQchauRqS0iJ6S6tNhq/XEQ+aEFC9IFRSgnwSiCtPkQBPEzOo7OA1/gIFeVUnmBEgF5SQAESB8BpzSjR7BxWyl/V5k7pfItY2quLfoM2tEktPw3AquVBixVVpwl18V6LnPDUtdV8QvwYu8ETtQnOLeopwuybd8iVGhSAXKsByW9QXDujil/ISiwbXih2qSGGS8x6pwpJzWvie5XlLQwCRgr9Rg5ljUuYIQEginTg/Xi5WLmEyQLNCoMKgxVleEJ8LqQYh6vb/ik3KF3j5pHzoHY1A0XigkJOCUrvEbF5rva3pbJ7EJUCriNWnA/b23SIlF0eqHwfjeACuDpBk252zjdXZK5VOeGcqDPvT54bFk+Hvetd0hwCsW7NE2qfCmrpv4A4scMQ8FZ+5BtY2t8Wh8ReXRS2lgTLB9vEXCJkh1c/wEFyJNeH07O2IFWZLg0X68LM5qe/BRMfqBWLqch1zIiCaIC11QOLbLVDGXp0/NGg68fsUizLbela40BIdSI14GC3ZuM/KegJyFeSorkPdsa1bi21k0WTNOjYCNQpZUcyCByXfkbjTRAhx4KMdWWxRBLB+/s5BP8FmBbGWFsL7+iGEiX+Ah4JGH5S3/AQNkBHzw2aqwfKLwgslpxEj80kO0NsI2sD3TszESTdjaU51g+oRgSoRknzlq1xj87MjYRTSN9vIak7hjUDYxAgR+3wHHg921lIRcKN4HMgOXfY+1OuIVyIycBcpGMsz4sMIy8AAb3n7VqLl14tzBr603VSF4mJV3yrkof92kyY01CNpCMu+ICWys9W4BGmNVIIO3+0cAEq6IUXw6EoVtkq7U9ngLt2mEVfTfO4m/zDcCd7A2fwANrHIVDlfoCR+IMQ4PBh0AIDlXNFVfVQMS5IsqMYDxIMTjVojobOPI6g8Vq1Qra8wuzxkwFhQ7Yel7L0pNizgm66q0ZQEd8FqThZFtAMxGTmCvMkwgWMRe5dMIIb6bNvHnHxHpJXF9l9X/BHlnlhjq4ESBNOBvVMK4yoiOENUP0qyMMrAx04MNnV+LxgxsdBQVftpQ4OGXuiwF8AUV0t9S8Qz+exDiY02KndU2x70k63rc55SnXSrCod/gISeIiHx8ocG8vzv5HH6kIscWBjtVfMcueAc7c6wqNv9j7673rgJVn2HcZR2JHmgGnnGOPdyQe5kgfxTMxpVvsSvFOkwRtjCpUbDAgiH30tDbY7+9zZjEzBhmhihbTTaG0sZ16nPZojLL5QnduJsu0hdt7e6Gb0tUYK6/iuMr/gDs3wRf44Ho2bzxQQ6BwB5gL2j1u2mo6Ivs+3GePIB3VNYOVgTHeBBzLMozVl12ugEOdNCxloPRa2yvtPdYU9RdKKIH43QtwEJR9xZ75F3Fk0hFTgI8QlYV2KhuXo5rKNv2SKm+W1XdnDW2TR+iThsK0rNJ9Ax1ejBUi7DZrKK/P6tx3gzzo7FO/XIfer+YdSe8bYCJlQWTidxObuDzJb6mvUtrz5+gW3wDUFDll79dPwEQ9QAkLNufWIaL/vM9hRXpyG0/gDv4sjRCUxXXoHMp9v30HhO5Skg0yuxkfnEFH7nmlcdB4Bz13RXdxvwTyJXZAaVHyyu6bxjEcAo6Qh6YaFhMOHdqHRbkaepqaqv1UONeJHIRquUtYLWreIUZ41yoTfbJdqAB/vvZTNTtl7rd88/rh0AubLTE87Tg6wvl0ONOdSP3vn8w1NHkoGOMLeUGco1YHCf0VvzAQYG3NcAG43sBopVF4kXSj0VCxGQSWk0PE+3frEgOBCo5Ovm2wsXAXf2uZiRPtjLPEXStUI2sygBUbtYzfNcMy7hHOq/7iDeWZBYQRVv975mGAkFMQygy4nNbKBlIgKDzPGg1XIFNgnmgDGws0eg8Xe7XOw6jGI1FeapuXXgQBIH+Ng+AN5hAJJeHCEo/IH9oMBIn//Jw7o6GS5gt6D/gsFeYQIVzRb27Poo863/CuI/0HjnzoFu3XH+Dgnv7+AALYNSM1s5T55uvLkXATWZA4VKVvCgcyhqzsXX3JktaJMCrpt+STcDmiZ6A+jP19CszOh1N5P8JD9Ekt/e1rXGgwaQiB68/OxbyqKocMHkMnmmP8eZTIFtlWl/Ali/xgRwYblsiibvXcpc/PgxM+GyWHmKWTzJ6WSLcFJehS2KUvPigH+0cTv/HQaVGJBh3VusB/CWG5/CYCbZ1Ho9XAQ9EAD5xs+WFZ7UbNgkqGssEfkTlybmRpN9/n/BGMRxgzEincEDc7gonxPF0iRyBOJnwYpU/tIAQFoCBMNPq8k9MRLYaOjEWFWXcYU/XxU3Ml0AqQLP8EJnT//f99R/pIT5ghP+BL6jjCr6/VcCIDWTmCjDDTwy5KkQkvodm6WznPthmztXqIyfUeFmGhXEuwqCyS9Q0nBvWFEpwict/wcLD2CwE/7QIzBF0BwlKHQZPMwAn9f6H8YEMiyVqXRLg2jTMrXjC7ij+AMPVwEWz/YQmyiAf/+IzBP+KCbKLxl7dD/AUNmZ9z/DDh8cHQxI1wyK/42HPqAjqhV86/h+Ef8UPEIiI4XpFGndrT8Sk0580RHFNEUXi4dE/8LRPNcC9od2IFZcJ53C0v4AzggrZTNpsGhYPlYjPYkzFhYg0wCm6o8hSw70tXogQMTS2exIHc4SNoDZt5MTZyiKftB472hIV1H1rOmQxqeZlqGNBtw0rUctTKU2l+31JWOSMnPasOrUcpMOoxaFY03gkwjjLpeG5Lik1c3/w2mvA7CZME5hlovSbJdXWJ1xpcpJKmcjwCW8e7UwDVz/RQd+zYk2eMRMN/BI/h+H/PaGMy2hZluBuAluLLeb4MaGhgZ1p+b5XvcgsNLt+JB2b7B95rmFDJHb273hcrsSdhge3tY27SwT0auyx8mTchLXTtWjbGdNvN4dRrmok1S/N0qRA19vtdDPZaKGavBCqeOEVSD0jhzV1Vbqm4PnixDcXBnHterpG4/gA0sRUATTlYlQ1LbyrBkBV/xfgbrd/OOoAdhKt1K4oIPxVWh+/qQsWGKoqOI7f+JubUAPKsPu2YYOLsQYxc+4n7OtXwFiGvJvZDP/FwJppX/5gBNNP8TDpZ3+SCYt/+BBURDTm1RTy1vdiJ7+Gm+SsFoqzPS3AIG/BlW4cwMFM/xBNDkv989kjx064XSJq/AAJhs+R9XNy/wcY9LDeh/kolBB6T/QhKIPnf58C2bP8fBfNuAA/54M8qAIA/14ImIwE/Hz0hgf0uKY/4WLePP8YBnNlxTf4wCKF2UNl3OkwX4Rv4bHkKfs0P9/CW7oNkfnBb9IvUQs30tM3d2ZqdF72kJo/T/nNRvsZZeEZ1SDsvsFWH9alIIRMgxik4ZWLQwYUELm2TFnjj0ZGfOAvfBB+7tEChQmfcL6UwmOXswkehhwrQos+t1sbOdIYCvQtVR/4CFXFamxiTi2waY5nUVTALPbHEIVu8z2kl1IPPrLEftnC8ZHkrfmbICeJJR6iNUjgvGM5PWWQjOtBHiUdj2nJdMXCoJU5fll698wbuP8ledDytskQY3X5LmiUiOIkh9oOuf8gCd8aG6rz5kIsTPC2RPEHIcyfla6lmjtTX+qHYa1xzURa7lhFSrwzw1fzhL1ITdgb2n0PtJg81cTQ5eofp9B0/mWlE/0ltZXy2WmVMaAAhbuhdRNhu9HZ6x87+HLpx/fXudRClJvBt9JeQHD7LE8ExA2ws3xToz9T/AZdP5yRA93YFqFzj1kqzob7sErMXy7CgGjcz+4LahvnGA6cZPuPGl3ybO49yjUuvWjTsBh7C/c3yykuBKF82ngP7IuGATcA+CI1YD5qt9cCK/kL/OAhPYUvwGZA6NqOyaQE+O2CAxP/QgTk+Aiv8O61P+AgYf8/h+9YxWe4LEDwP+O67m0xBGB+1Sf7gi5gvWuoT5hi2A9dRoOboWDJBER8BFiAzhwZdTtkz3gX7gSs4V+iNGng1nzKOp/ygE5P8Yj0alsybx/scb9NEGPX5JwQrh8wFE3AwysoUh+0GSMgxz/jIiz1h/4FYgf+LEWev+Ph0ZD/1IDvJgf+ehVToeeP8kCqrX8CHporZDbC6e5XHcH0zFwD/vp7ZaBUrJgVAf0UG1uJh7r8UBp/Dt80cf1TfNbM/P181Sfy9fNbP+iAwDzjG4d3cyyARtpAkfga8Ez5yMRHPTuW8PPnRQ3od5anvkFwv8cAtdTV/f+Z3/I+Z3bTAPfwuXw2EIBHzFCwYCI9P+TiUQb+OS+H+A03Af1ICJbYMaexX/MB9NpSMPGgr+IxIY9AoZAMBpgVIO3dMtu5wzCydZKrYvWnRDHvsiIMuIn1+O2CATf/QgPf4ASn8PDe5Ofw/soWECbZ231H+P5go+dWyBmsw5w8nvB6kLY+ia4U0a9xKXuMc9B/hp6WN2Id/z2yNgZSoO4kKPHPpxb4QRANtZWfZ/5QB7/OMiJFo/23Y968Z1MI9hTjaUepu6UsIN1Hm+WAOwN+s5ByIP7pkyr0eRb9LRpKHxlfmmhqvDjQ/wUFvigP+DAW2zQk0mCYlL+FVKICH8EiweBj/hFSi/gTAVtDKevB/1AQ/dyMKRUiWyh2iZ8Ou99Dn9/PpMgfbPr23sD+HA8Zr/v/mV/0wCIQ/wGtak2cW50TElNVewHIzGDUmA6K0Q6R9Iv1mVJWRyBAxcmv2/8O7gRz/VO4FtT8/bgVV/L24FtfyKOzlTZbHUjXnrJF+KZr0PrTjmRA7Hm4y/SYGwOR443BZwljP+JhErX/BRRh2T/4eHAOv4mYwP/Ew4Eb/kARMH/xMLET/4MH5vANR/l8CI/wsLE9/wCL35cbz31QnYMUW4mZRuXNkTRSP0i+HDwSrREOCV/OrD2rTv/jgkrEEf5pXvwLRVIVzb8c4s3FUaqDKN4srsVwjzEDqY/0CJaM5su/i/H/YhJWIAgAB/f69//Q69+9cXyGn8FUTHbjIv+WiAAORd3HC/wsKF3OX+QB2UwhPbcB6Lp40OzHwTKoGJOwJTVb8+sdQPgDuOweSSj6PeTjrc6UHrqpDjkN4zUXyEK6+oGdNBsxLw/URe8bLiNI+Ve0lQCRuEjFARaSYB1WYDLazTYxuW8vzL4nqUdH9beiq5vggEi2FioktdpyOB3MfrTvE5rYH0w3QClVIe4ydMZTWYf0gASE3XW7fqZg5vWzaCSlB5Xg+4XYw3UpP1Zdab196uxje8YytXX2WmcL5tZgGO76l9KOZeR/4AMTEk9TarqbfPIBEhiIqZdndYy9t3g3SnJN76YssCYouaTnxG9Gj5oo/Sq5Tqb48XWs2O8SgCJCeLinbsKpWmMnBnFEIwLtNwC0ZlOAPxLkBrd9mLFzcWZ3JVKnXCPmcJlTlDMzuezH6A7eKsyk9Gt+9RBZT8VXeBpWEQnjOxPdKEHtvYiye2o849hlQa1qJKfHFWPUU0727FawxBgM44dTM0R4uNUxMWihM27eThGBmqk6oUL3s1GjMDdw1GC6MFsKq05U/8AGlHefB6mMR/7/46GeFjr+/yYD/TAIYt/AcBscXjc4OzYcuAhNArfmPC4uWQR+LKbDvQc72YIj3Ez4mUTBejgDETw9awkzcGEaNQBNzelFBjfwRfSEn//hn6b/Rgz9MCYN7Hn8CvxJ4ZglyAnYi1erOOjgzM19/7J0oPLN7sBYd0BUv9CwqDHD/i4nptGp8BhALhSwb/HRPTahgiX/vggtV/wrFUbf8ZGjDT3G83/pYs49/hflkPMilhZH15SqHiaDqCUsPSTYItcPS3cEPYGR3hpIdKSj/z0CbPf4OLGVQMO8b/DRT+h/jYnyFAP86FFGf+WB/9t2sdwMT2JeDlTCBfyADkSGTbKkfI1ydy4yR9lnOrJv4zFxLmIKSOP3+G1W2AoWREpaoJCGTbpXiZqUiCOVGD60eTs5/8kDm2v+Li1UYGp8C/gYLl/x0Wqsf3+Fy/3+Fy/x+5KwVsbkLT/gIij1GFaJ5ImH63/AQ6DeyJe+gW8URbYc/ZCBLGmBHCQOjGa2mvrmHv+AifmjasURGdlMukwUUI4MN3QUG8/waKR0fByiaFTknt4cU3z7NzYwKmApQ3bXSwfKHu1YP72QULgO1MrUFBb22UK6bq1tlL2fDUyORtWyURhirv2HMnxdtCtE4b/yYOht8Z+z1b8YrhkYAkH9alRY1BzRPKyjYzfkd37kBkJfRmdy38ALQv+LCfNE1SLz+BRjAItmD/IBPmg/w0uf/x8KF/zAJ1/wJ3Yfw5VGS/9/p638n8/XGhzeOA+ED+LDBvyZqEE3GlWI6jmFg6IZ2X5jakaoUHiyZf4sien01SL3+BSMUG5zD/JhPT7/EJGL/mQpFE/iId0/htwqA72tR1Cp48D74gu9F/sVH5Zjssbh+cRlu38mjRpWDBLxyy/1wKl8AU66f5cKjupsTnUjO1YOlhgTVidW8lswF/aApf8mBPu38Pfk3GlvrvwjesW45zom1Qt2RQUyVQQSvjvwEbfVNEbOyrbLCS8c4dKb+Ah87NeV2EQq/S56YLppOfwQYwvv9/ruX8k19Yp/AtxQn1e1J42h05IbCe18GRPGXkvqBHFFwzQqnMHliDC46RDA/4wIlKIj+BIIX/yQRKUfwHaKKd2zs7+VwBPFhJM14VXkhzzdpboPYCF/gCJE7VylaSI76AKXMkf2tm8k9nLqz1IIDgOB+kXeswf8gbGPFWCEETc5mf4DABKGDATcY/ycS1w7AYpkYBEitawkTIMa3nCh7oP1D/Dk+AKewd4f46J64vb/9g/FZvXj8I8NhXT+uC5dP3Rgi+QGZ6P+egvPeXxbH+BcWlfun4+QHEvY3fRmdrbOn8A9zR5ggXG/PTvzW/t/8BGw1DZhmXzjY2AYW6iEAuFK/wIba/8KWEwYE4iX+LBiJsCZHYO0nI/xwMRNoEvafx9cWUH+uh8Bn+LUCr/Mgwwv/n4GsTZAx/wEZIx7v+Vhh7v/HwxE3/pwexWQSleTPVgL1a3ZI7RBooH8tUMHK3crJhuUBu2ks9y81okB0BGxCI3t+ggOFOBfjNd2bvBeXwjx2RzrdG8Ott+rLNBv+JItpigyKnAkOPe5wpGZH+O25O+PSJ/S2lXSKuf0iSLzLf3ju/xwI/g/DCAczO3v8BgAmNBZEX/k4JEMJTEuXKnzGHPc7hj4rMY8Iifa8j/HQd+s21FXfGw1NRSJR6O5nMKB97hCzmXg/u2nTTIFWtkMDzFG1rH/ioKkdAY2BsYQCKzJ+gwteyyU/hOHygLkBZG1wxBX/XgrUOZfw7NthC9/DwA//jgLfLyE3IcT+GLUIFrL80o6/k/kuup/y8FQutPeaaX8E6/w0bhWCXTFkp9mYZTrOIt9sEkUQ1+j8sZjhskGzNr9ieBZ/w5r/QH/7iaQMVOXVNjW/Glow0amnlbTjLKyAu/jwNrsGIp/8EDeLfG2IGsBC6AVM5FYr9qB+kvqnXVOSE08S/InPwHm+/Y/4CDLM6gW1sWN0iAw/J+b3UozXn3zG+Qjo35BI6CA77/xhC4eUrxAjYWyIqoLCdCYz/wBPb355PTRjX/el9X8AT4KyxI00zCXAptPbxtf+yiFw/jav9vEC4fjOn/wCFw+h8Va7opM8bF4OQ1SeQALwAw0X1gVIZA68omUroS/+lcjUxaFZTZmVjQaahS+Gw+vTrC+PKLC6GA1RliyGgsN4j+P1zN7noFcDd+scbt2sPghp1lmX+e4EOL/PQqvXolXoyUDnIygnv3AeRL+M+TFc4KVFA772VS7sq3hguQJMlzoqn6HD+P17OexPGQ8ZHMuIktKeuVoZHwJJ1UIlUYEFS1Go2W9whUCUlliXwR+xcSXwwIG1YdwSFekaDsZfICA38LIqDw+wfTT+P2DMioVl38cENEdHVztHEHmsy0Yz4YpNpCVd4wD3ML5kjM0aIkTPiZQAn6IMFblYGGhM1jFZ2E/SGsx7fJwYfnLRBb+JEG2NY/11Ym881Wo8JwbSSW4vCG7SRH/BAthKYf+MIXD9m3RTMDeyzid/T1dX/ARMYJ6Rxnzz8ARtb0XGF+RP6S1e2B/AwlJ/LEIXD+Nq/28QuH8Np/0AhcPpZzSIClsfONzfYOt4vIQqhQEePVDSZY2AmNyl/TY7KLdoCDa5+gLhpd/Bs6NFxgLHQnjwq3LWJnT65/U+4WqZR/z8Q26D09q5xjzIq76hHHtopg9aXT7VpI9bAcmnQeu7RLTBbNuksBtmOGBbmoQ72rWXNM/y8wxAKW74WxxinBCqo+pq+fxInpfw4YF8XzBZgX/lAJ0YXOMYMW+kwlUP0aOdmu1oqoarS/8cENwANaWl1WzeS8k38B5AZm3DfUFm/QsOPbCny2kB3il33JQ+CxfT/wL9rOVY7jc37wHvSbZ5TdnPBzwJ53maKi5+yBi5IJ/eYYTGmX+KAh+b5AsH/BBk6oG2Se6fwGACPcGBJV9yjMPSeZSFX1li5CQ98SUGGUduHhG/4QMoBQKO/wsF7zf8p1+TT29X8PvSoEF/z6GQUH8fYtIF/lYG9o/h9TM/4TB2f47AXwFaUiz+PgP/+H+Sn/QBbrM6tzT/AunEBmAMguhSGaBkDcojRz79L5oP+j1zUv8ER2pP/e12NPMJB1eEYO1IKpkIzhJJTBcGmBKp/HkqLEXiX/At2M/XCC/uyvmEkG7zkxn/Zc0u8gR8eGFF3DrLap4TCnKhjcN5Joujb7DI1IoAIa7vZjpd9gfqQ/wQdoWf//iQTH/RxIIYxnAPfwLrWwSXBfYYUo5CX+vG0ep+dsPasYWoxxOoPSD2l3bQo18RCP+KgRM8G/gT99WAr+JLQ8GpP5CtDwOYDLPTT9v8mzmedOkDqTdbpv/AQImJDrca0vV2PfwAocsG/VtUffx7kDf5AD9K8D+PbQ8D4P7/tD/8fEYYf+XBFSG8CLr/gq0PzmSP/i20PnKlDCv4ltDzVraurzJoL+DbKPkMhOU/4Knn6PEJ6z/OgIqeCxxeVOv+WhQcDBfxj+AkxZEFEz/SL/Ogi0NAFj1j3n+dgQR0yv37+GE2CdcFdjY4lO/4b15g6z4PpE/hikG/xMYs2Jacl/4WNeUh4Q56A/y0UlysJtQQn8E+l18il3+MBqRzurCkNkM5L/g5/cQ90v/4KIxacI4Zr+H708NKNvU8maHtYdTA8fWrUbyXKRPZoKP3S0zm+jrq93XLhi5EVe326t6NO4Qj/qhvSAoKFa3c0hmQ2bWCT8mDH+uyuK93m4oRtUsMOzuXfnpA85MX25pJxvU9w9SSP8zLbxET3Zlua50NACFAwkfU/gBlMBAWs+XL1V5UC1yy/wEWw89VrQQrcsoD5NEI42RqUnWXDsMnK2VUWyjYVGL+gD8rKbXMBUVChkyamJs5m+DT4NoxEbp+2C+NcNHizH7WyNoed6lD5H1/D4G+nuloNTuR2Ria/DuN4meZoy5VDUDfVzZ+twrZ6MBcf4CBtvV0qMyXArz7N3xFzKAz6o+WtTbTjAYAjHfjj683gDeW6XZe0D4IWMz08Qj8xq/WcXtf8QJqduoF3wzmNegHaKe7Rq4uuBLaqNzp6H06fCuTHTHdJOt4f5/WfYcI+wYLZiHkBK2mH+WfngRmDEQoDEUTO4eaUiJKU+DLlHti35mZc7WGevE0/gBpjc0fpEOmPt8g5CBoAsJeHHdTR69EdYiJd2jo6wA/+tfX5En0ypqS+8iNxviGx1HqoP8FDs34HCBl8UWPPAYAJdf4wDxIocBVds85me1/hLld/hIBXEsr1oexsQ421a+Zj2DXW+bzFnlKKI2XUpWypouOcnrz/gXfzlTYtz4AQASMbm6DJitgxHpbqzhfXNHJ14WZ8ZJCfAqk8OpQFf4W74LL/RASnv/FPfBRvMkbb9QKMF4pS8a1S8+/xIUi1NasAE7u3cPgmBg+MuP/44JYZqP/aBLDMvAOdLNWZTKZuk/4peSGKAM8l7SvaQ/SH+ZJUeA3TZ9mbAP+xBkZH//8Om4/8UDIPYjo0of+ChoXZCP4fvUgbyaX8tDirCgAdw6SGxR19MVue3ud2L+FT2oq1IqzvfjBWwaKlM4bYSEocDxgmwT/AQLB/pPNFDfW/KPlqyzCzyoy2WdI+RMTJ9fBmOV6uwmDS+rcGj6USt4eVxoa5KGTN5mOiCumHgcXeJLQMQORjbEEI7kiDqdtx95Aduf9qhMLmztm9XvBxA+Dwx3SJH/gInIV1dn0geBD0iYbs29yA+FM+N4R8wKTN+zFtfGJCQniyy5227ppu5vQxq8CkE38HP92/sB7v8gBLDMbMZbvKazSk5KHPLwKCECCsDP/AD8tjOqTCf4CJxUEaLwKRMTnCIgNoGAvygO0n9p7Us67LMy3ku4iyH2LDUxjVdaWFmelDHyfVsE8y75iwjzDwGUY3YuOMuPzONA2EOCuZ+ErTlLVpvef13qpkekS0W9lYYrCgKlRl2XuGRh3QYckiq+Le7c/i6QCn9+57/AQ+s8+i2z7JZYipwGZEmI17kAI+/lv3CP2SRrjKfglSlgU2mn4eos5r+RtnhUG9wUpFi6rIsoCLMHP8C+ldPJ0rHIQLU6R6MVLJR9nRBwiyVrh96j6uotcQKVt7TOap/Cgpv9wNnCARXdcEGFMCbf4sCEEdP/GwQgj/Cb3SCAe41ZYOxVk4VTINGf5uCEBQEAYH0e6b/BQQgwABQCAgf+GAhB//EwYb9/CcK9/iiMInP4Lof7fp2HcsOCVEUJXPovheUSPSJVRxZsq7uKiIPKN82Qs89L+FVAQF/gNQEZYQShS4w/gMAEN4MFR4j8hVQEBSgiFtQ2X7kF/mNQE/h9cGBQsJAQEM/0cGQifxgyGf+fAjQcDQP89CGHn8dKMn8DA438sKMn+CiGJD+GnBKQoA5vFHD/l5RkAv9eA27/8egj5ao/wzFyglGcVmksHT/L6sJ/jYwEQ/0EL88fy4EftJgixdj/OgI+F3H+jhT+YQP48BH/4NXBP4E64MFLpSK38pzGRVYaGrCG1CnfmEKXKeiw1RNecTpZyf0lsz+AfHnzxSl3SivMaHS7F2jwhMRqmyk/wcN1jjXB/tIqSQ7P9kE0fn//hZpr+iC6WHB7NEv4KR6pU61v9/DkjikpisIND9cK9ZRNuxEgE4Z9RAEWEXBHICmS1P/gINx652PH7y07Nu3R1NAjP9+Ta5Uk+YCZrbIEKO+L1iYm/JmIbnR3QWrBzMo7BTXfCKTctJhWI8pJlep/b8VI4e3c0UCpEo9WEqvJcrCDNDVt+tdxFITk1pkgIqIhEDoeNVlPheiuoQVp5jSa30YIpOwhzUsIw5+XO/jMrdbYMtkGtkgN4OTTv9Gi/SVPKwUxxxW9urhbNbRH2oYQtkUnrXM2d5s1gD/IBBigw7gP1WL02+unjNpHXchCtwppNuELCZnNx4lovSaFz4k1QeSxpf+Ajon4K2GvodHe+cPDZ64kBTHvXJaIeff+AK8XFgyNLKjEh9iQIVtJaHJT1yoOg3DisYQP8hMKF31kdzWOT2glFr9HIFD7QcDeLhEbJ0Br2GZ7iqlBANTAuW0mtipAeUpRm8tUCeZGnvzPtFevqemYohTR4KQ2Iw5/sV1wKiR1lOAbZ+ACgta8HNvAoENj23M5w1m4lqteYvWgnbjLf/QkS/gX4IMUtebeDCR1WagSE/NGbeNDvCx8bGbzY3VtGlZkdGFISg+Jk1CkGLRtSvcLwkY3Wwo0/wEWamz9B/gocBCUv4CLnoNgqGV/xcCaS/58F2/ujUpRAAwP8ZESP0T/Av1L/MWDKPv8/ApZX+oiJH7/Cg4eRx5OOq4bSXEVE7vun/AEcmEcp6f4CKYLevvcuUxR8ffwow6eMnYDWn/HAq2Fx/z7B7RS1P3O9DN12pyWyvkfBAdh+/gGUvYev/PQVnoZFP/AvG3nBHezmKcKz1V4jShARglDP9BQ1SJs+34wWSYIHH4hQAwXXJ4uE78gOR6ICqLT9ELUIVf7zP/g4dKba/zSAuZCwf48BW+Ov/HwqgHMjB2Vghz+Cva36NzUTrZmhG5/E5NhscOlDQE4dkV6BmUeWzVSyqGvn66qo8ntshKQhU7AbOYaF1mosOOKkmWYf/kgoPq49TUNmStrejXnZkA4l1WT7FprX7We3r4ZLtkFbHTRA5TcP+FBK7/Bg8iZ/AQJW/4wHkTHUPt1JRFoD+BbMbYq46O2L8wB+kNcg3coOksftG5IMn1Gn2E9eek/HmDmOYpSbhexZwfsAHaACrT2cOJHW3FAA+G2CAZmEAoW4HoMBETKUqtCU9qdaFEF9DnwTmkPwhbwgKQagLj/PQUgD/gwd82/gTkS/hMCC+RI7oIBmv4FAgwhC5/8WBs4Qlif9CCjhv8fBTlQj/DxSIB38Eq5QOLOJEp2AB/DAAf/qwIJ+s65Vd5Ww59zH00tFKZx9JE0mXKSPV5q+EwVMtzYQKWyUUNrZgiEB2RVFoM4idh2iXb1NRL7mr/AQ9+xgtBGZ8riVH3+ehs8oN/gxbCv4TCrv9IBmGwNd3COBfwVSqA7iU/tJWfwyC3ghc0Q2AIP+JhpwJGJnir/2UGzhd5qnf4+Hv//9eBRw3+fCIqUCv4fgLv4MSvv4UEi/4L6/ulIwP5ODH/85EbKH8DAP+TcMK2/wN9hAM2JHUAdglWmtowS9Jg95NRP4Rbbf66CjKYI/ixpKyrL6Luq3BltFy89uOTbd4lBrT2z/Pgb2Egc+Vi9Gz/MQVFN/GIM/xaUqozG0mucUp5UQmgCCc/xXtgn8JhP/8GC9qbVGy05Ytiaa3Ex+Q2yb0P5HJV/FYA//BWd4D/We6DP/jQJLoNfFl5+Hx/gtwJASz+MRH8B4P9cQUfMBUH+ehI5sFQP5eQJET0ohXD/kkGD/jEB/KWY3vz+eQH8B5JYdWx7+aznv+HxIP+MhKP+X8OhEz2/G4/5XA+f+IkwQkY1ksK/4b2TiV+fU7+dk0wBel3k3NClg5HoK/UQu9TC4oDgzMVVfh36dhWn8xgxiHv/30I5dfxDliBaKRDdcb6wyJFYVy5NedfaEhownhUPO7ErhkjsUs1ClKmYVmfnwwXvH4UzZh0fumvmCT1iDWPtw+pVY0E3vLKQ45/H2WIIqOGFkdaazrhcfwBoS1sxmUnL1LoDcFb4LdiOHzwD2gNngY5vi5IwwOW3JR8IKUmJ/qwK3oYHN6zX2HXgVvZ1QIpKI3/rgZgM5Bz/KwcIF/AxYx/D5Z//MpM8/wiA/gX+pCLMX+LFsP/EwT+5/xYK0G/h8j//y0FBOfyyA4kIg/jg4//xIaJs/wb3xkdoj+1O8f56L3uwJg/3gBWg38Smj/8Gbwf+Vg9In+Jgq//Pw+ApV/5AM9ezYzNwuODQzNTT/EgcUAZFxuaGpt/iAIs7MTE3NjkyNjM3OTANjU3ODb/DAcT2zMJoYHJmYHKmwuCWxN5s1nRmbmhkYLCcmqw3uCYZsKOsf4mDiezgwMzB9/zcP9dnX/UijF/Y/9+C/nvJ/rwoxf2/9EFEQHHuInG7USdO5CLGYUzF/4AjchEjPTDI0MbYaMLj+UL6y+CTQOz/HQ+6HMfwH0rX3PAfS3jwYEuTUpBYCYSRJpzsXlnm7n05feviY+aGf+PgfdDDeAS/gGx0VMD+H/Nf/IA6PdNH/iMsr+2WnkyXvtDfK1mQEa2TLHIUn0LhROW6uSef8GAP6f1nPa35r1at5HexizixnTRQF7L/+fh4VJLcY2Vpaz/GQ8Xl/AvXxwcI9Kyd5BVPsIEd5/a6LetGLMOvye0vSP0oKaN3Fc5j/iyGwIf4E2j0Wb/mmy6Ahve+BZ97iLzeNZu1TO604u538G+iPYyp1pHLO6GWGqGf9hEWgv//gy7j+i4RSGVfz+Cg45HmCN/h9uzBuvA4vXxMufml+dpgtm2Aeg5KabbSf4AOhWcbhMa/WKaFil9nISWG5aTquhbluJYNXDjA1xe88OoBELlvVhTTZSq4UM5W0EAWSIiSeCr5zs/f4A6Te36oEXIAkhbP4A6Vg+mvkvoE5NhsJLFDn/QlWq5d7dg7spwSYM9nUU+/S2+KQfPst43MqEiQqPAFnLlOBo8QBE14+AQ3xr7vtoF6ej5bl5ahzuoAILw+5hwIF2O4zlRv7P+Mx10IeC1wTJpwGPKxQmzb0jslzv4fzXyu/FLjsu+wbfjMHVtbpS8xAa1vDKQoIcBken0VDfh8nwIF7RvtX/F9KOfbkKKCZVEfLazFsDgQq7snPjvw7ztEQOEl5Jtyg6K8okBLRswQNFj4HoPS9wFP36nNaVa66Ue2aYm/n3z7wPxhgvAf5rRvmjcjNdnV39uiL49guVhLBCx2iWVjnTqdsAJk2/NQdfAITgPc0Xgi6R1XEs+G/ZRDp2Z8MZomyMI495+ujJMrj10ykkwW5322SxEhAm3XnNsLcD/HAMB+S/z/ga1D/SQNag+fwL5E8cPgjPFm3tCIcamtaRUYmkWkBBtwAmy/o52hVPb3CoGxE66bcVtG0LZWH1gT5smjxgY4+vSf+EAyNlwM0j+AwARRgpiq/ycSXCP79zdt5AGoGcTg4fwBnq4AYHJauXx7T/EpArzuza68I9rF64r9K2dOcIsVgQ7v5zgzi/e5pU8z23+kLo/4oBNzwJtgbSEAibGMmDLde9/xYCbngX/BCRJv+IBlaE/wcd+xf40JmlQv/SwRCe/5kJQX/8cD+L/+FDU1X+NAL8GU+4a/xkJSST6HrMqDEX8tzibT3mq5/koFjC72KIAcIYoCdiKSohLX/DU0T8czQwudZiwv2FAcK0wsIMv8cCKLMr/f4nJ/p4cuO4z1d2PIzfZQ/wEIx0n1Wz0EXoBAzT924zGw4sty2TZIRNdydUn+Mgzb5mhAJU8yn+AwAQygu5U/ifhI/xMFoLf4RC9ltp7eP5ebifK6TUZ9TK/gAYgn3K8a4+RhRQIv5HxEspnQteexkdY7L/EwI37/As9cRwgE4FAj/AYAINQXIJH+piURT/BAX23ho38DG0v6FXAKrReTL+He+TVYJ25o6ZavNv+AipAJ4Ga7M6IUdmrdsNv4dNpZn+CzaVdw5RTx4ZF8AMIavyX0mh2N5GCZ6ZxFf+HTaUQg+DtrJ5EM4A/h82l/yAHxkJKx19ItXswZK2VeFWFNtAAbmgZjPcUk/W72Yewi/D3T/wYP8fpnz8fPwVtzSBqDWuThbgToemx/oge74kZX/BBFmz9/4uKbduQANKHe3I6kVAGKEhzkHLIRCslDilfTsgZZyoyF5zDTU42wGasXThUGbGoxDoOERvpDmDeouM/wUWQfuut/hosg//xMC0bww4tS3hTUEBgH8JgDn+EB+bH/CBZB92gX+UBoIr/KQ0v9/gokCJ69XAP4VAhPkEsf8OB48IRLSX+Lhpf6F/zANL/fxcTS/wGAU/+/Bpf6g/1sGHDfw+Bviih0WH+eCz4UwByP/PRY/7z5AGl/v8xCFzP8fAof+lBpf7+JQYN/+ez/Xghcz/CmUEeoAAY4AATP+Ohpf7+BgJ//BAhaX/iYMcD/hQ0WoPL4mtm/jEKqE+z+EGFugbE7KJ/4TsGP8UCHI5BrXt+8fxQAFv+GCysf/Gg0y9gYH+OC2MCw/mnQv9v9kDEbYB/f+IX/8OLYwHCL6f4JRAgPNX86w/n/Qv0UblRcbBCgfCSwPCr+QThuLINUwOuOaJtEYm7Qt8NKRxm3zhwz8NlZVWT75uVF/wEUYz/t5LWF0DhfeXlqJ/gI5vWBnEU3u0scBO2/acbxNCSYOe/wESBDwcbzlK1KnYX3km8FGuebHmaoecPYGUTah4gLJ1UnJ/FIqRonoFkwJg5JMH462GvvfQTPblcS6SxsPWip3JGH6X/oniYEJo5cvYUEZFUDL9VV3Cw/CbEAD4GTYrq8j6t5RPQGq9+Wz8YklU4Sm0Exr/5AA4BNL6UmdUkwQsf8BC8cmIpPalDIaVG2oGQOZJ0B1MTpE/x8V+G/Vmd1ogESt8uhUGOX69QllOluGDc0CpBA1MoYNznzgHFeFeIUFNpolP8FFDxG21Z2HQnAw1NHVS/c87DTKWcqrlAzFBkA1Kx4SuIFVwb6H7Yw0l2011/T8oAumyWONAGJPuHkuTZwTKy0N0lproQpnfYSwLv+CC63AJOunoN/4mN6pkfCohjIFctGljIq7b41ooCL7dvTVi1E6K9ahFp9nfwKUyAmBHpQKHv7VRR2J43dIBkNaAuWnvRlaesbDXS+4HUlInX+KC5YsCD4Me/gBLR9YQCV6gO/gE494WXRClnfCpwGvRwuC8xGOdDB4JugLkB5P8HGv0H8LbcGA/74FADf4NAff8rGIqX+Ri9ls786oH/0sLOD/5QAUlYAMv4oUw/8YFz2PRBj3+AgrcGEEsF8R4LDUJT/EMYW//4MGCLqdSz4YU2NaV5fF/qjgZlFv4/U4j+QQuggG1oQDCOjCgwNH3v/IFy4n8Jgdf++BM+McsP9fBtiAmRScP3XGpElJMYnJeeONeGUlds7LJcT0x6EOGQjxY7zrCJSXQPMTPWfJoAcBVQre6VCr9mBrLM/ynWKO8n3z2jIts7DstRw9osrQEEe7e5afAnMOwYQOSn8VZ0EpmU3R1gzYnT3gFOW+dn8/RNuLRZJ9zuTo4ZgupxYkkPPGq45YHGnYrjVEWJFWm+8MP3R9Nh4mA58+KeUxijc38IlwIvUuYxtKrsa0nqeYsZuLK5z5KDoVWhy4qLTtwQXkjf68iwGU1yglkIJjQw+qQ5aZUoF31mRHezozht0LwFPpuJVCPGCYyI6YbQYr8ALEI+AESg8AvMHKOdYWilPVh2zK22EMGqMFrGaAPG8951i9ZlAC0nceUlCiay/saV7MYgiUNA6jaRQODpM/n7TJYTYtMZ7VfwA9EMT8Gh8wqQ2A+Bk7H1OjF5Kl0/J3jKxst64YRLtbGATnG2nTN6CgYlfXTU0GWZGm1ih95A32pcHScmILLhWAy61WZx3hdDaymh/gIRFSiRK90qLZsIpZXLrFTVjZr2Qj5LC5oQxa57yNy8mSIg6WwMZXzAFjJ8NbChLSwQGmGK7Jji/wED4k5lmUtgEMdDB8Acc7FYw5GArMJ1F0wuZ+PqNIUWod/BY5sBNIQAWk3pqZCmrIFSfYpWJqAZj2QksLBGGztpULv7ch7tCwDiNGTxg+DiwzLEojNojwuIBP2Fn+AgYTh/POy33TH/AEOxsZGVqT/UaYJCJ/4CHGiLkkU1W6mi6zZkNXHmv3fDGI8FrsSj5moDvUBq3QpibwiKQ8/8fCZ8f+CjksV0WIH1gFkFiELJDCEtU+3a8H4VP4IaSBj/9gVqbHiR1NpXQBoPn4Bd/AJKmgn5wa8reBfxtJRAbUuNFjqi+B/AsNnwgQjTqSEOTVUCv18We1qbHePV4YS6MqlpKDDVNnLRbCMYr/jgkfxSP7/b2P9JBWpsl/AoMeLIYFExQzhom1SjUKwISl1gHog3aPb29l2BlRjP5pSOnsQYXiqo8hTu3idDWPCG3xfifZUIaf4KO4VQcA/wgNyvw8EpBEo9zLc9wOGZACn/m2RhC5Jkcf0Dop/Ah1vAfwKD9DjFXM+yopqoYA63pSz32DZSjiZDJU9j55UruSlSn6p2Or33ixOtRF7DhgCbehcLtMthDguEn8CKS5LCCHWKSn8BgAk5/AYDQjFKZ9KSNVUQ5aLX8t6rUS6/qHoHEAKW3YV+MUH8DgNhEGZumj3pWfMN8lHGXwVtyUbyPyAt4xoxgg+inOykfQxB/44LjYEGEAl9Wvv4DABMWDAmdg/zsXGwe/+eBZsGCpBU/3wUondu+wEMvgXjGhCwulHlZpwFEdxZqA7e4RLmQxIPuYxxVY09X/78LjYPj/Hg5zxd/w8CvquA/ZS9FtIyFv5szOzPd8cSxtYop31S8CYr+x+lAe+wUX8fgv/iGqKFlTXVr9FP0F7VRar73x/6tDRo3lJLDqAjCI3rBMxTUDV2MzAFri7VlQNyO1/gI2VCxMwyprqT1Z3zQnrkdLsB/4fKRaF78KbkImoI1vnL7lHW6lx0RUNJYHKvfOAvlFOQE9q1/9XC0MP8RBa/+hC42Dhu/4OBz73/5xcbB/kQUoYf80FxsH8QBF/+bC42CpzbIyQlfeVmMz5WSjS8phJjBObDz0ifmY0HY/YGtI8/+HKcvkJ+tDmR+tI6bb5M+EIrJeBWD1p1sdgcG53Zmu5ovTmVFE/8cBS+8x/e0fgiA0LyWHs5b+1GOkaioudsrKjKbx/HhAuLHIf/ApNOX2nyY5yB1RRT5SqBzfHrnPiGcaN+qULkQmCXdSKxfM0Bf4KjUlKugtWN/gVtxDbug/38N3ugP+si3DYPAQgCBwEEAYJAAP/cxbhsg8Oh/DgA+j4jh/HgA+kPPh/HgA+k1nxzx4APr6SBfx2AP/wHkrotUfZn8ATnmFQ796nfMOPzzxJITasxgVRhzvRmrdtymF8X8AnIAZkEBDvQHiB932rs8/gIn927lk8ZSVOEvUeVXsUNVTJGM87Vt6okfVJTFj6xxrb5nbdlgU+RjyvlZm6MylrfbNa3g5Y7PFRvUsmcX3MyFhmOfbQGr2ppwZ6EETvvx02oTUpr+HwJ9c3OTqEM0/YeGOz7pciDk9wPD75+ae99M7wnB97aZ3+1HO6Qk72T9vn+AcJ/pdAcqJkA4ruLLSFI6FTsjtbJJuJZKfyFMRLVDAeD4prCBedl73kD3SOGw/gs0Zllvwvh0AJ0NsvxM+tcrI5ZKHCQ79gYfhNKZeeSGYbNcKDcBBckltZ/giSBuQzH74xWqz8g6sw+wV3RUgyIw8LMbXgYDl8vhIvYoHB8gQ/xwRqPF/97UtbqKT/ddkHD/AASOu7cSYFTal1nMhgp/nobD0c8ib/gS9l5DvVJuQv9qnYpVcO92U/pxjGSmrdtss26Hh7Pu6b6Jt7Bf/GxXpsp/5sKKzw08KEzOGMLcWlUi17ruXOCUx++PP4WcItPTtxfPtRhAAMAoLgvQtYxSsDsE5qzxAuzaN9plJH/Bg0x64psIF9eEAQoCDIRWAEF4+H8TgQ/ZoMna5umBR8s+pcSSYs32ST+B/jAPxNdAqtIbO3P4GwnOQ/axNF77AtWUMSZIM4LLMWJ+GsCkZY+L36caJBLN27eHZ/xMfIV9sFWN0IBkQYa/4IGIe/zExjznCppAfXHhAf4wKj6/8FD6ckV/gwLrD/8oHlI/4PicR3/Ug+nJ/Dzj2CvH+iB9OT+BEodTX2VZYDrUK+ec7/0vW1QEQHkGn6ngXp8WdZVmoHDdJ69/xwWh03f9/3DB/pIK4qGf4FUF1Y3uDGWQ6AXwK3ndFzKRBNAYjj15CEa92eJkOq+L9opKuMlILifMfK5yL97Ehhah+En6UfjMsGv8ICoFGJiqvMZXQj18L8ZD1iw9Yw2bIBNQ4f8wdTStEaQH8Ejda1SJRARENhtZGidAEp0/MpjFBFaguimZpNwQKnJ+JrfwBe2hf+LC4Zb/BRXPGJfzRpgr4CFiujKoX5618PBoH5n/RySfjFz+hL87lZCNM89vm/XUx/MWmCAID/f+mD/RGmC8FT/UfwVUEClwi/w+hLJ0ohCYxTOT/HXfQxx4I5OgRQx1WTYvTNa7Xb7WPZ3w8UhO5aWRUW7w8liXJFwqAp3M5WjaLVMMrpJr0fwGsqQJNyEIP6CuGLiBb53WSFMHCp7jq53k17YYOajnzStiBsBhjVa6vnn3NLufVT+dq6LHmP03uVeyw/BwVau8Gld4+CCI/xFscuJ0Xv0Ac25OsWn0A6bAYlx4oTlbZfKTUX4JLKVpDXiOmNBLutN9I5Vp0DqVZx6Vbfj3ddWryyYrzy3Dfw+i7CCZZEn65wWILq7bXDqvyJtC2g+rGRVQKViyz97SIcO9rbRsyZEINxuhb4GB+XyrZBZytd1E6LSnJLu7ZHPwnLKj/niIkTdYhAmUOBILnaylgoDhcBEreYFDfyhGoKyxu8gzyAjJ9CKoIVfL7hUAExol1JShtGZah24cG0AU81vsxp5UNmufdTPpJK8XpoD9U2TD3o8ROuaTeiA5xhZ3gFJTHGAkHSl6IvZSEl90nnZ8P6xlNGacLhBxPC0xTfuo5wLfwCshzhX+5YrKWlqLGQkVOOJB/ugJlaLeWsBol8oSVxTwd7U7j/jgfoBsf//Bbd3+jAtu4Aoomh/gVqeepvxY5AzOOrJPUdgwwjPcmAqADXbsAhvprajsuVgLw+vf8YFW9NF/8eKt7v8fFYCP8Y6LgImf4QJAmuR+LzbMz1tq5J7U7FFDMGm4RlzjrQiCi+QgFdKWLJIgPs0BxbuGlpWBydz8P8sI23VnPjLzsG64IHjoQCmLGQgnz1/x8dn4/w4n3Mz/+wkOTYFEQw0qTF528wJDAkDZyj1t8e7T+PMvuBaQZ/x0Joe3/97AtKfOGatOd7hyAJdj9r/SsS2JKh5/HjRkg5LeeP8CvgMIw+lRverxUaISAMPicmkrQz9qp9Ct7LGzX8Xnk9jqzLP/jgHdkKP//Cw3n+khBN2x/gWCRj78mxcEpLeXpIAiZZZ993/8EA+ZacsePkqtGx+Wi1/HmvzHBdH/x/4UEZtFo8lO8Y8wqksDhsGMACPKe1Mz19pUJDIcXvjsJQGNtoBsbxFjCTNNGJABPa2b6cfn/AQ5iD/AB3uhrPor8MmgMnSp9wehP8Co/0mnpvNZqNrtH6k/rAzs+9RsvE7wHw+yjPFaFYvz+GFibF/iYPCG/wYX3b+EIATC5w38BgAkfBblj/E/bX/lgvu3/wYHAcYH8KAQvz/gIMgo4Hx/woOy8AaeP/4sL7twLF/y8JKqfxfNiDuaygB/vwMvP4P9YBl54ILdveLAN9/Fc/ISmFg5NgcICFSs3LFw401LrGbiiV/DgJ/FUZR/HlGIhYOx/nwMvPhOgBeUS/ysHnCfxgERxVGUQ0TJuN5eRkGdw747u0VywpFm2ph+8pFpsXs2QaZq5F/jgkf5hf8EF/2nJLz7R9zReVLLJLhHGvJc6U7w3BTgPw86Ngd/FPCOlARqJWA5XVGNiPd8p6WmKEBlMhAhms4XfwNu6cf8Rbuh0aDtPYGH8N7ugJzwVu6bl7/K4f2KXYZlfwtu6CSjBc4QC/wVu6CVV0/wsB/QkS/y+H9fwGBT/z6H9CP8GV7/+JifmL+KwB/+E3AU/iuvfJ5G8ofLeRmHtiKMMmaepRJqoQXT+HN7SKQxj+Dyy/MMmy/j3e0D+T55Lq0/mu2D/x8IzXf5iEZzg9g/nLto/4hBw/9tCCwf8ZXQf/QBBYP+IgWP//wgsH/xgQWA/4PBr/4iBv/9AByPbTVr8eqNoycNDUemaMFL047cF4iUayQTHllXYv5TDJ4Sn9GWQWMtr6k95+aaMvXL56P8gDK33J+s697WNtwTK6g3/1QQWDE4NZ/wwPSBisiWbP8MoL+FJf8K5nyx/swTu/6UD/ywgsH/kIQQDAVj+Tgv/+DzPn/gMU//4sILB6Sm0AMlMrpqeSvHY0GmfeTUuC+N/OWDje/4fHg/+fBxvf8XiX/8Pjz4oQcI//3YQWD/ish8/5W92ab/IAgsGZmhoXGpicGJkY/4kPWhTQ0LjQ1NDgxzH/DgLB8ZmhoanJgcmRl/hoQWEZDU0MDZkN2xraEc3DdkbzpkOTE2MWFjM2U5Uk5KSkVM/xQCwbmRkaPv/phAWDAiRzt7sv/n7rH/20QVC/5iDe8v4/6h2RaHuOs/ysBRpv8J69uZwqmInAQQolX46YOp+kgTt+6CQEn0CbPE6N2kHS92SP8tfRP+eBKE62aMD1Ouc/TZ7XLq0rm4mowYeeV8u61zK2C4U8bkZFv8gFYef8dge8vgXgOqc8O57oR+aubYqmNc1ukJ9Uqf4IFd8Mz/xgThAHvgpSp2aHhnQE8xshiWgv5LYYt5eyOgRszMsTlOgxe1/Aaiz/sorMw4SW/28VmYZER/8ArMwIkh06r6CKZFdP2IYEiljnghphhyMK+gIgwDLkNifrdoGgHftKxHfUeEmjUNDEIKaC86IM6CeuvfCgoh5dbRP4Auo4b/PxWZg4kB4RWYEQMvWSG0ohHBM0H+AjawVbSRz2NZPqHQ34y6t5nC8ckyouBE9NT5j2YCSvmMdtt4q3hr1QchZkfK+gj3xY/4/Af06ExMtHKO6W45wys3fzDuWQmfFcdFRB1AZirw5F4JG/gDsDIskpvDmeF1mmnrypdXBaW2eqTtzXu7cMwo0xSIGwyZG/g/Af/4Av5SgOzZOyBIue8H5rTerpiVpdtqaNhibPpjNBLkv92IdPhE054PamJWsWqNjevCVFi763wWLM03eJZ0KketvY+LAf88FZmAD/4wrMwfRK3CKvG+Hp31T5wzH6BjisIpdlgnzUeVvdXfEdfkIW/gYQ8/2IVmYckt/t4rMMOPU/6AVmYWb4PH0sVobAaLu44lmqojn7o71FkelG2I4+3kVAZf17jcfboBqorQf4epKlXWNxlH4M2RwMReq8T3K27oS43s9gH8fhsmmUe7o8kDoBYzR/AALC/xh1ycvLNXjNENv7Y9fG5bZ8c4q1kIghy7P/gInJ/Wr6NVQwGYu+vQLc/gAGJForqj1IxSpLy22w+Af8nFCt/JN6TWlWSQy2Y/37banWEAZ6NzoTe7ZKwzKlnA1avyf4CNLpcb/BYUv7Cf8Cv1fJ/hQTiRCmRo/xYUv7BPJ/t4pf2/lFh0/9bFL+xOF5BzrLv+PBTxPx/ltj3/iF6W/20Uv7f1Gxi/+Xil5r+IgWP/3xS/s5AQ//iopf2/hMHv/xYRjd/4KBOsrgAOBpmidxNOrA2PLcgYxX2Ed9bIacibrpLTdSZEHDmxRckE/rIVVsYNgSLQSIqSZnwTiOwY8eNL/ARRUxgHCpIH+AglJIl02j/BRS/ty4vAOn1LJfABwfwGDPg3+MwTqx/h8LfAQDo/wcJ1YgHZ/ioTnl/yIUv4x4KAAH+QgToGHYpE+wMBuf8EErwO1/kQlfb6P8EFL021/jApftBL/JRS/aX/wKAjf4eKX7eQA6P8FErorYAvYC9g4AQANIANt/gopfttgG5/wUSuVb4D/gwjCUCQIS/wUUv2kwEU/+CiVypQAlz+BQBFcAmX//wjA3/vYb22/j/UZ//8JzxX+niMl9yn8kcbkOtNRFrY2HqBsOXKCaNc63Jcbt7xuK82ccaFYgJ1rmiUYlTQbM4CmmQbL3Aa13r0NZU/VsFktFqY+FBP/IBGS//gImK0FwdcWuu8xWhuqSYEQH1pW+jLf4FlSFj/Chktj/rwIbP/gT1/oZceb+soN7pfItDDhFKbfeEXMRDtINZYqRIEYfPdNr6X/8UFHa/8C996Glf4bAzp0G/UunlzoNOakXNLvDW8K8qpnjsP8XBnCimQKFBqVTmfwMBXUruEC4+SRxq1kc68QhgB50US1odBduQbVI77IE9DrsZiH8KArPhBc4Qs/5GMnCh2gHn/qcYD+BfQKqPe6yHpcXzY9ESjoOIfMc4GSkEqoMe71iOnbXx6FJgptf8VHCgtdBa5Z/hI4G3/30Y9eAP+tgQoUFAAQHAgYAQgJ/3QCFC/Fl/DkP1FBPcR/x4APhywHfx6AP6gh/HgAPihR0fw4Bvis/0fw+pQKyw5mvK/iz9kP0B/gIiGiggyxEg03LyiPdSq73mxhDj6/J2m6o4zBbeGH0Jc4yzGqEIrfPNG/8BEWfywPRZ8neO2QmTPzKrWKPcnN0Bz3JYomp4Q123qQu5WUt7VPtATdlAvhjvbzNOizFOxdMb/ACBtYuywatAheAgSdQNVS/3uV0k6GH1XZD5/D6qAz8CxbDYy8nSPACy2nXVlk2MtA7uSbFKAUAI3kWLYg7cwsUJ44AMTH2/dd9dbTWnh9K0ezsNKA0eR7YvWpwVO3cHdWgCAQ3NCknABfJr5ShQzNmYMF5lYOrJt/nz5q4uWcDrUPBHHe11HWdU7bRyjTD1pYxq0Fn/H81hqI03jXWkf/jYU80bX/CAvmi9Tv8A+UpeJSExk4rQSFzfYjkGEdMTDJvAhs/4/EB/BKA7WcEWEZen4Cqkn4ds0A+XrLFWRGjhn6KB4l+UukZgYoOf7MQUv08agAzJAUYgqfV40m8oKEx/gMP4HuuEW8LmTYR74Lby/xkTw6MTSAaiOgf4kEdkv4TAFiFjQr7OUa3H3IWm1LEt6uXpQ3Ci5Rj/Bz81qEtPQ/DNXA/hPAq/gQUmAv8NE0Sg+vRZRd8GdWN0btVuYJV8Et1ZGmkOMBCwUhNv/AMCz/Avp5YArmB2H+AjDTrQiF0Rf8BEcta9axmY8+2mxMQssfGTl+E0q9/4yyVMI0bdCQHzXJMhm9vXBh8wVaQO/wRkpFP97fZjIJHKuVTqjyK6wX2UlkhIQWx4xjf+eCtnfAXiX+Bfsy2uKwwkIOC+KUyA1hn2B+eNIWQCM5Lk2M3e/xAxZ3EI4c/hjhFjP6P4RfPWRAAt/Af/rfIsB3LuZh4+/65yRI9/xDMZadSEydNAQ025UUS/nEG3Kl5z9iAi+yqBHRHU6fkYnn5DihsL/g4htBQG/roQDrbPH/gQAEgwKvbpSL4CS1xUa11Fy7IxY+BpNyo+lBSqAuQEEoCY4Pv8rEFgH+ZCFg7+DDt7+EwLU/h5bY3P4WSx9xm7fZ8Rv8bEheDF+iSOdGFD/kgnXu5DCqrvalVsTVHogGKICcaKvmiJFGdx28Nvt4bH3lP4AXB903cdXY1B6oPAUtBU6vHEQjReX/wEWKOMyd/gw1q8ARNhAJhcb7+AwATH/lrBhAmgpycD+FAIb/BRXMxBAidzwXovJ0D/PGi9/AYFT/bGi9/iQdSR/h9Tn/x8bM+fwgOn7Fiyv8eH4YKyGn8n6L38Jhf/8eBEbFiystUGca0CgsaH0JVysunihL4MkRqfXM/Ezh4cAjnRiH/w4dVMv+Lg7VO/wQSqIax7/BGh1J/wIZ17T9oeW+1enfhqc7I6j/E+vnGXcVQFyBsn8hGdZB/kQWZG/jszr/gv7cZyteg7cDsFVXapCpMxMlQIAAJeNAwEDBAIJAAUG/1YNWv/w5cg4N8aQh/LgA/+InB/HoA/TRH8OgL+TYfx2AP/wHC2LMKPpgVKWSc5fzJEQu3Z8q48Gly8aYU/BQ0PmGlHlUOPE4Eu38VRkWHvSY0VDDIaX2LLYpyYS4qICPwaKUJwa3RQaXICEJE8hvx9UMk4RPP+Ajhj8ZH0tZ7G9RaELmKUr9N7K8lGNw0khK5owpkQk0tcxUzYIr3MJoZ7cGa0LQQm8m10/h+IMax8z53ZsznjtNqFotiMNXCZoFmbQiEc3Aky/ANFKGVfXdCpDHPcApmOOpYxW7caR0ducY9Azniu7rFFyOtnOW2eNRV3wji0QEYPlJceXqiW9ixBhQoC+nMJU1wp0WKezyAyfJAkr1oSN9MGU/94FzsU+YkiSb2bbODgYj0KzyvY7/gXu+xbMOJfGTVw757DVKwi8eHE1FB0s3HqfkrfOPVBLWXEmQP8cAkMIn/tIEhhApsMH0SRPDF+EchJJw9IW6jT1ovWA1QoODu3QLaMDhC5/9kAkMP//ifRQ/4kR27GFD/hQqk7HHt54f8fGeLOlU8BED0vBfKZToIroIGCx4gPWa84lFhUAaj2bDb7M8MKRwAocod63RLocvjk/RlEAVHc9wjwzPKpchgUrwbhqmV6XrXsoEH3bWiCgyp62aI/vDAGD+elryHQIaBNo9GmY577jf+u/IfehoDewZ0IwXvs3J5G+1RGtjODbq7fXw8sR7fwDeskl91f4AjJzxcOXfyjcoo6ydYBEZyUoaHuNz8kvpUPGfOT7+7weSXR7aowdzejXHJLRf13XPdu+TxvoBzmh9/h9G7Bg3ISfL6zYW5qZUmrTKKG4+yqNNBSkhM6HbUc+3DNlHiA8vcOCnk2PRFrQatQLWytgDnHCXWR1btQTBVI0yuNfrAgla7e1uCrdPj55U9q6SCx0jf5Mk4L4FuCElc0XOiIBBEP7GDyxT7EXm+gY4lNw4uNZd6LekTxXmjlu6Sjk/A2wZz0TtVOdDbCt9ptUkF3yT23ObuRfb0VekPB0hfspNxA639J7g1pHaRUN5vJS3fIxISkkF47ac+kPtGTmR8+nnPA/h2EN5YQEeGjB/gMAEyoMKZFr+/4Q3+NsYEPczOY022/x4U/qjv8t9Av8Qlov9/whv8gwhv8RAsf9/whv8WQIbzQquKE7DUuoTdAELQZwsqchj+CbWj40CR0sy1r3TE3oqB+Ug6Ms2c1p4gyoW2pvQhsF+wrukntIu1FuOZGsM2H/A0Ib5cAEuUTx8AHB/AYM//e0IaBMV78f3/CG/3zwhv9/whv9wQhqe3dXYE3wCUGDFDPCAxN74NWWhkTB6OnYKiGpZeRD63RD++hHjwoYRa5HEVeyrQfnaYl3xclccCE16A6ll+1KXw3/kYxXQ5Mm1soTCXj/OhnDOhVuvZy1quuuU/MNJMJaHRCuQmtfv14dsg+WPj/jy1gGFBIruCw9AJojQb4S4HnhAJtzcaDAoValJ+nLqllkeSG/wEgU1K8+RUSUFvb9Ngl/hhDUv/xcF8b7P8E18uaPz1YMAKwT2T2iFgNnL0gJCtVUn8AbEcQE/4CGcrKhr7x1sJY8PK9GfH1ZZsYwPIao4HokXaKqr+q1/gRs5A/4NbOcipz0zvEtvLV+TqlccgyuvlBObVYcCqHqlISr4gP8dD+0XJz/sKT9AcruTY+ZTxSbAoy4oZ0aBVjb4IF/jzZ+qS2D/IFi5dNr9YrT84pj8BzTh6GXgfGZH4K2Ax5RimxGteCQ8d3B7fesC+DE2dmrfX5rdSpOzjeKAh/7v8IGp4M9/tANTwYAT/2IYwW//+B7AP+aE8Jox/wAgPc/MovZgZHExJ1HziX246UGjsn4jeEg2Lw6uu5XSESqa9w1CX9hAosI4FK3Vxl4jwC1H+RoL3k0DlBic6guqBzPdAK3mHb6kXyyX9p3OuxYjD2WGwZZW/wQNO/RlrAQM3iiVIHbJwHS+LHfh63xURtVacXVnSh1MMJrf3au2EFqW2kAAECrYmVwiuH0mxp+0ZJvh+rsgzSL6f9+m9QAPl/HLEDaui7RJtxg9o9mu2l/Rzzna4IQDhVKkhyR4mN+la7qB/h9FXWEUKfdW+6GDyuezFCe2w3jGSNHClnbY3dVNmTTwRIbhF9csYxscv8BEGYFZpaX0FXCA6T0R9sS6m8yLsGg8cEVJ+jEtE7i4Ie8F0eZBP3srM/O93w+nn8MmhEAwrutBq9LMJ8ikRSW/4CDmnSOC7NGhoE7pjNI0Q2FF8Sh47J3RJP4ANAH4rRKFPGfNL9OaGU47gNqi890KwCOqLEqSFYO+ly35xOLilBSdlGRhvN2dc2ohJqM44zfDiybJSkyaPSujCcA++sNfKU/gWaa3bsnzY+MoNuWUZmNzCx6RNvGMpFUnvky/jY/Q5GNKoSiNn0oGaCt7ncoRXwo1WnnrT2QxcD/8HBEFn8EIvMF15n+Tg/Fb/LQJhC5h0p+P6L/joMuwP73h+Pc4c7FJv32pI2dqpIkGi2++/ZY9/z0ZIIloF/ALRX+NL7e5t7nGWEaTCZlX+AkRlyS9+qZcBiEBreVQvA01o88s5z/hiH4jf/iGEFv+Ch/kbU5Gw57QAJi0DYOoHmaHF+NqdDin47YICtP9DCC2/8ON0jT3AIZ/j4gLCDEzDYZ+EBjZV6lO9Ukv2cZp52S+DFhL4kLC4nsIw74BITV8CU0yLihph09cwJuWMoNJr596i62wZE4W3NarmyM/MoDRK/J8c+0palE+UolByOo0sUBjwpLyTSkUlLx6x5oeIIP+PW7f4qFaLK+C5whnhAJFcnH+AwAR6gwU7ov8WB1iP/oBD64BwwkQ/4r43P8VCgUv8BxO3444Fv3qWaSXLW/wBLQktuMy4rUB8lAnjOa4UwI/lHOlI9q5/+KjXb4/wUVRsDNj/tAqjYgBn/XxVGwAAAAA//+Ko2P+XFUbGj+oH5f/+Ko2ALWAPYbR1FOXhQbCb2kmb+ngb1gwNoL9HGAclniMPmd5NEKxaRMIGBWBRfHPJT4FRqwak9npnxwCF7PR1mxoobIZK5kuH15u4FFoJr+QersWICc3ynAmDz3H0uk0rOc3z512GMoRorlh1Q0KiMf4SFAaYrV45lweLYdjFhPZGKf0Hv41TQM4glov/nRDOujtBJjuT9QHpKI5a5Qif6YjigdcIJkEZK62yGApUvx5YweaJ4og0aPcRQ+RQlBJ/v3Z1/yAEzsKkozty3gqG9hN5uFRs7Gtto5kKarO9B/wB8NEmK0IeN6USwPg53lv6fZbdfzARcTbe281OzC/jPfghc10TmhvniH0uIA8Syu7qxkt8Na3umOvCC4vjkEEvXpzS73FJRoVQYRaJjwe1V/ZRQFJYG8C4XzDJdE4B3HDf4CG+HzgeGKkP0CHHIHM2/wEFu28Dub6BLIJoCxJVT93nMOkPqGfUCtH+AggvL68IxyVlM+um+l9lhlKQT3aFT1ei4MTKsndqskqxWrmmCLzzI28D+E0YP5ALtgeiEAmbvlIMG1xWUQ4Zw1B1RGAA7L0LMBGb7rddg0v0gLkCxP9CBLXv8ftJKB/nwOgbpJjgTf4eqf2H/GRXxQERDX8EwYAv+uAlf3t6+/+Og8NkvxMWwf8rG6Lv+QBPWL/hJOy/ksCf/z8SCc/xeBP/5EMwSDv/qRmCRuf76MMEhB/18N+J7n+iDLGnlCD3zXTT/AR2aGUJOmofRCtXRJcN1Yv8wnbQKZt1dBZ/FD7JX/io0qikv4Fg6Ob/CgID7AIn6/5ODqSyRDrOxp1Uz/ARsF/CtzZI21xzpl9THE/DrgzbVf1w9djX4/PS7dn2E+p6WfZCFq7sMOtLQ9LUS9BxUn1g38CgZwYmJdzGK8/pd+30MgOYtT1Rs7irMkOHwb630GW0wpsioIC+0OomVyIS7dwsouNvGvLLwffrn8gwhmVBWqEAn5mb/4DABHiDAn8e/xUgLK/wigeq++wSQzL1sCxvKcnLCSCf+AiAcsJEU7/JATDVyinZgMZjfP3DU7pNryll2nroTbgqQ5fQV/ecOhdJKS9pLjYNcpVy+cz/cKM446ayFbA4Vvf/P+DiMfz+IN+8FLoKJ7Bve/gXT7+UYAWOYPmz8naj8skqT30D5M+8dPgstOCbnJvbcdEJCK2gdR8UtBqkjZcB00lc29qSLLBNCdz/Bh48YIL8twH8BgAnR/ggzDQ/xQhMmjiH5L2oGAEBgfwojN/IWofwCGrf8EhFohLn/8LRiwWSf+CBcnAqg/2sP90/wmlPfz7BWwMwf68JIAv5+itv4kBY/6aitgRE/0IL9H+YGngX8IkAl/CYPf/iwX6P/wUTrAaABvxuVOaHdclYD/gIZrvLDagujBwTb1xjAyEoiDQ4uPD63cZj4pX7TkkU3pr7Gu8IYULM/9MBTtEa1iHgwmxFxW1zfjZz+BYrbmMegEVpH4ADQ/gUGgBgMA3v+EC/O4B0/wLFb7wIAdoBAf4oMdcPyIYreQBf4V1Xgtin+WNG4cCwc23XZOnhRt1kXeMAgG3AIBywUAQP8FFN2PR/gopux7wIBy/4iix/kFAKAFAID8wUTqGkf8BxPwDb/4MKbhQIwBQDg/wQTqHgRoBgH+KEDO9AgJG/4MW6U9IMBgS6CgIACAS+AQH+JgWnr+AwVABED/JBOuiCRrsBgYWCgMABgZZ/AYI8A0AGBln8Lx8wGgQKA4H+RCddFf/gUBGBQGA/wcU4h8H8Bo8fKAdP8Cx8w2gL4Avn8Bx81iA2/+CinFXfAf4GkAewB7wHLAAAgAfDAfkBt/4Dj7xoC/gT4PIAInAjD+AwBoAAjYCRgHLBgAIAJaAl3/BQv51vgXcBdwOAEAGBAYR/AkhuBhAGF/4KBfx8wAzn+BJDcDOQNC/wUTrtKwGv/wKANQAHGfwKAIxIByv//gV9z/egK+5/H8D7/wYFfc/hNu2/h+re9GHjl0yfUhkQFeyWiszQdVexqrWglxkhA3VQlK5MHDz/5cCvjllyS1vlrSLvc6jLuRXHXYF6KGxRUr+SyVfhqd9k7g7P5ALgfv4fA9/4FNvRs7VJZtYI52txC6NkzAYCOsWlAIr6MQMo2nwVvwAgIzTr+FDO7/Bh47MCtcIBb05WfwGAAlrBjL8fShcO9xCd1GvXhJpiTRGMPdX1ZUOyQFyDAnlIhN/DKs4ejfzoYP+Gi1yP+EzTf/PAW8KD0P8NJni/OQo1831/DvgH5lP8ffjv+fizHhg/30OMPALB/FYM//D4SV0h+XN0GJ2jbuXIyGdCtXgESZBKt/L4B//BGikAk4hP7kHaYP4sBP/8fHBMP/IhyD4BYP4+Av/4MFS2cz6y50bFiXx/l8YrSyyur2lENGf5nAPwECJxDWbQ4kVNng7uRFEDNIyFfRufSCoP+Xgh+g/l7axQP4rCX/4fPb/4TFfM8ogr8K/4DTpIGzIG6gjKjzE9hggI73Qx/HLG+89Kcsf4EebOUfg0BvD3mElA5sk7BH15RrDKA2e90AqrmXYGh+luFtlhJ/hbUX0/4F1F+f+ClC4R6r7/Fxdw5Sf2FqLP+PkF5Ug/jmeqMMkwH7xw/h7UXdv5cq2gSNdf5+SYIA/t/UX/iIFj/ptJgZP5B1FwQQA5ABoP4n1F/4Vm+tTQz4DSYODoAOJD23BZ3Ev06hT4tsPJM/eOJ/J1dO/k+PPHBKnf11Y1VcDM14AVdXQKgxWlpdxaYNm0A0Uj5+E7SAzCY22iZCpXob/gVJgS9B0DSpsbDX4/gNAVdQYDAOH+gXUY/gZJg4f4ESYP421GJD/hQyGx/AaZYA2/+CBkNi5ACAqvPVNCIx6OmMgLN0StucdxjAg/D+BEwuA/mACo/weEp5f8EAVFACATEAQGizn72/hhnIORT3J3X8Qy6RBBT3f8CJ2nB/igZMU/gNJP/iTaG0HU5Qh/xLgMDKAUBIADA0n+IdogDWQUBQAMDSf4YSoYMFAWD+IkqGg/gUBGBQBIP4HSoYD+GkqH+A0mI5AkgCSP4D2f1wCYv4FAEeICzEC8f4DSdAX+A0msFAAADHwMl/gVJ9JQMo/gVJ9UANZP4FSfXoDg/4FSfZcDoP4FSmMEDzf4FSj80D1v/LC38M/j4rPr/xIgkLf6kKz6/7b119KKiA2LMBqqVENnZD9ZQe1IJv4E9Pd2hBdC5x20xy8fmhelu0UAa0FLtumcErSMlYcxD0lmB8YyQxuDJ0xxzyOf4g11/8TCfxH+DkdP4kYQDsguA/gQAEgwFcsv8nArHP+JgkfT/IAn+3/iYFHBz8IDf17IxgP8BgAgVBgJffSn+JhUAj/OQW0eN4Lazp1Nf4MPKqf4FMHi1s9VemjIy6xIovaseqMn1azBEWYpH4R7nAtQXk4hsFXT+G331shAWa1fb+AwAT/ARFz9/Ab7y/4mOx5/5Hfff8YG75/d/zwg79eIkrBMH/HxchM7f5IDdsv/GREVjR/4OIisRT/iBEVjSCSTaoNa7ZfjFChNh0dL3W06Zw3/FYA//tgiKxKQ7ERlkCVP4ZAHwNiGCR1MT/PhEVijfw7fHgH+PBavHlMv9IERWPKat6918dXfqOvD911zYcR8yNGIAFI78taHkvxA+j7kzj4/4sFAjP4E6EBYC/9oGl7zAA5bS2w9vHMoYkNZC6DY1X/wB687b5++A5ThuuZ9u089iTUhl/sYJH6//8XnhfznxQf5CIRAwh/Kn7VS7/IBHg6SpGjTnoEX1Rtiyr3CYCt22UjNgR2w7kfGIrnFT4glnSP2xRb9a2shttheYmiIxJ+n/1BAZGaDxsjGWr3+AEA9+TCYVsrwGGBazb2r/DwyZ+tCycoBy2RFDCm5cVkgPilHt1nMiJaeY3TIdMQjvxnJOeK3gd5lOs0Y+lPj9byf016eTN139/AW+lYYFqfEhHrm3DFMgmxDAKRCHEYckRXsl9A4UuTX71aJoUMItcm09ZNygmoFzlRHDYQ2ksajOjvP8P3UZBt7JGp2uJI3QfqLYuiraYY8iMEVfU9G7VG8QWdKUVXYs000+p9ElY2h6lFE38a7VF9vzAD394GqCiYix+s5tPrBuGlAd35o9YYdaIi/vXVbAq5EMrDZomdis1l5PF0/4+xg8DRqpZYOnPJnK0r/ZjPkzw+BD1dkpH+AgzF+lJPJnISqeWsA3t5hKmkvrpBTMZ+yQUEvvKeSFnqb0m2VkdrvXXnhsikCwbxFDxsHwYY/2zKISmoUEY1o2loGt+por/bcuz6nKn/BBrI7ynJMAZceGUgXtbiVNKDiBtLQZsrIL/Z0PXJvCKCRGBHr7jb6f4CKxeAxt25LbuD+AH71IVkTJRCEXp0v/CBU9oqh/b/AYAJO3+VAvV//LwKsQBhTSZ4H8KAQnz/A07l/hQFWGFKv/QgCrDfxfvvfwGBT//WBVhv4f4/v8vAqwwGBzMh/HnwmA2fR/6eBVhv4TC+/48CIwOZkMAg35tOSVZ7ivsk3MmELlfjuSdfrCvk8EqpA/OleKL/hxIz5cD/HQLUTXwgHW4Ib/wIACQYFXsX/dAWonl/hZ0lA92CCVJm0P8bA2CjFzjtAnJW8/xBbp/wqcGfwMJsPVJPfwGACU8FddH8unBfPAdywe3qTvNRNHsM5D1TAr+0rB/4CNsPSBy7cwk+/oIqHv9bKYyUH+GeYNohAJGAZ/+AwASogwSrF/5X5gwAAAEZ/kHmD/gO9EQI/HUQSZp+JHWWHePeoUROWodQXxRoA/GPnDc0OLidlOv8cDuY8t//4Tij/0wUx4/wHd1onz423T+LONXk9MS8FA8DdcvGEh/wBPTIFcDudcYlUqYJaTYf+KAR21sgemEAneutoMBE1GU/xMgQkf5+RSHQJoNNJdyz/BSMw5/CmzWA5sD1fwKBBA323/4sPTGv8RIo1n8CYFV/D4EH/H168Qfx8XTf5YJalP4y1Z/85EJK8fKeewnYgYfw0EyZemXVRt/FJ42oXI4bbFHSw9KEWdBoQI7DBl/AAZHeiPBeNqH5gNzVHK32r9v1hIQJIVjfKlTCzCoBzxDStKA2R//gw8QFBu6EAlxQzf4DABNb/CCDtl/jQyvHv8YEcHBMlfwK/fqlXBrlRs1g5mX+tU8cmRTeTOpSBYfKvb0WhnuZN0j9MYyufAxXOMOmwR2oIBuGpR036rQ6dwF8gSzBc4RA1f+FEJOYCpuH/FR/OqAqW+EWACkYNwgRJXBcjy/x6b3k/+OA27zJR5kPlFt2dGts0LSCqAqTIF6G9SYYLaFTjqv8mhsGDEKMgLr/2sNvZ/w/4v/wmCsf8MUTf8DAo3+Rj5WD/Ww19ux/5qG3a/4JFkf8fHYc/zjg3/FQoQCZTLr/gMAE8oMI5rL/Oxv+L//wV5r/2Ab/IrUA88kp3HlbxjJXDAJAqVD5IZKEmT/EuQtA0cpzKWEyW//DwK/vfz6CvxfrV2ALgSloohVEZpMJZQ1Ym5iaqDl77mS35JbuYU+hIhl/v4lmFLf8BFp7yQHapdYjDLwoEmGuIWjp451Dxww6gWwT9+sSiieJCxo4n6H8IFbBWhA+nZKbNv9Kh161lFcT9emEu6kT94ZTft4NFJ68Bmim+xiaxGZopN/Uc0gyVYIk6vVF8pdog5WmA3zF/vzieaGyF6T9E7nV5YYes+4DDx6hG17F/T0TohoKaNv+0CF2FakTlmK3Ejt4w1Deg3YtReTmty92dbfZQwTnabNVpsx4nRIe685b+fGWXFYFbzT4aUmiNkrqM2At4v7nVCFzha3XG9WPhlhfqcNf33slj6EDU2JHRlbxCIj3Y9JeItf/gI4ephaCJCVyv/M8sPemg+x0yL1zKtZciSH9FGITY/E4uwvLY1Iv6u0aAl56j6p9FgwsqEi9RG+wjokFG4CW6YQfDC19kVdfDMrJfqYI/uh08xinxwydqLb4FUDPVbHyKBUgZ05uPlAB6ZHP9IN+k1859MzbsTdvoBWCeCnySm5zpDuNJK/uTEFrZZSFLa+GZR95CKNbifYKR7N0KUGjn73UN5IPSic0uDSsgMdbsOEg6fDWfZzjCdfv4ZHGNIC8vbgQtfOkdlmHn43SYxIBf6KBwr/CsSinu92ySCCrQVuotR91MObawdEn4ZlOvYPrpEQKJjw0ZATMMBpFILFbEdA1H7zoqrPu6mAAl5GpKl2f7VotvrojCUUwFCeaKzVGZ0wGnPqTbbH0Uys7lgq1ishsV+UW8y4J/tA9jaVB72P6fzARaqkLcMgIy5hex9lgx6N6gbwWLIXE2xjUbSzpnoqtCKaI6fBrlxlWAoovIKkqnOVzD06Zr8vTEw/FnlO6gmUTW0bsqvhWTGspAG74yoUMxKgl5T3ZM3W5GsIk394YciAhQISpqpk1byUl0Ho5aJB35pzg0MQuTKCpCybAyYALJ3hqdLSW7hjO8xnI8p5/gIlE3g9t94xbIw4nW7l4hBk0awkZwYBubEvUMqw6Vtxu91JPSsX66lGpyvQtHYONdRdgxN3p2OUAGpUEdbHiBGRb4QOa8zsFydYnwYGLtO8GAoQyGF7J5mIOQJLlPyBpVhy9vea3R+Fa8qFW3de1lnQdyHyL83k2qLpc6Ksb3cEtYJ3xohzOECZ8nsw8FM0Kcyy09QRJHfH34I0Ik761aeG6aWLDNpGuqOiWZrjUgKnyKs45CehghKUQODng0WinWQDOxyXBkWB5sAauadSW+z0/ZFexYKccxhkVs1ITrIQMPllOFBMqGma+CSZpbpC4wxVDCFR7Aoiltj+kyJs0auXVdAarpVNCEwDTB8i800woK6Us3B92y2g8M4bN4V0gUkNULpVsCrkukuYGNQYbvNotBrcVQWdZMe3XSIPcWQHU7cvf3eZRPsDfVnCNeygbOQ12wVZwq6g30VaHgy2iZeNv8BIQlZqGp7YTFEP5Kj2RMaRxxxgRPQRfYMxV0f9dvuFncu6uF+A1M7/gIN1TGGkG52CHNZo+YlfUwci20LzP5DR+vw3p8HdKetpXqnoDUTnP8BBZ4/fVDIaI/AyGwl7ehiMSo9yictpUMP0WqcEFOTQri7Xu3mzYGw/R4KBOe0ilojqWEGB0RNX9UyJIYQEcgOBvoXnVI+fF8paymS8FqYrELgkZSeNIckk7CsjuzzjkSAQRTX3FtbSmiPEkWYp+sopskFLwQ5hI2IWF7tsMMMMxWyn5b84HCZVgut0QwPid8dMlJu89ulR/AGqlHU8ErLh4ZnFcDKtEEvB7G2NSqAWqdh1omr8nuaEbeWg5eBPlX6OZOYSH5sn6lOzAqBCMw3mCRgxbuOnAAuZSbPbwHx9SJcR6NN5KhTsqRVlfwgrZdrijPRm8C2ecwkrL5E8EOQ1KGdAj1hllYZT9M0Ch6OBNqYO1lxlePxEQDLYlLw7iyAHaVARogV8D49R+rFi2d3+OhSGds/v8/D/0kJ9zCz/Av+StN81MOAnPj3iVoWZnrzKUqEvu0GDqHStLVPAuxMazaQEnHkqiFyfIe1x9BBF64puwRaXXu98WUDB/6jDC63+EwG7/SBhdbysEqYKLsBIIQiFhTzFPx/gIK8rcGpZTBm9g7d1jzYbxODHSlQ/hlPYAv5qT2P4DRXGNv6Y2+yOS69kiJzCz2mloN5nlr1I13e80LtIQOc/XYrH+FBXO+FH+BVc2w/wkc+iwYFinv+KgUaH/HB6Z8GR5paQNNP85gf44MMwEn+/01T/SQuQ5IfwKmqNJATlYKRatlqA5+TbK4g0NjK00daue/TuqnuJpdnW6yC3+MjggKO/9ETBAX+ZDJuAD/14VVDoH+PBkWBphAJE/fT+AwAT9gwXRH7/UwyLAAp1d1quGeSP8SrmziP27OM6Ohj6SShmoc4Q9hzU6qwXY9r+D8xzi/wmc1vMx/jgThnfv7/XNv9PBbBRyv5PT7dPRG8kaSPx0YgBckgGIuLPgeO10QFxumxOPLeuUf+OCRGc6//8Jtbf6OE2tqTljv4F1bYmUYgopCP8AR2fLrz0mxGg751GzFrIygLTQp/coeg7xkMn+Aht9HzhZjG64QCd9fBgwERt/8nJGs3+VgoT4C9RKd2N6lo/4GWMYKrYHs/gUCCBesa/xcO5vQn/QgoR4HAP8nIgL/+M4yyyAvMuFw1ykgb/Tg0RF/kAYWa3oB9OuyxSa/1yU7j+O9FevOt2hbSwp+IGcJP36h7u+FdCKLKiflgJaBPYgCmQTf8OBnWIXTbj9mjG86seLxRIm/z4O5vfx8zjfxSszwFmteSMyZ7/BDl8BjHgygKjzEH8Mgv6j4k8pxfwngAgKvYSD/swQMbNv3+Hh/Eg/50IGNgOB/FfAH/CgkV/hESNnH3Rkjn+Lgy//WAUJ9/A4dfjTJqrf70GmwiAAAP8ByF92TiC5r4Pf2RPhTXnY59jlbTzm7drGk/5WHb3v4Bgtj+Hwv//ZgUU9/CAD///4Prb/iCuZ/x8CpXf8+D62P+K38b+ByGP+Ggj8fYhDVl/9rB9bf8Phv/8VwQ38TEAf/z4Prbp/8gB9bZgXGRiZmBkaf4iHdWv4DAGTA0Mv8DwUU0cmBgcmxicmZgZmxwZGn+GB3VMxYTJlY2Y2MjZjHRGemFmRzpiOTc5MDM3MWNJaFAwSjJ1/xQH1rmhsY/4PBhhr/HAqZeTf/sBqrdtdyDiWBGtUi37XBGDD/GhAjzNFf48jdLZai/4FoGI0mMdvxUlAehrUmcFBASVow1uCPO7iQUe7P0C55fusolrQrSeK7KSqZzgc0lr3RLRyAWIndT4soGJ/h5XYFkXtzXto/jFXY/hMBu/iJXYlv4lzGiRAO/KDOp2NWmGZQhBQxqEs7z2D+oNk0I919EfQ8YPrD+FOdcC+RH/BgTKv0QgHLowL/AYAIHQYmwVf+Lht7P88IDwkiH8P7Wn+dAeElRQGr4KCX/HgPCT0/y5R9y3BHT+/Bt7P/sBmKdQSNdqky5W2cSIAEtl7psbQdQz8v8fALH/HIC3/sILIvxONmWkP+RwN/+FggO4/isIDyp1pc6IN1+CacUMETzlfWHU5RgM/j1eCCMQ18QT/PRo4D/D4IByrKO/w6Ch8/8NcEYDo1KUQf5YB17f8yBY+fB/nwQ6QGD/Ph/sYCfxWID/46PQzv4hxRP4yIhg5VVaqq/wga1kP8Y8WAh/AwBuB/4IO+LgIA/08Yqsf58NaSgIA/z4JOh/+fCiHABJDb98/8dF06xklfwYUVf5gSe3v9AEe3YABH8Vjlf8PjZfJnwD8ajas5BYvSyEoLzz9s7puT/xWFx8+lizLv+AwI5wfqn/HQ0y+ySv+fhTif+PwV8//hMgfCz7z5+k+cVrD0Cmca1A4IBaCRhlAABk/08CW+fx4FfltN8MXxSBvu/1WFf/w+E/m91jsiSmRtpype85e0st9U/QFYW/60BfXv49CvwUHMYNVkBX+fwr/+EwROuM09wp/xBPzC355qN7wDJJfPgPEAADL/cgJx5//4H9J//8O0hf+Kgf0k/ALqW4oiJnIGuWf35FDODEGGqTjE1G4KJY8Eqo9g8MsnXsQAwjsc/XLXPj3SA12I1HXFzy7vT1G0HI4DyUhIib/JAqX3yy2nFideBtaKTI/C0+ujaTRAS24w7b9KU76xW6TPOkFc9v+KjG8j/Bglg7Bf7SB87C7/Yge/b8/+KSuP+aA+dj6l/7+Byb0Qa9/KbEOGrOQat54v880AyAR3cSJObcB5lQrrkR56Lq69MGYs1/XH7a9jDnDUlb+Pr0P/wEh5eKu2qm33uMO5xLAZFSDqMRzcpZqAwvI6GwwyTFnGPcH3x9BQ4Cjdu34p9TrgnIquYvJubaxLQYgkydtcvOGWC5SZDn7nDNnFpRb9PbytfuEPlAkhLhCpKk7MUzYNi9aNuRX/P3BDXOvLwK61oq0uapiHjw4xJbYQtgcTRsTKBmQ1fxJ9t2Hfs5g+JbVf/D4G+r2busB5U/cIDRl+3qWDn0T4j0Px0pFSmdveFqsJa+JaMiTwod89LINDtRbMenOMunvQcIeBm8WITywCa+XLgu0sqIq1Ib8FF0m00elSQJIULUO43PhBs6kD8Z951tQKWWJZkDzfwBbww0rDnuHbTe6qaxrIZRWtipEaz9e8NXAj4zrsIQXdTMlLeVVKDd04lEdQVLlOLRzazgoZsQtvXYVQ080i4r9vwQY7otQPYpN7Cx2BPh5K+WOwyH51TO4rrfuKNtf8O7faB/6SOph/7+1m/5g2+2su0ph+XXKHbtO6+UABoCxd2IvLtmwXMnk0pTBGs1TgTP+HuFvf/n3hbj2/3jZDxCwv7YOXJGd7zVuSa9J/64hYmmvDPk2SmYeuv3T/gBSEDW+rxdgVW8G1v1P6nH+YX/cu3fGmhFdbDwnoOEUO9+YwUrvM+f5DubjkljQ4GOV/AGMshV69LSEfLmj5rpmNJfsD9ge/c+Ahl05fvUZ/gIOab9cMubLNQ6XJPpvzOZoxvjIGDVjDF4egBTrEfWjnTJ9XUIutnEifOFiRjbEdrqBpmEfNeoBDA7LA/cqEIm686vfAVBI0yAFon5TfM1YDts2QAQlRu4DSe2XLw7xYF6BRUchumNv3avYTjHaWIpa83fHEQhkYdJh6DC68iTbkGhfch6FaRFsD3rZ9usLtMDvUFcxJmKY1DK/yphDop6mBLUWXD4GWrM6sz9wlp8xP9K5gsQWJ7XwHj+D+i+bRIHnogAkAvbYgwSXWLwuenVOiNwULyWCx//ARhXmHKf6YpsXGHTLi1fwXrzlFdBURzMm80xlMar67c/ICoWVseno9/xp+h8dWWAlm07QOaKKesl7yMc3VwgdJc30AuG6RodwG+Tyv+AkdTERrdS+mQl7zN3UbDKwGpUfhJZGswLDQTlI8C5qnVHk0TovSEdxhpktlNB7Fe40igyySOpBD8DKsdGx35hW6QvXKlj5atzBgjVR6jcfqBy3cMCldkiOeBtGZP1OoqFkQcZzHidOYgKpmmNWvw6rSbS/Tb1M65f8JIekK6FoHVjfQf1PKGDBx5QXsr0eOBmWzoG+o9WpAfoIgepwAXO4rgEApRSoJOpm7EbKnSQCBENMzb/FMMmVBMHtT0aJ7tthzTPsNNKnPJTWYXRQuiqSYyOWL0XfNX0LCIsGKOi29jR9OxzPaPJ7JNzn90ipKZ36U2rGB/o/DOBoGWE8EAKesZF9Fv1gxoT0ocw+cLEuEYmk7A+LBdgyukEIVZECAk8AmRiB7PG5dg7GlpwKQSF6hfzPE9uy+vsa9SCYJiCJzbWx7AB09gfklXrgDB7fJfEBX1u21/V6wJcUnCFcnxmMkqJInox2DG3x0eC9ycAzwIgvvUcwWVNRrbo7Bt2miN/hKaDUGldu0WuQIfq/Ngb+QrzCLe2zsm/a6SZAdlnfJC/Gi3ywqrXHY3Z5SsZeD8VdZss7cwn0sTIqJQMASHdBfADVuH/wEMq/D5LYatUmfXoL6jI/snPPLmw+64zwRREqZsbr9U0UEsscYOx4ZSqwYN+ZOUjDmd1GqBPTJZ9II3mk76Zw4c8+yU43JADiCMv3YeHLB0w+3vWeuID372wqZoa28R2TGx/QJxUXJPc9dghVWYbWb4tJo+tjwTXT6JbKT2D9hjZRCi6XfME9UywX0MSh+s7tdlXfrPhwCCA6YGDaCVm6JYNJITqTO+PRJ9sPfKE2tR8MdvPbp0fI2vnRgHPQfHv3GBrf1Y8FFis0vhKfwOKGxJlnF8/K890km7Y9CHlDKPDP+rgB9N8VaSMTnyd6inCPbYIsEJv3QHZ/gJBD2D63DS8wvY0gHBDyMEOMpU9nT4IUZozcm2S42/fB4CJcGTXzOXSvalugqSsMrDP6mO/rnzC630WW5E+seOQSbJKCk2Rqqi8IfSswMTn5fQBg2fFrojZ1aOBH/0Q3w+LoS4JQVBggGtethVcDLarWqXGvqaJJixcqO7epdlUqAFiCDC0yk0LL8U3R2MMSq74yD7/CR61e4U1KfIc8aiNVT5xiB0mUOv8BAjf72RCkOMOymksddRDPpAb1JPyUkkz8PBvX+XhSt68+REWvI3HGCpcUA5RXJtRluveE9svQzjaQ8bM7OpNU6jztCgs9yJYeRc9RJ7DKj3GRPuvnhvrC7lBnHuJxovRzBKFNAx8dG+fHfxVQq4aITjH0GBe7eSrXjEMFrtLw4De8tMq9AX5S3hzJ0BjT8c2u8nqlh9puDaX2T0T14xxLa0+yI+wE3Nx8UEmDBuEbKHOV9NylJG3c/FAY7wD0a38C6yM5sxd1a+WB1L2kKHFoYIBCP5VpmqnsRRXiDhGZeMp9DJwKLwtrCHaHYFkiO2BkbqMgAmKdLD/g4TAY/wgHtm/4wB0K2k97xjyvuA/gXX9oG/aO1v5Yt+HDb//ARabn6sLWcSY+gyRgRBGpjG+Y6QwNMBh/igb0P+QR3/wceBVf4MGiPkHHB7/FBWE3DGckIlX+AgjPm5BEX/LwhVF/Fper/AoAzf78KwoWD/SwZGL/BzBV3ivVT/w6BvgPncf50BGTaC+0BYAP8+FYUP8PsRhR/l4Qqi/h+dq/20IVRfyGDB/6ACsKH+FwYOrAAB9AAAaQEH+E1qXQ4w2wAqD+HwLn/BAnAF/4mDhcv4TW+P9OCFLH+ShCmJt/gXqypMnGs2hblzZJvEts05kyTr6wxrHHfnUiSvB59nLElKMTkBHZ7tuTLEuNBjPQMToBy9c3bS7fjtggKfhAI2sTb/PgQ8/kf48Ah5+8/x8G3nWuUGp7u3oBmJuKTNuWtudCcO5qW+q/HQmeE77Fs4OiQpAaCzW82HzFjHGBJhuFXZM5c5wkhuBTL4e3zWruO28Nf5QJBZOWoCBBMmBwAyv4B6ojG9SjHEAky30QY41iapTtM+sd0vh8IT/8ZGvhcf/go4SI/6sQJhf+ZA5C3/HxAmX/D3BUB9H8E+LNWoiyI03wDu7zvHsvPgIpm9kyvuVu2F4rzzXYXq0kgs+UrHQ/8cDO+s1/e30r0uug/m0a4o7ro3yRa+dUotwm7ph/HjA3jUg9/gg9yAuWvPlGVVavbopLUNL+duTxi27yjgPn53ohsXmMHpXCBBNC/xwXAd1f+ECd9o7/18C2Vgb/kYmcRd2qXaUkg/grBGssAVZqigDpnJgE6+QrCsYA0n+xB7aEB//8ZEK/8igWysD3+zCpv4fgowbboFXkiO9mpNwv+pEPV9SL/wEdJHigLmBYllWZyJkW4F18irVMdD+wfdsaedoil7NWlUQSvM+wHpahJ+inW+ElK0ad1hfu667m0qgPVqMESU0+HRwzTVlNrP3JGP94OzXg2e2cAhQhAlMR/PdtVEXDgf0ZG7YnWvTdIHZSk5uGq2MRlu+kbTBlfaQu/4JEojLejHgvupZxxwQB81nf81E0JHWpqYYJC8JSbBc/ilR7O4FBEiI1KQvMMzzLvtmExtvEXBNa/5AIW4cDMjlHiCL5mA4BCNSFyyzdcjbQIPU+ZwyAIq/JnRFBIQ2gGZWPjwbFB3IlQi+2esgJ4YrnmyMt4/iRuLEFmreBnJkjFhSQuoHxjTCugCHPujw/Gmb73okU1Bq+fOusXZQ5vohljzbmN+xYCgRkxqDgGulk5v0U78+mUDmUFFIUUB0L2Y98JNReUj8uIBTKuDsDdj/np9Do/gCnHX1PBKhZvSHEx3Y/169M0Q5X4aCXKtaiEulsA+51vz6a/MIjQMrW7pTA/hTo+SoHLYQCf8UogtP3/zICDOAAARpS8eUir/kQ7aUPIQCVlKA/4DABIyDAoGZ/5sdtKbv+/AhkuB/14dtKbv+gDsrX/BwNH+wP97f1JNzR7Bejrghp1YmvCqw20/OBs8ef54PGLzojU/A/xULKAf4IREkWqEAkKUtP4DABL2DGyPi/yEgiSP8NDfn+RkJf7/Jwpfp/qhHPfAAD/YCDsr/A7uB/rAQUgP9fIMWAAAf6CRAhP9bIOuQAAf7EQaDv8ZI9gf8DL9B/qZA16/gXTruTez7gLkUbfkkoxsMAITcSkSJh9qgBz1Fg+0YKpT5XF/hVARTIICoIQCS2K+gwEOO/4VQEaQDRsGXP4etRxL/CBYzfy4sXOKKBdxJJOa1IPG0H6LYOF8FvOY2GV/w+84bMd1g7q/+MDiT4//8YGsG59gksEPwD6bitILKSd3nffiJdjpD8eZNSP17tAYZf2stPby4f+wo4k+5cL/bxxJ9lTf/wDiT5tZG88Yuxb2ay7kRPWu6+JxHU09QmQAPGMXTTM8C1hMZPM8YDtMGyUJco+brCgCc5kCTZf8BG+KsF02cZjsnb/gJJyop7v/PxxJ80HlLbKi2SNlvH8rysz7hOl8ygRyuNA9bOHwJL9e0qBu67mtu2JBvTM9fX+Ahr4UmfOWvGnYFQvuIX57YD99CoG4ma/48wH8qiZlHNhtzFuSVEOnN1JoAz9S4e/8BEXUES9SO4NB74CoQPL4k1Yz7THcKg9co8VLGHF00TGhY2j1p4ZcGmEuTJcehfx+A/6qkHokSLOxoom49KplFeEy6auP4rdoYfJKzh2fWxV6RtBmQH0CTS8Tx/ZU8QxMbZBFpEorP2ygYqlUlMK1b1W1/56OJPgL/xhxJ9N6pwtTaoeVfEVh/iL/0FlLN/gI/vhp03CRmXh5OXPgwJLAv4GEpP9iHEn3I4X+3jiT7l9v+gHEn30pZVQZIKcpd5fT5+7JRMaCkKfBzoJ6vZRW9519oTKnTCH26PJwa3lfgB2jCgfB/c6k2MLcYDvDbCJ5yqlhi4cn8fh8h/kzjaOnawEuev3PpAHIYNf4CKXxPmTyEvpYfR/nsyiT+jvXiYqdwDo76jgQ8vEigh6I/OaBIsdGLophrEm0ekGsiKAf84C5F3+DEINwN1PphANxFBL+AwAQbgwGRLv8UEq0kBcgTIWz9o8f6WNX0D+E6d913rjMA6v1QRPQFOYhNPSrnJOXzKeb3u+tRwyQbTf4w/+HJlX7FKD/TBCxf/GOCJ/L6i//iAMyX/i+7vFRlSn7+OxpNIZffNmKSJiln9iTxl2Lc2iJxOSR+Af/gT1EZjgT4a0U4sntq2sOrUtYxw4SXqVmOkTF+O7K0XLzL0ZCODh3wB/+ILg/+F206h/kNtOyv4ebTuH+H204R3Cq1rVPKkLjUhnuX0LGCe5aCKo8wkPk7kLflj+k84o7Yi8IwrK9d52TkfyGXUgugyBrvDiT0djWr6Kh5uW3JX+CltO2zZNdRWf9ZZ/j6ef2nvZn14SsVk2+T/F24q2+xAHQ4u0AacZXZe3ksDKFZhxxfu/wElJy5ig8VPZBAhxCAFK2/bBhlbEf4sRPbxgXg1j+CW7XftTApLURUyYLQ7gCrjojOsBS5YA2I20nEbAMUJl2Cxoj/h1PvPf+pHlQm9Pz8n3xn8vJ9+9/ogmee5eMhvzVHwwkDSnU08g8q3tw4pJp5Yu1dQ4diRKO8utBQDP+FDt1sggKihAJ/FuWC2RT/xchQU/weJdfwmeTf48BGmmv/HR2CFIf4IBRI//cBUAoN3n8EvuPNEzgF654L6bi46rNHWk301rV6h6+Og+ssvkGU249tPAH/CjAW4wPd/joetIY/DqfBte/xEPWkfwhsLMHMhEFkNQ8hSScvgKzz81KwyRff6gOioND3Sp99KIUf4SGzieXqb3p/z6/v1slB+pbATITNk8qH5qWYQ4MIZP6w0O7Zb3o/xYbYI/wJnxLV1/tA+vgYQFivAVubDwCpeWssG16vzFeYuFbwyKPdMNl0xtkli9vKr/Yw3G3//OLvLf+JHQb+KHqn8FCWoFqdtH/nQomSif2jW/8gG1uIe1acgZSdBwwBc2HkLiJWZa+mPKZBX12km8or09Cbd9QbjxulGVKNyrHsjA1vYV2VKSB7qiXdt8SWiZ8lMSLKvLgYU5C5HRWXR/x9FNIaAX1XR9TpgL4EyPde9J4FECwM53pXe3KfWapm//gIpqxIAgAS5NNvIjgbi8FvQFAF1OjfvD+feynwRelIjsKGrF4kVm87kQAATYsjUWuR+CAg2G9sFNEt5GwjhmVhUU2cjJR03irQb9nN3xMXWETbUzBYwR4Jzf4fdYlrPnsP5j3eGmdMpHQ2Tcg7z8+CF93FZ+gJz+4Dr/LnShiTDlOG7kn3fN2vuh5cL3AYr3WUH0z+ABmk6TbYR02VR6xDUZglCzvc9ZT2lfKVrHXayxWz8Q/0YyEP+2zXFT/AQaeuX2yZX/wAdVP3PRZ9+LdsxHq41hR0PAOqHJWjukki41JqirOfGuAPawvpXq+/d/ICVq3b4AwE0lXOsRCdI87F4ig35cJgEmIjBIOCb2XQXyCpVntwB/ktN/najKwPL7Ea2127l8P4H+MiohakhARxdBb/XgD+yUlj/lY3zKFcDBG8f6oPxukWbM1LxPsghHHyz+KwBP/iBHE/x4h4u/xZ9kAguTv5igeP8MqWQbPDN6cVX+eAG1AQdD8/h2dUCQ//XgOIR/hQH9k7pNyceWUpNNRi+7zejJVu3I3+dEkSIdGh+SLnHHMEonmGme4Sodl0N6Sh6B8ceXYgrOIHi4wL5Av8FIJdBwyQgF882T/AYAJuwYMMUB/C4ToSfyeE6fwmBK/3+E6f0+E6f5KCmOvItcPYzajIoJrKDPtGkM0o6ikysYylNerRVlujOTok0pQekW86ZMqdox+GRVYOQhZz4BHI4//CAXTtL/zT7Enn+xDJG//88elSf82LQQpn+f/Yn4VloXqdDGcFZY6DRotV9zT/BIjBsjiy8Hu2D2RyLGjFA2373vJA/rD4vOWGnLNEUjVTMGErRzynu+L4shS0iS5IUwVWrH/I00nKG+Y/kYHXh9VEP1XuQEYdUScYMu4XiXDWh9qiN9BPvWC9BQwi+2hCJxhlgJ51/09KcmQ2NQJ0zlcFLVXzDNV1wqC7ApC8JnHnbBIy1NVcuGzzdFrh+OeFAruclfjRDPut2K9nOTgwzQ/0aB76c5CUbtTrwysDSDX+H5wk+5splSWySpQbTaFWx1QbmVqx39l1uyiNqoZIW68bvaQ2k7bdc/3ISKxs8QNynL9LxKGTJUH6JnbbqNmNWmis21Kw/1FosPdDcx38CY8w5psFq8QaUzv3QzEifv4BbbehlDAZyiUS/+AeJxvq4qBvjvRfG8k0jzWjgqdwr/ASiVWZLnNqVhFOsMWUzTYoWHZUVq07hG32oc1PsogFzVYYovAcV5mWIyuiGu2ceDZM2Le+pYZn+Ai0LuZOsYkASaFdIeJPO853MdI/gXTf+tR4Duio8cC0MPbDXPpThCTksceC5nYntETdGOh/ATeq4J3xr4YAryJt+H0Bzk7uflwMwWQvjtggGchAJvC/yDA8seP46vvgj+Hb74CEf8fFsn/JZE7b/ADLxMLZz5b1RBAMg/WUdq8kWRQbnf4CR4YkLpWV8G4glg7QMTMAb5f2bBnFNuW6VYdkrKfxj4Hedj7ln++dfxlT/KBCMXy/4CGsNgkIdQm3fls03LvVu0lzNt0Aad6W6QbU+3DNT9/Snlzv8bIQkgJHOMICP0VmfwGACacGA3Hb/LRX8AHH+VjisJY86oOxPOS5wPihfd8SNuUVb0aJv+KwB//Pw6/l/FUw3BHyPP2cLaOd/DidQE7ljx/54HYsgnD/x4NxZ+Xb/LyRE4ByhiNNVUGxxoI7FsW8ivGoJkWje/wGagAKz6QgHCcOAQYicoX80BIP8JgRn9/hIP9PhIP8EaOvMA0nqAe4MxqFGXo3t2CQhy16TP2Pd/EqVM3Uwe1mTRjhV/w7ZXh9/VLF/v/z9ZXzv8vWV+//og99l5gdsczJ3MajFvm7HBuFrW5AYfI7cLmydkP2AjcRSJqLqb3/HAxsmVf3tZq8kVYeIYYmh/gImbgldwMm8zGu0pn2Lb+O7x8GL5IF/gQaT+YPucGFpJVuAuWEASBwV5Q7j/POyQesdn5UDRBF8O87YpX/hkYM9f4FGDBJPgP6IGDF2E8zt8khTfI/2GoIGGskjr5mVTf4rAH/8gAnJbh2hEBWTTqv6U2TREq17Kw/Jn8fxwhsAPXt9YjQKP8Mz4PErQ7vpy38fDBkr/DsjKeg9/Miab/DqZOywgF8U1J/AYAJcfy8mTgBx/FaZP/c+Enf0mEnf5INcB+YTjJhQ+rYwHI7IrJIw+/EmeD0a/SD3hOf6Z9n+7B71XoI/xwhCwWX80pk6D/sAM2e/0knFGv9/pk/+hC0XutHv8KJ9jwgpXsGQfz+mTxBcieNPD2Qo83KSwadNPFh7RegXaooksKgLGW8HHTLNwzVYurv89icARjm9lRqTN/QTUVijGjintZ0/m0jwMWHl1JR2VIijC0yI18w/IUnggIjg/wPEaoJizRUj/U/4CDM9YsFAcFwKxb/Ef7zDYZdsnIH98cu7JUcHli7wafqDYj0Zgq5ELQejxdCrSQWI7Eie+xxqmghIh/wEL6klVtpTcLosIeypLQ02/Ja7P2mjwf2h9fa6Dbn4I8rnjO4JpigB3ygIY5c+QEHDBMMvar4Qd8DL6LLrNg35MT3nVHwhRpBnedjazBAtNQCRwV6HG7sfpCbG2k8E2HllMKfY5i914WlxM7S6PhJZdzQZVGpo6Zm72llXKOI4QlulJESO+/HeuNSiU8sCqdc4zzru8lbh728XT1aWDHBWrQ3JI8PD8oHqr4Sx+BLjVipXveXhQtVqpjHttqKvJLuY/jR8qgUxhDAT2JsjG0P8BFfCaUNYuxj2CyTBBuqzDEDRkMM+my6+S3AimLsS4zsDRfbYZgfw6jsOUICQIbrfwGACbsGA0kn/RCOx/T+Odf0mOdYAEeGn2fasQGQy58jDumUJIgblxff4OGXzMWlv4IcrNODBRKUlP4TI3IC5BqQjdlxF/xwZsHJWABc8fZTRB/FbN5/lga0SAc8HMHqpR/89Ax5wMQf68CqyAC/ysDgx/5aM1ShNPVv0qT/9eBfFP+QClHv+HPs7+JSqI/ijyRFn1J8Qa76llYfvPGOzPmpCGvG43+PQh8DgP48Ql8AOP87GbZTT29Cv/D2SR/Bnpj/j40LcmXWIlu7F7xZFLOS1PB4UpTZ/P8BDNqf+VjNqr/MhRdj/Bmrzdpm3Dl6bWA3o5ddflvBGyLneBK5/lYW1//y0SH3ZlsvIcisv8lm0n8GibXUxFdJi/X+AixqX7l+GKklT/NRg4h/oAw2qsAA/p0AAQYhtQEn4b48Fx0XPhI8uBqXcc39G1uBtIRJ6chU7jIVmT+sB/MWzLs+yuxRgCciq1eqlV8m2DnEIMeUbWnMZUleBz+JFGsdsCDZzQgIHrI8D/wa/wEHh8zQjUuw/EPosWttBWHhlekBGWP+Mjqf6R/gW+m//8VP1H+Sip+r+A1Ecj2UReWkfJnBeXo5CPKu9zTNKR/QgW3gbZ8UeOL2twjwnI53P5tPbSiK8ILZydDXKiwUVjOH4sR8/x4b5sCc/5mEVG/5iNiPv4TAbf8iA5MF1/EEwt/CdOK/IEqQQFRwgEoT6tBgO3Wf4qKmX4lBN0nvNbkCJAkso+k/h+G81fYGVA9iKPkeN3OIy5Wq2niG7Ccj/wEPbthVenhtReYxlEP8P305XKdf7Cx06QLCYNt9To7J8C5JlKI8tZ71hTNapqo34Y3/FglaL+LcGf+BQUb/egXzL/ggQYR/8ECNC7IyuDe5tLo3uTz/qAJWi/gltAJpMuNPl4hr8POq0QJWBRKBKY6CGx9SRBT+gWxbNv3YbOP8dHIJkNQgGdku//AYAJ0QYJN+P//jkEz/ORyCZwgm36+CUY/wPd4N5/D8sqDhcu6LraefT2EIpsesm5K1eL0XtXIpiZSqw24GGb1/8GTqpH/gJCAaggiph9Fzx1hr/ch0qJ8k9E7/0gcgmZ//njkEywVD2yYeb+HgZ/F/h+c1W09SIjlUFrLm6+zyEV3muQhpcABle1SsuuGMh8Drv8NH/vTUUbDQehG+fB5iKmioPvGojQ05Ff+hDVAtgd42VpbrP2vr7P4zqIeYq6ce6LwPpwl1m9NETmvHaEFgYvhVxW3trGc0JImhefS0hYfLU76AkYCTR7TImg1Jzo5+JjP/BQU8Omz8BkenN/AYAJ6f7AEIKf4DXpC5oI6qMymUZC6JCwSGHG0AJr7RV470yJk2giTkQUT3Lp7/GiFwyN/4aQuGf8TAaG7DAljfOLQ0QGB/CYA5/AoHjAf4WEuZO3/XB3Kj/DCBCAP8FAQlsofwuBCfwfvPf5iFkNk8D/gV3yMsytY/VPygOsdd+aGtVMXuS9dgYRU/GBGAA1dIlXtzuv/4UCIcwgucITv4Ibp7L/GQaIDh/LW5v3XAYH+OBq8y0/vZ/ITte6liaAxSNaO32HvCFrqsA/gE9a/z0O15TwLjfwLCsBxFQJZQUXlKPBbcT5p45n5d5VBShsJfjlaiP9+l1V7cvH/iYkRc+QR1ggEQhAJqc7mDCE8IlI5gt7ZLY81WsYyvOyty3mwSRkobocBKgHsFbbguQREVx09/jDmfAZ3JSMtgnELrIrbMfXl6vj/axMf7/iZFGe/hko0/gUDm/j4gRf8IJR+X8a74Q9O20Krf5YHCUf4TBsv8rJe1H8JhC3zDJawFshrIfw1ctAISTJA6rAH+vAxTj/IQz16B/0KiAH8Ygzf8gKrUJpBkRe/xXLVfxYD//yEavfwTQNno88pkBZpxPUonio8Q9/O0QlfpqikUmEqhka4Y7Bf1+C/+ODDC3cP//DxMv+ngqcHmQ4zRBNXbQ03rYAYsa+S5+E2tLBTPG433SoLh9fsiZPq9/8dG64c3CAdtoJv8BgAirBgQxC/+bjdcNWaxuuG/4eNZtn+H8vziVmnbCQ1blxw1ZkB55g0AqkOKo6K8ro/vuvoKF7miP8MEMdBf9ikUdmvQ+OL1fg23hmaq0nnmKP+hjdcP+Ay8X/GAvz//jgr30c/49gtpa5t8u7PwYTVtdP7//ARED968R7Pkjt/HiPS2MPDfwLRyo6zC3Mb86t+/K0t8SXUm/4CURgM0VjogLH0jeGIpJGTlFcHIZb0LSZea/wEHyvpo7bfHZGcRGsLKsFd8BAlv/Am5hBKQRKSpkUXN8+DXUyXDhcYDUhRWr7G4LEAC2A5+nIUx/gxJq3A/8BE9Ju4y4wwumLq3uR/MYT5RmCEvHyCGMDMwgFFDpZBgUfHf4oIom4gETVdbiF8AAP/8xFE5/ArF1t39m4AH+9iKJwESpZ/yKuQAP99EUThD/nQLYtie0R/W6rv+rEUTn8JhnX8VgD/+YkONnlN/5dxJPPOxFE5/AeJwl37KoerMVxG/rGKxXocaAWOs/TH2tv0IzdTkZJOqNxXcf44Q28D7+CD6iv/4Aht4f58GEO/8eIS3gJf4IFauQYE/3YK1cn514/46Fa2f4ddiolCBKX+HABfQzig/h0Afb1D+HAJ9Etqj+PAB910cD+OwB/+A/56sB7bLiID09VLM1hgNpsAz707RGGl4NvL3ScrMquozuSS1yADnvbUEcs6Fg/s6qBOupLilBJK11kQdfSBeYDS0wE/puliZpIiK4cF7/AQeUFa8A474r+gHkGUtggn7vMdsUc/Ze8teLgjFQvSk5WIefgwNBkVt4tl1FMG8Dj+yRCxk4sn8PgT7GJYgmnjNOPs07Y5916dplPUwvVE5xd49wL3dEWozwij+8eyTJYTZD0XLkIxICNZjLcU/4uAqHDZMkMMX6DSceBKjWKf9vddDzItTniqWvtMdpl7LVFey4ZBBFYRMa9Xg4j7XT/+X4Wjr4dMZ3a05YKQGRUWutDdaCPUtSBPXHYZH8C5i6eTrT5TpfXstW4OjmkfTZLeVgEK0D7hWn7J+HXPgJSV3D/jg0/A/wIXBuf3+aGf6aCnCuZQ0xH/gA2WHuXQryce1TuyVD9DIfuuS401SywplYYYBGL7BrF/h1t3nP4KNDASOGv//G3wf+cDb4OcwMcKUrb/E8bfB8v8QJ2qv1VMLnWwDrLZPh2sIgtZqVKeMFSvV81zFrNaX0n5m/wYjKtT/h9b22f4CGo4ulY6ET3Ra0o+kxN/9CG28bB/gg2+E/9IbfBryDJJpH/DxLZq/5AOrxAs0oJMy9fmStOsJSIgNkw9MBGGQqdxWiHVfbv1ONL+DFjRDMP+AfwwxyB/qo+d5PoF8XGcH3KyG/0MbfBxsj/goYOMvr+NG3dcpWl8f8utxHBqrpmc8aOHJrKSoZfd2bCf44tzXZA2ZAz7MOy/wEisefQ23Xlnki1rVTmJ4EZ+WAXwd/+Ck0Y1AoFf+AwARggwPsGf86HKBhB/jwEDQdP8fCJi8BSzXWDddaQ3fgn+ISAJsoZnOsvdJdmB97JBd5r4scKWUNb1K8v8ANYeCPWnaUZCS98dyPpBVgFYX1BvKOEBaxAMYHn+Qi+vIEae3mYBju7NyOuk1bzX1ZrkbFV3xBAglWOGvfwK+ZNazXOU4F/8cBkXYZ/tBBSaYQOjtlKVEjxIXTBlhn2efGwhmTunMjmfRW7dHvCZvvDJFr/ZCCkxf/+OPpf+IFCOMu2/4WKZlRHo1/4gzXSng3qANwupobOK7dRwlzC+6NbJ23mFE5kfIlNcUy/rgi6BvAZvspFksea8WX2UGzF+rVyfAeFVpNs7ndpV9q23/sRmz3w4oSRwH2m77Ov4czzap0dSJ/v3sD8zYiKQGtLV7UHj7RjmOmEBnUaoyRFjxHn4xrQ2zxDjO0N/svxyRLHMhP6jX/JpABJiqgyWUDSGUbvmgnNhOUzaAbd5c1qPPOI8NWyhD+ajY8goHPEX3f6qNCe+vTUICTxHcXq+nF8/yAZ+tPeDM8EjBFCI1qLkwe5arXqX/to3OeEC4DrHUhG0+YeZvWezs/EDLrcGzDSr5nrR0yIC+MZGz0U7oIYi0yBY0jOBU3ACkbxhQIYXHtLPkYJ7jLINt9bkvDNexEOjmNRfomlVdgNAP/4CEnAqDZI7ab5YEVYhv1p63EYUdsakPGzPUYbsUa+0hnR+NkqKLgvFk/IyuYQFubxL6krega8s+NLUzzPRIfYrN1NdSeBZqAe6qg7SdYu53aTY495ICOibnXs8aVG/x4VlHn8IBK/SJ/wGACDkGBMXp/zZCPE4P9+B5vd9868QjxOD/QCD/L/jg0l2n/7/aTv9JERbEfgf44F4hBnzv9u4/08Mlm8zJRBnW4/4CMl3FbzNWnSKFu0Mz2UajOHhOyUedqt2epiuvPtf8cEeISp/hgjxCGP8cEeIS9jHAEpnxtxnEKKbEG8zTDDrYG+EYOjtMONnI3HRD1zP++iDxCp/4fR6xyBgIoUoPgwA4xndFpsEWdLhNDrExhZqloPcbj5F9p769g3/CBC1Zgq4iyDi/e+TSo+Q+gdYUS/wEIgalYsT837jNF/tgVD+/zAHvYfw/Z7sXtv/wL+ONBuh1faPLrxa06w9m9bWiJhIXifwnr443VO2Gz9hqYVVpWB847S4S612AbwnfrsbEbq/e7KX+DjB5+LhALhI3D+AwAEfoMBZTn/IyWWt/GhS4GHoL/D4A+gDSYmczocxCWJRH3wSGgx+J4jm8naC5gNR6E8YRRGf8QSrFY8WFf2ZUnKj8dMwvHugTy1xQ3RWowAW1IzgEmGJiomGB/ioMYm/gEBBwbvhANBC5j+AwAShgwN28P86AnQpd/Dxto8/4+EBeR3QPNfyrshc/iZjnab90YBUQHqFbaUxRiDhEVKPuODBt06sRNaECHSnGgez7fo7dEFCO6Z8tkGjz2y5GJ3acrJT/ICHEdyVCkL9OdYk+jyF53zHbEEH++nrX8EPVvhz3sLnnYBkxFGUXzUbPWxBj5kN/Cr9AOP/Hij2gc5Z/AKePNi2IX2VolshcHpyq/g7IkOlh6/HvSaYbcentyfzE9nn2P+LFI2b/BANBeFkJ1CAS06Z/8BgAmxBgLs2/4uRI2Zk/w8pGzYMN9SKGp4iWXGfXwL5DCN4gkX38GUTf+D5RrH/nLZ04P48EQgAAREk6T4SFlEuso5QynZy/jqvQYnOi6RCV3BNzu9brEyf+GNqPwT/PB8KR/jZQm8gKuEASEDKr+AwASxgwGOov4WxnQJEJS26Qv9rD9SH8Ji6f8Hz4d/SATb/CYMx/X4Tb/jpWeu/xciun98IBySQWf8yA339qsjktuyX/HiA+Nn/5IRWS/4cOAqD+/zJj+SW0RYz4FV53CLplYaUAV6L36WW59F/cenGDz/ASHplDp0lO1KbFkoKMZMj/wqKO/wLBiQFZCAk91AH8BgAgPBgUXifwL7XcJ/N4o7A6geQ+JVHhxkpXmXKprkWpiMlsn8fXWtCK/Z+lH+ugb7sBeP4K5GAtoqZ/rKRegb8TuJ6Qxz/YeKrscde1GhhJP8SC5AbJQv8MI4qBRa0HTjENLdbVKbEMNnRFGQLRnrv8VgD/8Lf5Oh/gxNzgAf8rFAwf8XkQYFR1KgfxWRx/z1+cQR7yBubvyFd6SR2KvDW1AEKECa3vyNarcOraMN3xoJsWTHNwtFwpzQlhcMY6exGG8SzEhz6MJH94nLDEdyfKRQtg5hI5Vq0lB5vM21j/7heC180k9njlimImLOdrtwhklGKms5DmC8fy8Q7Po81DjBnVN/KgiHRoI9LIvvw035RWRDEN/6wJ46kD/qSHIZw/z8qj4X8syqPw/yAqj/44McDOX+/093/Sxsmx/jgghV//cPJP/5hAW9D+HQWlm40b+Bezp7nu+4BMtEmXrmnR5j+NY4Q+oJcu2++4sy65QZDKqr+rK/xwgqhvP9/ra/+njbr/mf4s1CWwCJI3LY00JPm6CtQfWgV+a3E+URpWpsEVTeZJgvyHBInm//8dt0f6MO26AArX/D+BflqD+22ykZgdgoY4JNESsn/AR9/Ao4P6lwpovaM+cCf+Aivb/OaJxt/hwMfcr/9hlIdVFvMNMKSUC2VKMkMcfQD66XNdQv8eGryPLLx/Av/VD40RM+rwJs9hsh5Z4BPonTLn8j5N+bvw2HzJ5IYZwo9Qe9s18D7zAqapfMXgGdUGcAhmVeH8WL7/qwLk1P4TAbf9IBc6nNF4kt26jcwR48ijxLupL9KOVEwn5WC5S8gb9TcjthJwoSfw5832v9/uOn8lkS38CfN8rQs2HJtyEapr+Yi7l9BoKIL7s5opNGxW/dwV8nEEtTAD/jBAoSZL+A8Jyr/9wWFt/4UQoSeacV3CYcYfZMPdY/wEJ0QRhDCCKJDJH0euR6PVovHnbE2fSEYX+OCq16f/mibsYIBDqMFeo1OdhmTmfboe/fpnRO7mDS5Bcs2iOmCLibZ8kb//8Nsy/3bN2Sm+XfwVoNkIJwdX8PzdklBsGdfRTYBsWKghPreR7UiRHTLSfQUZiHP1w199k1niEs1dR3AWMLXsJjDs2q7oVIMJzmDtVeNkzX7N3bwq8mcKiUEXexAEAJuoLhXDBh6Dj0MWlWiGQBtGcwQUN7u4A82aoPJawWSruyY7ecGX9XpLaGoKUOEMNqYmndKJ8dYLMxVAixV6EIkyVrlAbuh+DtzEes6zosVGDMxkoYVKspuD5MeUt3fYC2XwX1byJ7Lzw2W45oWXK37Ab/X141uFy/w/+bAwTZyMhC4Jci5baEKuOTWahoNiAzhvN5uysjzpkshqXIy9bDaMM4Nkidk5IefCPW0t6+9z1GXPA3AhDVEwUf8A9aCsrqvtj+aCBE+6IMLzTRYCdjYgOJaK4/kKZVXSUSw7KNyGIjLQtEqi/PtTICj8Xatfr+o8eiea7ZzFYHQpSmy7lFLkebEeoom7GJsTVxocPnskeklysufdpqSpKIkRKqvgTKj0tEAQuSx1mGHyQq/NEh9J6njKA61uCoubsRMj2LQp/wL7zUEGBUd1A7j8mRFtja4p6LIGih/zgmxe7JBKnFbHkGRbZf+OAr8yD/2kJZcq/+wjpQf//xmEt/zQSy5AKAIf9/CRHlbqT5VXX5O+OdLw9DuaBcqJvm+RvBq8u1hmME27kyu1oCYZY4Rz8s+OMMJfI7jyU+cQqCmDeoKg3IVxoiZqhkyupOaN7ZGHG2DQVQ/CJme1sseSto4UFC14rTk4fXrShsxrgtDO0S0UQKq266Thbm+n2kSzXND2oPwj1nAXR2k3K2goe4mNzTLnTo87JZudvPiiRnEKdb23aC9sHIf3dN7/g8bsAdC2a0m1y594eP41Sm/Y/mIU3uqMlLo0OO68vUd/8QV3lDwITr3O48vl2oYDLpPiH2UBoTAG7XRHSDlkCZMtwBDu9YbtNtaU6JSwFT+7x6n1NKg9HBXU9mIYSFiwlNlBRstH0SoE2SiwtqAimgY3LO7P5+JorjS8WJMW1FuL7O6ym6Mfi19i6EE1ZD78frZE+bgLw8yR8mmozNfe4AM8tJEOZ9qR8j8RbmzY+rPlDbFOw8HaiAhLvsAq4qcloyevgqoCIk+AS6OCpWrxy66rtT/4CIZ0KXMQlVzVZvFm++olibT83A/xwi7fXn/7CTjrOsU3uLhd4KN5KLYyaRoYH+hlO03+N7BubNshW27Sw/gbLipjMCB5gJhspMLN8zlZ/JPEQat5GZpzzNuYnyvIU8pS4LQEXtvdHe3j4BGmlOz0jKkzZh4CE/8GI9wUFCATI8hH8BgAIIwWUBf5OK8+v8pCltgEedHf0MFYYIH8DZx1chp+RIoK2/BQayjzl879hLKowrkNIERiUSulT7UatORv/CgEV/IQJfwGBWQcIBK14g/wGACJUGEyG3/iwPYKB+X/MATsFf5QNYqf4JAzv99FSooOAf66KlRSEF/1UazkgRvyzM5aYNrwX8b1OQKBqLrG0qx/Bt2qlZbYhkJKapFPqXc4QgGWYncCX/8PZAQLSf7CKlRQQgVLWh0hvymFMOiC0wj7cE2cnYqoH630+ZpGnysQD9NcI/4sCv3fwmIR/4yJNEOtzI1YCaP/PQIZV/9oqVFL/UG7ZcpWA74vwRClhS+sG65fdt/isAf/nEOD/0AX93XZ+F7xL8hIUapCKgjmCQhZ9mdZjUpfmpUBx138MARP8ChLNEDfwmAafwmJjfwmCC3+Jg0/3/EyHgx/igvyoBAJJBVj/EhOfX/D4Y//D4Z8/6YMgATeqAJvr9v4YNiA3fxHK1rI//Phtm/zE/+PAgR7yv97FRcqb/At5nZU71nOu/21fsoO8MCfwBfJpwKfbOp3hRh0wC4L/wOLSbXTsx/iYEogAvkMeYLnCET/Cw3ccpYl/ig2G4iAFss/GDPOzCuQz/XAWI9/A4DN/n42HAz89JVzYWwf6uOaLAFpgsrqelgr/hz3t3IVt/zgHu2AZIk0xhSkv/roLEeCiT/ogb9T/D1jkE/5QVePP4PBs/4gwcP/QxMqJEkdkN9uZZ8XoYJ99ZPFYuToNJWFl1OYw4hU1XDZ6jOWB9rXTlE28CMZfkdMePK3P7XGvklVUDVRhPx5/p8bI9d/no5aQBWD/Ow3pD/FZNsAq7e5/C1QSH8EBIe9yxDuTQfofw2C3hxJ4Uv+MjC03/YxXkR1ov+fB3Tl/38G/U/5YHexf8eBuhIvYksPe/ycGP/xx4I/wkA/gD/v4N5J/x4V20f0oCf/4UKHrAL//xMtJ/r4oes/x8G81P+pCZaT/TBI5n/FYw//IpIGBAH+vCHvj+fwD6p/yMhSrxkZm5cbGxkaHH+Jh2igzNy40MjL/ERQ7gZGZuZmBInP+Hg30tjOTI0NDc2U0tKQmo2Y3E6ZGZmMmM3ZGRfY5KTjFZeEv/FA7RGbmn+aCh3CbwKLHXoc25EYUnV82jS2QN1GdxtML/8FCFt4p/rw27OAczJMkUR5FAAAP4VAhD5BHGC5wgV/woIW3hQQr/4uNuzkX/WRt2d/BIFP/xYI+LP4fRgP4pz7v4lA3wHAgHD/PAgj0CBRl/rw27OKP9Dxt2d/D+pl/pIcjP/iYGDxHXz/14bdnf40JMgrL/HQ5BnlpaUgHqA5P+CByM/+E0t7+E1DECoP4fA7v8EBfMP8CZDr/lAL5h/yoF8Wfn8CwfVq1yfhYhg9cy5PbGYzGugN9TyKzjcQPovGzjIQuq+AocgMQ45q5HntvzaYQJFMhG9aC8YmM//gwRpfFmn/CCaEgHwYC1hn+ThcJmTi6bggGm7lvfyW9tfJrOtCF4i1P+FjG++3rohzPSjAADAQg/TbE3f3LNUk+N8CPJOLZ+n/H/BR7B2mQgFFDpL/AYAIZP+MhcHNPXqflUWeA/gW7UwU2L7W7ANwuPuFAvsKRpgQpA9jDgcyi61jn1Ig4d+o7t7SzXXuipQfrp9FbBv3e1iX4zGJ4uuCBo+EAlxEVf8iDQbP8CXnWKdjIKReWXQuwDi1UU5WWbqQoqK2qEGcTFcaUwVdaVOcqWfwDv8J2hg0QcmvOrWkXrWS8RjUy9/DZBAVNCAT1W7sF7Qn+TiKfvzhMBI/x4SMCD/8E1IubuLHsuEu6U7cTPOjicdd8Dl3eKeHGYcPagCWm9MBOufnusuL06OoJJSOYVN1s3SvCfNg6PZ8dsEF3P+hD46sESv8eBIN/T/j4Zv9JPmRBFNrfrC6zKr46g7uhLi75YrbO4QuEVWyACQPJY2aa8jf4cJtUC8h4xXG/w03Ak5jEL66s1gEmcXbaAbenX/ygWFF82ikQdtqVA5z9mu05fHMjEj/D/HV51O0GQVpAI7CKkNSWDzwxTvSf+CAXu7/3QUPKsf5ERwAEH+qdvPi/n7bz/18PB0HF/ooIy71XtnG7sn1C6K0vJ498sDX+21BnhkcVL1AZl4m0t1MfeX6BsYMgvXF5E6N6b/gCG8qZqXdQ1iOLXyLF+/1YfnefwmA2/6QPzvOblQIYmmlbdWoUYRdgWaWhPN4YVclgg48QSk1c/MQx60p8/xkOvJEkIBOep9QgE54qBwWnsf7AHXkubolu+xmEAZ8xlJL/r6KWexOluGy6rwVxDkR/VzaWIcNwG/hQZ87YICp/9DBjAel/DouoBgP8D4upSk9Huexv8TSAEIflPYqqmnPQh3kGPiaP7ayphQEhr7Q025BrSmZd3kDYvBTt7xIjTwol6TPdZZTfSESydh1D93/lA+mA5vDh9tZfytyJ47iQZjG3eBL/AQjsV73FF8yFfngoYoaQIUE/wlFXAg78tYu+awVC0cQelO3GpSwff4ILqkSD/xgxlAVYUoUMcp6gzXEKV0nRiefQwy5WjQRtRFh2wyodSjEbu09vN3/7KQ/0ebu/28h/o5lX/8BDP0SchywPb7xh2J0uDuUs8U1LYmlK99h5tKB2TgnJR0QhnWpOIc1ilB60V2zftga+dGomdsBsyawTITV+FZWL7i+lf8/IfaL09VUnau2QOCjB8Hq59zT2Tux+1w0olA6gPblPUT9YxaSZMOKT+5KC5BIjQGrwqfJRN2mSEASPa3kaowxm6szF38fgP6Sq+DpyXbg5CwxODbvjDgdPm16Y7RnY/NPqJXW8b3a2D3m0/x9mwHkczSCJSFZKjfOmP0/BJHzgY0CMnrY9yQDX+fkQNGBb9/CMzEUUhtmd3kgBRGw+QfzJ+23JQpFqp4rrYFJ+1pFyOIYB/htuSCVtE+sC8CT5+mRGfZ4xSSWGSYh16HwsB/zwRRgmP/jEP9Gzwz5HFp3sfDZbxzKhw8VN6Q1GhcDGXpVKT24ZQp0YJb+BhKQ/2Ih/o83d/t5D/R6Bb/oCH+jUIMPn0TbwhVmhvY06Txamqf2KSZh5Ngv6Y+wMs5bPtuskJxm5IFz/S1lfBiXtIw6Sv1WBclRUGmCJc2xJ7nMUB/H4bIWxrBzJZPbDf5sxeLr4cgEYbAy3vwOGKm/VY3GK4uWkG8zWsOY7qLWBNgY+rB0Nez4QWGPQEih7sfGgZGSfCSjB/EgbInB17oSAPecpaQcEXU4e8PPnm7S78OAFCAV+3EH+MArppuhthbwlQ/xEUzx/wmAmT9BZcLiqIp+Xp/yhpQdy24+IT0vmvRn1LuG1UAJSzXpfwSdFe3oNJQCxe2z6zSSGFjlTkitjEou0orVu3AOoceqICyzC3+OgnvpzhAR+kJD+AwASXgwBuLz/WgRpDzhH+VkKcTO5fS+tde0xg08e/2/gd3gv0ytAf+KwB//PxaDl/F//unvZyf/w4APvHR/Hrziddwj/A9MEev+wAjSH/HB0118f3t4ieaUNx5Z1KPqFbwGsdYkZxgjvicn/54KSzKo/RP4F2i/qVTdhVfbYPWAO+SjK5B1uzQGRKjdv1JgqBPog0uol0eou5qikX3P42DA+BTu0HL8Rn/rsvz/BiRwgEtP+FkjhB23v86Axm1d/Dycv9/wPpy58rfBmVUUonfwAN4q7lPP0eRARe+Esi2rWPXijjxqENmdPZL3+AhXTzHEJMrkg2oZXh306lbZVKsOkChOFKFRe5oLq40v4fRfxv/HhiJshf1Sk1cf8/JNXb/LyTVwf8iJNXOCWPjU8cDsA8DQTM00uO7GXOKRjAa8YtoMKfSn9ivB2/z/wuhpVEIBHkV/nxIV/AdP+HkNIf/h9anDIABF57oBXERKOo+eFZT+0dNKCgjUIeRw6cpmSvJAH3HIwT1OsL+hdSUmVh2hK9f4AKKAgvCM48AjMPr9xR+fJOwGH/ygVxR84O+hIM0H54w0dG6APBwv84mQDqYsD6Bz5ILGZI1yA6F7d386VJaWaGsyRm3qQuKFlTX/Oea/w1nU/mAvGF/yoNV2mZk6ky7LKRbKfw0sS/4OQ5lEP/qSHMicn8PLIvL/zpDmUQfP+XlmXk/30hzKIPt/Ly6Ly/yKJLQWAn8oVcgkaPyOZP7RFfmtCaYe5wZfYE/Wp0aa9hDUtip/CYdj8hWYd/wKHZA3Egf4uLpvaT+wrTSg/ia00MwRsQRZg2Kc0/4WAD/5YtNM+LEElfuqPtrhuwvvBRiQhA82nb2M7tR5U2urf5BJhCQQN56+AyPivcKBYnKV7sYMx58aiyx68IHIkeW1w+9WRv5LtNP4TF2/5G26zCEAcQgdb/CRf+Gaz7AiSJgtAAB/DALeCjxLkHKj/CZfwq7lCBP+zCD/o4hP6+tNP8sGXy38KD3X8EW3yxmEqgNFn8nAY//G7oV/BAD//xYrOhKjg8OdwkI7mL9/IlgUI3zaRMiW3/hXUEiDbNih6P7wu+u/8rF0bv8DBfH8Phj/+yhr5N/hEB///8IZ/fxDq8f58OWFf9/CGf38VDD/8EF2f8MgR+IhebzacX/2wgfa/icT//58IZ/VP+QBDP4zLjM3MzRNf8SFJixmXGZsaGph/B9taYmhoYGpkZGpgamZubGRhP4aIH2GIyNjJlZDFhNVJaQmlwWnc6ZWE1OWFkODFweUsSVhhNWT+FLbQ4NTJ9/h+Fh5xUd5jfBWkppvfsP97FzcWxgXnCvdZpraJ8Vq51fM5LaoX/HBp4Kn/4QFUSLf/F4lipsQG+MA2oFKzJh/gIFwddACqWow++eXoZg5UtOESEAh9oOF/sQMVy//8jMpf8UJ+ToHMcf8KEmwoedBlXz5YI5PopCTID3/gnY4vmEO/x8iD9g075O9yhlmU5pBXM1uqTUC9te4qurdvYNmAErJXORVsZBwo9OAJUDHTCrIAGLFis+SYMqUGO8Ir7H/gIxeGmpAFBcD/AQmkMmf4CAuZHFxLQ1/wERtEKF9dyIaMSY0RvJ7g+vywE7jMiwDGN0xE79M0p59hRnf6lhQL3AdieXoTRa+KLby4julOvyoVYmZA7lSQu7UQp9hTWlXLTsugZVACmzXDSyBd1s7eBjjwUlQ0q++Eybh7TYgbY9vu8fpthWTrpc6BfuX1zin7D91/D4G+6SBvFvSblMUt84HxjQIfZOsPN1jm9x22d2gxD5lczqEjTTGD2jYOiYzL8YTAWonhF8ISuUnhtbJgOgRpJog4XsfIn4Q+f3i8v0tKw7H+eFQc47Tht8BLgOeYJNhLQAoSYUiJ4/9bmgeYwjsPxphvguh96rXJF/gIfJcfqFKuHkzcjtSzudBHIMKasl9nnwALOgQpxNjL2UyMozspqutGDZ1pEA1sN30srW5Y7VfneZnRU8dHQ8MgxAvCD3qJn5fvxbOqSf4FUsyuz2vvicOYEaCUVkdJkt7to8AJdaIC0UmuZt3i0awVnckcJaOlM7DFhF8Vfu09p9GzFBpfAf/BwzqS9Vut/AYAJyf4yFlWUJUW/3oqf8FHnenOF+9e/daa7uQD1AeKGMPQm9tCGpnGQGJ4WLqD2rZ11psx/+KgaXcJ/wQa2+QFbCAlucTH8BgAk/Bgc+3fws4JBDSf8CFsYQMg/yQpBwf56RH25b/viI+2P/6WIvjf4wazCupUD/XyI+3cO8/J7H+Ajvsb/ARj0ENzQT+6zU1JFaYi+xIkBmhtPM3ybU/qXRwKf3umYkXD9w0fOCIBrRvmiSDs4qbwjpEYerS23WtmPDv8AgHVZs2F01YFgisH4lGVGka7bTJeHC7zFc+ZGuQYuR/t2JlHjdHI5vkVp1Bx9gwLHX11mmSLZjjw3hFjpSvwpIjtwjI9k1tkRJ0Qh+oaXgOlfQ1pLnE2ItqmNXdHnTOFRXDxhXwEsvksFKY7Z8JBFLpS9PjEfKNHaMvMzLd7ihfx1Dmf3GFfkH+9kSpsOxURlP5fCv8DO6RI4MHRdVomznmQe5F7tj/vzMjAfn81HyWUpNEYEP7ZCvwNE/mEV/AmD+fhX8h/10X0Y/wzU7AVHOOSvTpqBf5MWeIP4SuQ1TXV1WO/xkaSVD0bTxIjAfwWVi3+PjaOH/GisYR/kgLri/x8GXt/4oHaXHweDM/REyCv2ry/YPoe8Knxbr7Fg2U7SLOyIik876rDZ4wLY3eJGMM1Ko+vCgejMSJ+LNH1akgO6InJg28jSQKKfCdg5R4n4ovUAM1FolrTvMAdVG5FxEoYibxVRZjAhC30mE/T+ACft/KI5qPnx9wmMfe9Ikq4+N4D7Wg1W44TRRT8GEcmJ5EBDPbUEO/zUIY8D8azbLIoP+5wkj4dFd3KwOP+Oxg/wEcXbqFRIAsyiuOxS2qlfR/67o/U/wcsPbgK5CAWRSnl/AYAJ9wYE+QX/kE3wT+EwPT/fBlpGXtsT/r4fk/VA4n30+FH8PcL6sp8cV/csa0+TQEzZ/T0NC2Dsw+byp4iTIc1uwB+IAzh8+HoT9BPXlHfQz7hgt5KpKcVosMpEDEUEi5ZGwM8df8BHhinsbpELFVarC6PbDwsJYCpQRxXN4kYkTiDIcUe03EOtPNsciucWfwBVfbBnS79g9bn6dYbAuZmzjyHuzyWe+mhFfwBVrZ08p0pICwdZWrXK9lDmxi4O1CC7/4CSmTG+FAUuELO9nzCxEGSjhBgSrR/y9JxDNCByn1GmGVy0/wEcqPr4Tf4YFrSCsB3TIpRVFVKElAgQ4ctYO7xJBBLkjkip4Y5SmLwIg5AUjK3kq6dkVJIaQLnFwSrhIneNQAkB8CjjpMyCKql5QC0hzwn+MzHHmpSMD9mkrFtmvnEUmJzZwdmgUZZrngdAM6lKvykxd4AkPMT+NfS3cq3yswIwWBZy8gdokTMHWOpicJcQhaIQDx0uDWYKC3mOWSqYfzAEhmD5paLDiq59NlkTz+9fw02gbiyNDT/ZyOgVAKQjoLqmU19zOEHACekrvMXVnek3ZwvF8Av5Wkc+HgaqyOsMTQGHsFbyGJsvM8albuF370C+eswBhCC3d5V0RePLitXZElWJ0/hHOJuu1uI1urvB+5wa68yH4FAmMCqfBpo4TdusJ/j0urxZw1YNf7JadFxknWxdTY7MCwgguW6hOcc2bq4JTyEMkSP61WQ0RgKt5/SA+q5xh92Jy+ngJFCpSEv20w46Sfc/5+H5P/4FjeTPuYd8M3rAIbtg0C6fuHtmJbfCAKqVRNd11aBSBhCbLLj/Eyvm38DdBgNe1wgGfnOB/ghH2v/ibwbwXBjKiwjFjv6l6EXmxPdPqpULf/Fx4SxD6xBjAp+nfwWofouoy/4CSUs9tL9POG9FiFpArNIDqP2IWXsKHDuc8pmpmoZaA/xwUoPQX8e1GZU2ajQRYE/Qy8mEEcSZIs2YJJK/56IJbo6uQv4FhbNI6WBPRjmgfXdqFkpERiw9K6I5PyRseD2vidEpuc6Nm//GQwpbKfwJJG/+6BlEYT+CZbNubqWXlVAltzr14QVEoIqsbpRMa56bWrJ2r5nkNieAmIm/hSWzcTz/HRS8CA7Hf3pEAf4kLIYf4TgJWZzXFJ8vcycdsKujzTUMY4CIjuG67qSZxPMCgLrLGWrv4E+2f4X+C7r+Q/gtL/h7g0oX+H52NBYuudxPImjyXBsml45xEAbAtf0j5fNWNcXoQXPaPRCoR9aXZRFnTXwM+/KeJCul7zDGVPgRiOfOUC/OAMZe0vVvIF/8oGUGXO5T+2omjuxxygQbCSBs5sy+jcgalKXTSjOF5/gJK863nc/7p//Ew2L7/gwVuWBDr/DAi3LZX+oBW5b/Dw2OhzuyJ0Z8faZKiTOQiHes1dQHIKNFrHBUWBd9HSP1tcDnk9H8O+2yJ/VPttzfz77bIQz/L07bc38i1pcErU9e5qdvH/rsBdbUs9/pT8jcIkpAjfNRgCQHqGFyy0t/iZT3xAvkD/wQN+zAa1TQgFKauLCAUsuDLBggBlf4uU9+uT/Cinv1/qwW6xAer/Diish1f4cWwgD+DC8JI5sMLl3SNHi4Dvz6fvjiQyl+MyvC6jo+ftg6TfCUKWTJEP3ySqx9NaUx1f4AHQIxoBZ8HmYyRAAMLKTDvr9deko+Yh1c/wEaBAN8pGaERsGfraFPjVbXZuB54AXKOBkdhq4p/gIzFqtogwJJgluOpgwkTe8jm6Aekhmwzlp0ln0sbieoA+cex2YfZMeEcPw7Ehqvm6s7ZiS0FclFZ1+A+Jl1V1U8FgZfRJJZgyUv6E0S+f29DYCa5H2S4YlNEsUW8Z0HOZygW41DS+U3jsENVo4u10zTz8QCBklhDElx1vVSgljD/WBpsiGfm8qmKE1BGIMFOqw4wzLINPPKfJaUzjriroRucu8JtyXChjsMWkuLyjSsFxvrrQtwlIzUOpDi+CQ3uTRWndN5/Zqj5JCCbZ3durTOYvojoGSKTsKj8CmhlFllAMb72CsZu4Apz+Un50TxVMrPjw4VZz7LmfKzJ9LtD0H+ZLXEP637+Ev9zfMuucbChP+VyumAwoS0QvnUeKwMm27SeHIAjYwE5JkL1bOfEB0nVlYiisN9DeZuJ74mHD5G0wqnAH5YSZrk6UgaTixQ2wdE7Qrag8nLgfII/EnPUVNFDHZNf41nJV4DAAMRJf4PiNVHc4QTlsqI+rwZosUZ+VW7g0sNw/4PXdf4TAI/4L2p/uYGC5U/gHunIb7SdD8oqtnfwg+NzCzQmUh7sRu8xZCuNQfP+GrDs46F7+Fxzi7hAKoB9r+Pf28ETP4e/b6L+H7pyKLYWABS2N1QsHK+Rw53Xg9bCZ5zltP8BE3Nmf8gyV2/rqRDSYbcl10enRRuBw1rJ8VqmbYhIOlA36LLOUPn5387WwfOUDB9Dninn34fAMH0MILHNlmlweGGKyrNmAKlS4jD9wpiCSlCLwv8cJlp6JCATKaFP8BgAhhBhHNDf52TLT//zgiZb/sBMtPbBoblXobIw3JyK22I4zrKZ7JS5o0yvVOIjA4J1x8f8BHYZP+PkvhXg/n0Ffkdna+jjRfWS8f7X01QnPkeA/wEopHWFIaWBq1Ro0BEehW46ZFf7+EUjS80TCSveKWQ0EJ3k32ROzmW2j0JkG5u4c4EunR8ZGeJCbZAZhRk0NN2IzGNZq8pLJN5ydmL6QKQZe65FuSGpr5AqQ8peJgToF2e03mHeJRmWzcz0p4tbShkSyhaCxR+npSqCXvK+GtpAJfrc+WNPesk1P8BEBW5LXbbWc++sTDMAsWGDC4aRGJF0w+AP8gq8vjhkoNoG9/qDvgUDjxtkj5IpAkxhUWpcavgI55+RX8Aeeg9XcYl9rOYrxiHvnXTJSAUDoUJGD6LAVbnIiCWf4CJRUE6gpHsNOD3KtmbGLLl/aHJDbHPbU7B7ywQj5iJ+oKPZ1HAnWCCL3MXgrqIC17dEEbwSedQ8YbfRThjNmI1kMRpIxJRISEzquMICHQRfpQHTKLivo6rBxBAvibBGBaNKDK/ApFnCdc14oRzXT5W+HKI35ArF8caoKC952bHUqsivQFmU/wEVNJcsWlyUyYIM88oOtHTLaDAif5SSB2i8qyLHqUW/yKiuOO94dOpa7LCwQW23RZEiBNXs9sgAQ6IqJmF+bTKoCvmrjmqdomiU7nKKZaSQHBa0Lz/gJegx3BtFuhtmxjnxCGKdpGzAAUK1QAYzTmzfjOQ5WqhIz0d+i35J6JTDeHQOFSQMM8CsKUVi9IsYqiVKCL6nYJcpiEcBWwXKiQMmYt4J18p4IB3J80Bj1/QpniQzpcfQ9ZHxK8AUYWe/WQxbOpq1Hkw+tTPSQfnnXu4ZVDI00Yl90EPnvgGPSgiFAr+264ZK1LsZnlXCdskOMsDBJ+W74A6lFYA6BaIc1o8StBmMXNEpcjJUE4KP4BB36qQerSqjqCW/Q/oOZ8Bb7OItfp4GV4ITULqm0czso87NI7P4lQ7bK8Zi0/TZ0os3/vGKfg4XdfqYUfJB1SmWjg1snLCQ71MXcUdCgKEUpq2co5dMFILf0dzuKDCp8xtJVgP0s1ghMpSMeOjJuosA4OQtsy4Xe1x2AQPc83AsRahRgfjUN3AK3oyUZ+Wwt4M6C6aTLRhQp9n7EVPiZA5ajaGepZnsp4dfcOcElcSooL76TxPMBtk+MYj8d0HZSMMEiVwAlwmhfG8WuZWGRHmQIocXi9NJjsZw6cwc8kw6dGnMZwT4e9CVxjJ6dmeda+AJZ/nyHmsLcbZc0htBCyO3N0LAmW8LH1jPSzyN/xqYVwSQrfGkB6DrBt18+LBY1F9KYWcLLLRbRFFG1XIIEDsyoGxmD+AP80RtS+at8FktcSco0KakI8ivCA8RcYP+oObEl2bReFGyi09CtuFQEBlJmriMJzNQg+O0ISZxTBRSMy1i3HSYXu/fxf9IZ8OWxcWh4D/DF58qj9hrIzX6GY4od1K3IyPlPDeR7mycQjaFT8XQiIha5SCKAuGaTAtlppZmnx+fROUEgvyzfs4x49LfH/AROkdFstTL8YC5JS8vlM7zqiSymHOQWjnpV0GGNZWqJw1jl5ykP7fgrmgMyMaUeRiuNfpd1YL/VCkC8RkS80cq08I99B1u/Q1QF73gLlHDS/mMzmGUQMdyjyNiKAlVKoYd+geUPI6U92RBjZhLFGjwBRWdlxYABwBmY7JnTTUYxo8luToHsEevOJyYeBdiktcRUU6a28sAPT5zgR7RyP0xO2L952mBbs/oWC0HcELfgrAc9exTS+QBuxCRLArSdQDYis+KXuzDsnlU4xVtFguUx4Y887PdEMj9VOOXtyMkvrRvOGlTZVSdYMpUw7XVyksDtY6rrMV4HoRigoZecWd/ACXU/dsSFrp+hxp2RpOkVyyQs730Eb6oBHAD1GebrgDSsIw4aM4ZCl9N6DzziTuvbKOPhJUXYfCJcwBSFVDg9KqQT9ii0CM4JhwFwSGhMdBzmZ7col9d1q0VwJX6Qmz/23/UW4OPebJJaT+j7AM49wnjhnFIMGzbgOhDgf44PiQ3H//xSfj/pIpPxe/4F5/427WibDwH/eCnyTBH5g1yqaw711uHPi3QPpanS+T4FMf8eJfPiL/VPghz88/IYhD/L3ghz/yL4ITeDdKXZ87DNc1D+kTcsqZp6XEx5wG7XznUXlraU8zvqIn+OD6m6v/v8Og/0wEZtAbT28PPm44RQrRgs0EbEpkBsEpTFXPgJyh5AjXqQQUNiAr8I5QdzQQE2w1iFfhhBViG0tNdrpxqxB/g4RmY/yACrhAj3vrrESPhAf4uLf0rb/CA4SXYGct+ZmwhYDEUEsB3O8Ab9wTkybDJvEmW65u/4KPjC/8cFrHyr/hgtY+JP8cFox8/+wXLfRDyhr8IMjj4fPOOdDR0tvo87xDcRlWQKBNBiV/votY+qP4fohByh+ZPN/wESWbWS8JOnhSR1KcBnRD/Q5Gc98XY1xlGm7jXULpYoTxFVXKef7cxRM1Mj290MOHLK2c1zyVxPU2qJf4CV2Esr+a314RhwdiS9rLzUvogtQ3FdOFE49UzL/PBV2Gc+X1/Avb7RsSHmql8HFTBLWCwmlc9ajnBfTgY/4CDxlh+QBEF1vFjhBez/HCetSiwgBM8xLfwGACTv+fjeqr4/zwQhUQVIq/76ISSOG7jnXeh5PUNKi9aCjOJd6uf8BG66AQg3ZLxJo0iSvCx+ESiR3PvxPWp+f8eC4RRj/DwK+rVlvCDJfPeHkqXZtGgwtvGQASMMIW+sSXANYro8kq1a4/x+kJbLXWAAizST5X0+gQRGOfAMDoZVocMewsohr0EHAU3gS5uPrSlOY9R+h5DJoDvY5piuVPs9GT1SM75PeUQ7cz3/kAb9qmDdMB8HfcRQqqkt+uzLi3N1G6uOG6rrz28wfwq5I9Wt/1cb1VfxEFi/+hE9an+DQb9p7eZv//8Qkkf5+ISSP4gCL/82EJJBuLNpvCqzksML7ijvc9N7hTbA1h4ko2gDujeXA+/Mvw38DrV5UoyLZQvxnQ82u7feq8P2UpRLtNeoB2ih+N9d7aDnieCZfwo9SNkEF3kIBPVXUQXtLf5OBdWP4TASP8fHDP5/BL+nW2GNYYdJ0ukw8V3OyV8OwrMT/gIb8QxV4Dh0S2RAyOHCQHxErO/dLKfWfuN2kHPPWqNooz455IfhtggSav8EAvq3+CEXsT/EyhLL/CTw0s8KUkyPzFJ/m0D/AFVnUKVeYMf8LPtgalix6kb6i+/4MOqsv8cHK2R3/f50h/6eP8fueyOYr2AFF22NkWovo61UweUbOM+9pmFzL0Ci5blQNldto/hMq7/3UZ1689twseWPaVtQL6d5qJgjjCn6lzdn7fbj1tmrzZB05eWOft1su2ogT3JLltkmTPMc3yFKbSBXJ/A8qWAgUIBIMxUfwGACWMGbmx9/ioQPZuQBJHcV7nX+XUP8L/ywnX3tWdRgEmr7/+AjSlA2JQdaYBK3J48hhn/ayQ6X/CaL//AuyXdaIA/hxIyZfcWBSFEIkM8qLJ9O2IqldmBqt+5kYNkI94f44Tdub3+/0o//TwQX28993TcW+N8N2SG3mmW7Okp+YTNaa6mxFsprXyUfbHTtYBL/jhO3/FP9oJ2/7GAphhDodh4gTc+wdPVlYLB8FZXCeJH/im9JulkQs6auRJf7EPxTAAAAP7/8q/+HJRT9JaB+a/4WPl6USU/izyrh13syf4JOyAKeBf8gD8DoNNpTwhlDmKIihm/O9Ny44ruRlB7fX171gNPvrBxK+M+Quo1j96fgdtK7dobMqzALRbLC5c8ujroqWW5mt0KPF5iY1ecqAdIDIkEy9NntS+Tjrtt5emVO3q1TfbCt/tQxMKd9fvhj3QD5Bcq629jHi2EaN26r3bcskf5ItoGUBm5Yf6uw+pPPObztKjri/TIGu0j1DuJaMi9F1+j+7qDuFBQRGAije5sovS49o8Ea8Cu2EehrHnAXvnsUD4GutTIvdD/kBMaMH5/LSdCYLBmxcGuvJiquO6t5/EeCYOZUQ3sA5ab9kr8Gli+CfmpXSbeZnp2FrU7Qhh9hJOrnfa66OoDpk1PY/dFqGZCO3iR0WztmZsmGYwwNoX+g+el5RlxrxPtefJ+mrDSUxWiT6AiaUF/TtlhUrri217BN1v/F8oj0J2npjg9uUT6AAbcbdoTNlaoqdzh6yyPAnrWxoooSdrAmUbUhjnPGzC/W5xLytBWLwAMAQpEjiH5BAyekB6dVqKYEuoOzwU/wQoBKc+Fb569tLSXHppOUnHPSUMUhSjG0FkxH2eOubGOwtgMzxT/jJIYZlf8GGg4H/+jr2P/Hh1BLjX+CiwsasgXs8VXfRBEILP+AbSsG3EmywAPhwggn7IQCbTRh/2cbQUfwqUz/4MF8eWLCTP4DABFWC1DL+XCkmvjyQ00t1dYfGKFLc1X98jhvFVHzsggSahAJgfZqDBjxOP8aHQS3+Clx0/+GyoEDhAJBnDD+AwAQ5gzZFjf7/CoH/PB4HX/Aq/BjRJktL44ptjkVETEo8CHDnA3ct9WcBFH2P+Alol/eBqOnGTn/FiGhd/gor6aU/9oFfTUBA/ywCvpr/PBX03//4r6a//8V9Nf5CGxowF3h7xziURgZrBB2ANRF+7auGkwWbsVhC3NzAENKBH+q3Hd5eOtOzY68fNcdgboTAAI9OtgBJkfN0SVOFhlf7kBz1W3MWJ3PvKhJyPLcOBfY7F5OnCVeWz3JlCWzFsEoMGDHGGYkKa2ApEmjQsIDsNLoaJWUkrO3I+46FV+wOg4sMCKsEiM4zWdorvoHcqfVtshDJCMmgEpbPrWvU5nu0IH4hfDzlAYcn9SFJnET+AIB51wj2EKGrkwv0EKl/G0yYqXKic/yAs+xMGEu8oCWg361mOptzZsYaZv+30V/A4YsWHpVagZV3c/Q1mpaDHq9FAY+bEpWWOVRGEWVybaPVZLblW8nzj8myCglFYfPJj8IxQ7JjbvEw/4CRLt92TouHRXRThNszjXt9GPxu72EvsEN9F1XUw/qGwW5b5hTJwQrId1WLou+9O2gIwoVJFROLgHmZ+uoplFLzFzl4Qh3gHO2SV5cLp/VtYid/RnvLT40cgF2jq+aa+4to3k+YFLDU0ca11DojVr5yEqmzA44hKclRvZZRN9x1xmcA4JwUZURv47YIDsv9CFY9EAj38PZg+//D+YOzKvWNHah/gJP9Gmpnu1AB5RmjxzUdJcy4zUdt6WCW5DneZQaX2LNH6q7hPOV09iqc8rCJFR0eXH+AQC43vp1Xj/axWqT/JBuVP/Dm6VB/3tF/JyENRw0bLYEINdBSVJSAT5RaMnd/56JqPWiE/gXmEyTzbTZbI/E4Ve8Do9PCIUHoC71ry486gynxhOb5tYGpqZmxHRW7Zw+s+L08pK8tRX4u59GQP4DCkiH/QhuOqAhv8PYs1zf4+G7dWC+hagh0pTnq68R/FUX+GHVXvBvnRuC7/TkT5SWbR62D9+H704MbQKQQJAM2CcCr9TG0YuK4xFbCH5AM2zg13uv8lG6nQI09vPmscS/xWsAuAfO2HEk7PwMKuvPTBBQumK1VulSmNXJKEs/40JFHNf/DQb6Fhdx3pRP9PxFNk0LFAPQcYgXFraTDFd8VRrysQGB/CYA5/ApaBE/+FFDVv/Ux9Y2DtK1JYQa38AAAfwqBCfIKowXOEFE/hQt15DeOG/xYfWNgv+tD6xv+CQKf/fR9Y2Cf5YPrGz+KYKj+JQN8HM3vy/jx74Emfx/10SKOAf+oD6xv/aRIKOfxMDBosLTf68NQtrH+DY9uTAAFlAAE3/wQcm0z3+MokUcueXf+ZIBtP9KA+JgOi5IL/AYCcAf5OB8TP4fATiFpv9iEi3QO1wZP/vAkW6/x0J2son8KIVfyAt0ECTcIAS/CksGIv6V/ixKLPAUT/Dx1Vh/CaHv/tY6bzlRebT0ncXDXj35HxoyEQNVF53Qw/l5rjAv8rKVL3+MF/Ej+BD8q0QN3vW4GJL9mxRP6M+Kc3Nh5aYg1t/36OkGkitrWObrUP8dJO+SN/1JJ3y6P4e87On/nSTvkhG/8vehnR/s6Sd8kI4/l73s6f5A97P4cfogSEAkNQlf4DABJ6DEcjzv9Ug/kN+i1cdrBOa9w27oieHLXk1CbyWswkw+cIvouHM8Ja6sgcy1v4VGRc6CBJyEAmGKGoMNwkX+FxkWku3AQcL+XhkX/LB2Ht9rPzmlHgjoEEBh9ziY8DBpxlyIP8RYA//qQppI/izF4mAVU+BqeVdByVIpgKdG1s3i2S6/+D8A//gnzn3ZUUPNYUIyukG+nhKU46Exp5sZFKNdJ3Z4h28Rg/gDY3pODkJ6h+kLV64fMEbXfIpmAnA38Kzhf+AS8MHIQDH0FX/gMAEZf4DuzTdKQSXXqN7HcUMvO/ZxBDYxkKmdYckLIRcsQ1ggP4dD4AWEAkGa3P4DABOWDNp8OP7/jg/4ovi/8TK6tH8C4cXDP0vLp7eW00il/gI56eiebKFxhyJf4JULk1m5Ao6cQswJ/8IDjwLHvM8Z1MwRjJVTKQQNznWV6P4FbVz2EAltJ0v4DABNuCtVb/VCjrh84MW6RcD+FAIT5BBGC5wg+/wsMvosWP/iwZfRA+X/XCg63/4sW67f8vKOt/+CgnoL/NwUOQBgH+Vgxcj/MgYvQAP+shvntaJ1Zp0v47yPcu6EG7/PitPsA1laqEuN/hoUjoP4RC+/4rAH/4fBI/8tK3HH8PgkaQyVX+AkU4oi/yxHELv+OhD2n/Eg8gsL/w8vkm/+eg5JH/JR3tRXueB/iIGM1tn/HQwV0lf3t1g66mx2BZX4VJ5Fbf8vSWaWSgQwgofx5uQJFjL/4KBfoexQL9cT8rB85Hi77vDNiGwP87jhwwiIdXjRT3DzIPb13T/HBl3q2/3/4o/yT4ovX8OmjADQgEg0ZT/AYAIZQZpAbP9UWXZN+xJGn9eecdN6dnCY85n8JSAZdX812Xf+JlPBb+Bfbv0N7Ie9Th0m9WCLGRxgyfApU+gKFdAgSmc1J+jA+POCy9/hbnUt/4I51LNwPGK5WhGu8nKW/SasbMgN1m24IZN6EuJCen/CSDnBz/TYVPtyWVug71ZHJba8BxFogvPtlSQuPP62WUIbs58hWnzDl1mDwgEg2XzCASDZgUGYcUz/f4Pv/HhG5/Aonj/gIEf9UCvq4UzP7Dxhs5LN/8BHn9rj26qSc0kLb+Wafz6YxfXNI3+ME4VuW/gXOr/+IFyI/44PgSOf+/xfT+SfqVR/4E77egGoEYKTJCbuNLozzdUNYDMLwx0o8uxfQRkLSCUa0zkAmY/hzLWCIQCQpK2/gMAEv4MY/gH6D/v8Ya/kBGGugQtIQZwQJNTeX3PR4JiPg4Ipep03XOQoUwwBw9RHlXA79/hzRmCYQCQaJ6/gMAEg4M0rB/+/x6r+QdGboBP32KG7d4H7CDw9X1CIAvZ9levjjIOLKEx79swwjPpo1NCY3YSy0HajLZ4B/8KKu7f+CidAFZUv9oJ8kbGBOfgE3YtxQ73GjoTVHGnCYCYy8li4KYw4fwQEczQ/pgEf/PidAH/7iUIOEMeb/hZXd8ey3+H/Y2i13rDeDyanptnY3pnXbrLeCrtSOJ1GaFtKq9WkmFPG+VIBTP4+FHsGDwAqj5cS9Aj3d3IuzUhdSp6Xm/wEcXxLh/MH7Z2vDpvVSLwIrs+SFjbLtuPlXsOqDMTjWAbVdK7g26GtmQFB62RYVVHc/qvSd4mYKqgAqfPnLlymX16gVR3X0mBi9H2932BHu972W++zqi+T5etNOZHXQhBnUr6695JjiGQ89Q+OvGHQgwpq73XKw82irQuPtc5rWa2Yd8aNH+AgYXL+H/m0Vd0yf8BcqpT9S3tcoocA+vqKEWsnLTeJuD3O2mABVETZgofMsOp96OloWWEH2ZLLHwMStv5sLMojAr8GPcsf8OTKn5Q+c77SyYtvF71GpuVKrGvNCc8BeH3dcpfqvd4Y/d8A4JnCUf3smUa/7eIVVFqld0k98fRWFTjtK+cOXFyHCIRukoXkVUW3655q2Q8LkEKf6Qe/H9EoNJP76MZbkI0VoKUumeA7dcMunQaZFJn6TFJMy8tvCBcC3mPFXDjAzY/gUevFzyAQq8NiO5io6niAGKDUvIBrqnBOw+zESmhr3uhyMPfv8cKTezr/f55p/JPmFG/wIhidA3hCjzBv7M4y467LG+TtIaqURa805VNQlK3usx2/viX8WiQOVmhodqoTMTZ/xyMI2fv7G/Tn/ggJUwuv/GEQ6I/YPzXatHTxbjnvgVTHk+4RvV00sh9/tVS8vr+ZYzUi/8Bgkn+ykNUC6Br/byaoFnX3/wE1QKOSjsWtCmH+h9Xbz1y8ULYAXAIG0UOQ+SULgjlqVxNKFdXbWjCCGYdOQ6v9jQ7AsCi2BPOY9urh6dzh9r8Vssv9AFcxqUrv+L8mu2zIvAED7N4aZJgU4kF6cPgE8rJIwwYd9WP4b2fvOAyuleJfKM6QuHUojM9l/gIobwH85HFXdPdfKGb0/0/j8B/I6gJnypgRodtZZnayhVpBfMrVbLBCACZai6YWgwE9kqI+y4ANBLlJAlZHNaxWiaYdQnzRkfh11ECGAbRdF/AL/QCE1nG0YJPa0LT3fEsZAgN9u6rb/mnLdIuq+L30ElduVZqpsmjkVoQGGZS9ZQR9oyYQg4ZmWGR/+HkgZsyOWGpuwwz9/J6N/wmv/xyaoFR5K3nbz823cvwfkGew7LlIN/BEDwwQmhJIZXJiLwMn8DCUn+xE1QLoGv9vJqgXUb/9ATVAr+ilTZYnZRfT6HyONIpEDXS73igReQwisF1IX3kgo+iDgJJiyvKrCkW0p4YSc7h3DRtH10x+PONe31tP79EgaIv4Pw2STIn2hj4cauXbgwge86Tqdugfio7pKwHDaiuJyUwgqx0jxEGdGWQJujJvmlShvtofhVGlbP8BArdvfwKSlPEU+rHHK/5QIh0egfmQ58JXR57eZbObxzF6AcUuASxQf7NpIx1wX+naWdZJYpyIJ6XQK0tTL5CGUGmsEJJV8yiT/wgjcLMjW5/wGACPkFlJ/+Thsiv+KnzICteOvFrGIAAAwP4UAhPkMEf8FD7OX8EPmQiPz38LPmQX/GD5l/lQMRD/4IAp/99GGXQWAf68I87zj/PhiRf/DHLhX6K+UDAmIH8cPmQMJJ/dMneB/nwMD5CaT/wQ8CkEkH+1jE/P+DEHD/20l4HfyUFR/8GHN8f4nBY//fESC7uBEn+KkvA4/hMHv/xwRIL3ABwbsVPIjM/a/vTZ9xmq2Sw3jzcF1oC7vv7R8H6aq9VuF3FgfKavHGlz3ZZHQH6PCcbxOqeWfJx9mfa5s4uKC0rnJ/4KR7Pek64ANp/wEQ58fABcfwQA0n+ziAw7+HwuQBAO3/CBAYd/nhLwO/yUQGHAb3qlH+RZLwK7f8eJeBQv/kogMONP8VJeBXSAdv+QEvArkAe3/AUQGHBwEL/4KIDDisCNP8FI9o6qBMP8CgCLCBNX//iMav//8RtX/50I2r/4TLQ/4fjg7+HOcQV3kovh3KNPqQhttuyTsGhrkJuB9bp0TaZ6atgf+XDvaXRGBbkItNdClLbnk9Gy3Gn9INDtEAGxekg3uZjWqcmfxAF//x2B71X8AlgN0EZ0AxaDUSKRWyNHSObYoVRhIv5ZhzIlvXMb3MRPG8dflw91/t78MQHLfAioyby9ptL3DuRjfwJbuKH+EAFotpP8ZERkAuszWjSp8B/C2zmGwQJOwgEzTj1BgtjU3+Tj3Wb+f9nLqzVZWWrZC+w3CggtRVH/kFu6Gl/8/HjrN/kIEgCCv4DlyI/+A5c2YgzRnzv7/eHf4+vAP4DhPEnRI45HKOsGiPXVorQm4ZFKSwYQyIDWBViRvghsftqAXu/xwp0ldUIBtOsTfwGACF8EtHko5COZ8LnSzZVrOhR7FZmnUc9POP+Ai7KMClWDFnEBQoD4ComvEvZFLnUNoIwp04tgNHGb68hCr3sk3aoYmS36MP4FhzTMpZDU02Fkm+c6Gi2h1bGt57wTDWTPA9+3KpAbnt1Qrfj+Fpli4/gImWGZLvx0oSRj8PRAp5HpDqDX8auvahk3jQYabRYD+AlDFNgyG1pmBcHPEfClrTuBgnHGzLAYb6Y/DywpvizkWTmDYf8cFkC4b/tAsgXYwPTlzt21Rp4x9zpzYNy613w1L357dVErZonT3lB2dcjxf/XxZAv/nommVA+D+/3xXP00WQLvyRmfwVCDkdBmrX+Wi9SeYjND+CrEwHAk20/wsApFkSrm3r6fwSy91dhGNX+diyBeTSD5V/y0Jwudtsi/+EkQ+sQVPoz1v86HvTzTsrUMm/54BKmoxXUP87FkCPaYW8h/5YLIFz6Jkb7b/zsWQLsYcMmv8EorWcS4El/kaLIF0f7dLr+CiqYNVP8gKgphBN2QjM/4CUdlku9WZ6gbvZ7+5udaPUxv2ETjYVJxcbF8wD8TVcXdAazqv6FmsFXNJYYIcmuae1faplfpc9djZEPzprb7monrOEI8yI1IC1pGiEEcmMaUF9mQnB+uxWNj5ijZxWSlI2/AlNHukFzMPDxOJAvaUP6bwAy8pdc7Y7bpe7u5WVhw4Ir8tGrRDPZhu2D+2WPC8XubUjjLUiBXpraN+S+eF67j5QHPWIG4w3/qPtror0zundxbuv44A5ir5lA6m/yAg5zyT2+Mf1Dlh6CmOhnUbB6VCY/fS34WG61API5Met+YwjtaoiLZ1OpfYrEmoB/WFJjhU0SywOcYdi4kpK/dWMVSxKngHKF8lycZHCJeAqt/gIGz8skuKm5T31f8BLdO6M3/lp9SoeCgabIr7mx3xohS3zGoXTJlFGRZSsc3jXLWR5qdZglb2PjPFZmkUsXgZOPaTdRSNxp6ZL12LoglFeKJGrVtGI0IASAgLAOWqWdmnkl+tgY4YUQWt9tmndvzVwKIP48JqzzYE+eQZLxQO4DYIJioCpHTsS4HImT/IIpAUIBP6vwQYBYzZ/io9RMkIq2ErlQuQQkYGMm/z4b6zgSB/CYEH/FUKHG17z8Jw0QgkpVl9oLrw9QSmNKndgMsyPirKYBeodAqcyX/Fg1h3+LQOuQLjRWhF//fgaw6GM7C5rTS4IX+bIcO0CP8EDS4rW2uf+xjw8f/jR6oZ/AoZN/149UM/x8Yixv8HUVS5A3hp4/xC4ffwGe1HRb5H/70GOT9DR1y++A/AM+U1JPd75Y3iQcMjPYwT/RVnvs8fBIui2j+xlmf4IFw29s+/8JE+X//SEoyBJjgGrMJZ7rGP7q3KIqlXZTgj1BOmO+2ju+ETQJv6kSP/KyiqfwEeQlvUOU+tLrbvuAZkkIer+V3R/nRKMgi6q/BQ//HQp5H/C+OXL/4IBOgfP4wYH9/jgvCRf/7/3wP9PB2uHQneUo+1CfvD9CdICjo793TnLsPmjHFiqtdP8ARGt0lQ4tNQKr/Cm/N/gxlrPAuEAkEuA/4DABAyDVP2QlFpeEMaitVF7PPkZ9r0oYoKB+aeAP8KMtZ/8dX/P+WCyFr+H+S7+GUZtU1T1w88wtMPHT8gpaMRHlvKfUzJiPPysLsmLxFC0ib4u/hNFv+QoDv8EK71f8Bk3YXN93+MCIiP+b3rn+E0hv+KwB//MhEQB/wYtayQD53O/Vvgnmss1itK6JawLa+P+cAhlt7J0eMRF5VjD+XqRmA/z0d0tA0B/HwH//zgezL/iYGD/t97ZTUB9f8VG+3n8Tg9/+EB7MvAAOIwDJeA/wEdwwBtJL9dDmtDMbj6vA1zfleoWP+6GONof0ZVvmQc9ZULRhzG60gPlN9BujPLPw+Sk6b2azYVr2PspKhWDn8CvbK7oN18lD6twPwAYH8Dg0r4cSV8JVvgm9XYchNlsJPmg+w7e/4IEmhfh/wUkz9An+CkmfoLAgDq/iJ4BFf8uJM/R//Ajy0sAGAsf+Gkmfp4/wgkz9PgGARX+KDfmMBjgIBjv8FJMwf/kIetRb9yoVT/8/wQk0LYH+CEmhYVBQDg/xsk0LdHPko3zOBf4FARv8LG+ZwF/g5JoW/gN6qCAELAOoAACAAQ8BFQBc/gN6zNgE//wUk0GswC8fwK3gvwDHAHV/Ab2C0QDS/wIIdgNKA1YAAf4YSaH9YB3/4FANf8ED/CIGAAAA/gEC/wK9rkwEZfwK90mQEd//OHrKyqGgwEI/18PWUb7m3ygTVHGwu1TvQkVZaOhEPiEx8w0Lk7B85V/0dGNRQD/7AaQZ/zEPZV/xiQR/yCC//64Nbrf4+Ae6psjo77KyfBnJxoW42JMeIDR0rqEjtmmcWPopiBevc1/4JrtFo9K6TN16BzOx+6wQTJ4aRrQyFxbJLGXSWn+E41CmItBy/xwajAq3+FjUYHC15T/EyTNR/g41GBtkfnd8lH3bNFO/7o2fRF9M3ZbfRZ7XkAIi+w1mWZ/kn/30ajA1P+PjUcXG5xt2oeA4lmLYABVHw1ODrkK4BtS87yno6cpTNfRT6NineHKbXZ/CrG30cGBsGT/Nwhx50v3W02VSQGqZKWA7RMP9rHj4uHxpDavL6BVFsA6zew8i4F1Pyt/nohsVEImEMD+GzWIMhAJBLZH+AwASag1VdV5QIGSt06KOYtoi0HFBizDpVNGXh/T+cjWL/Hyv01/jAELP6F4CZj2wMnFB09ce91WG8FR6ZTCWG2v4Y4rmrv6FrTq3JDxrl/QeEGmfNiNh65JSgjhzRqJsP/BhnYSUf4YM7CdP/FRykcCxBlCl+BHv+vjOwn+BgGb/3RnYSQgv8WOrB8ZQiriAs9Nc8fwlqqH7pxJ3+NE4FgFlCD5huEj/jlgmDKkbOCEa89PGdhP8JhPuWROXmRrLeynYiYCGkQZjfNyS4ogkcwixo+zpb8zhVR38OMk+B+yxrGbVX93BZ8wFt7nHKw9m5NPV/g5ZjpNoQCdfx+/gMAEh4L96P8nL7mGSXRe+8+Cg87LRyVhVozO4tf8AlSwh/DG9YYkG66u9ah/wL4EsKdnyWMCTwHjlYhcFwCg/XU8sLOdwWOUcW+VO4zMy9Jhv+OBHvQ/wIX+c/3p4Eka4ZutFRyLnx/quM9brvDQ7lAl58x5PfGTfZfwL7MtYnGhCF+6lECCC0py6H7vcipf4CVPx8AppgQ5fWJ7eNCXjCw/xUufGuP+Cj8cHB/ggI9jIKUB3/Hy58d/GvOiqb0opkth/AvyO3zrb5gGBWv8bfU890cZKqFITijZ8DtauLixQHVnL8rGyYn8AbNS0c13zOLURWctI5a5BrZgk4+3/gpV50eYQDvwPF/gMAEzoMBBpz/CcdlSfxVmAW/OMx0kNTwXAfwL95uYGLtx1M730HZa9IS3qzSbXMH+Ah04pJklb3LszJy4AtuXs6x54neVCMnzyuLcjh6iHNYpX4ew+/wcVvCp3P/tBTynZAE16Fr1fN6O9kyZ6rS+wRjHuXzksafST22U45ye8AZNhL/YSW9kAc//jrH7/iwIcHNFX8FlDp3/kA0egBnK0vfApSrRpfOaRRcKVhftKojihyoCQIfuotBqIBeUzADQtS5AZ6wwqeGOUGbGLR/6wGBOK8xYICZDwvuETRFMrHmV+yDL1cGJ7EmTnc7UVXtIoYZES+WBYBO3LbN2is4EPKEfehQooGlMz8KWgC1X2MDUqerH49/1JWnOmllM64PKJdXCUBcnOrhG6Gdq2zMjq4Y9uxSC9IJtV1waSTxunkbAhMTYp1/VF0gvlepi/wygSuTCsEV0YS7Wq10z07iqf4fizWgj4BKlyCAcfFOqiKZgVncaFieT6k1UOfQNmaTHTd5PHeP3pb0NQjahoJLgcBAEr+AOu1LWSphCnun9f4BRq+OLt8hBlAVxdJ5RyPqnhqZFGpUE8SW5zqyjnYLV92zAqfu1ZVcTAtvNtP/Ao5aCdy7fVMQv8BJTDQl3xgoJj/HVeIxgPKaAb9DgKsFkTf+ARFr0HVwzIYKdiZyvDoq+2FPVck9xFs+IQ8L9fT3vtCRKZ5eHbBSZhEW51pK6RFjAmAyewRvQAy4ut/4IV2r+h38AKjvlqFR7MK4N6+hNSpXRaLAlzlLN+7TP6kUZTDVRM+KFSSiDU2pp3/wEdse87v7lBYKFsI2v+EBsOz9/2kRbszP+xiqNr//yS+l/xRUEXtlX+ChGBEjB9gf9/A43B3QZ78bWZW+9g6nk8ITAki2G/5fesVkN6v/g34P9FLCpWEWhXvNcVhNSErnB/G1jH1z+vO5Gp8iTYhEJENpHkrra7bO0oDlNdPz0uZwIMwh9G7ZHGcPkj8kb5EOMWNxzSWxnaFKJiZtyl6hgq4CG44Gms2KxUKVMlV0Vspjx4ydyEdVK++Z6WjJ+vqlVkH+GFBoUWrlqWkeblUf4CXSmvkRubPhfGXF7qemM5wPwemdPjvH5R+qoRTybnBHkFO/J/yfcsq/yAnSttulLVGcZHNfMZip1c8yB7ZJcVw68IkJvlPXki3CYJ+0xjJHWsbstb6OvmdXJ3YsxfsvmOyMkvKWw5YX2VjoNMaHgt4eDFtr3bF9oXEqIAXHc6qN4F2gKbSzB2aH1+qk0YfwDEgodpPkVkXSeTPXK2sujRoYzuNDGjRmjf4D28naZrKk96LRB97ylOySxxgNP3OPx4Ae+oB11qrCD/4CRzH2UftHEkqsnN4a0I7Migg4t1wpKJnbdelsroSaP6xF/KjRkJwnA/hQnl+QnQguHCHr+CieYM4ln+F3Qr+8O8T/Rx6tN/BZXP/xhPfT/0kIsln9BudX8Tgsf98d4n8Jg9/+OCHBe4AN+b81TcOD8hSqFUfL8LAe1bnbe1QQopychTTesSbHyy0F38ELAtNTGQFFUYnVmVwR8t7TD72fOekNV/wKmSUWLSz+Bu8ST6I0vj74ALj+CAaP/BxDg18AP/AiCEbsBgMA8P8EEODnkAEB/CkpF/EPeLK/5MT31Oi4bogDAb/+BHWjg/ijvE4PORe8Tg/g/vE6ADs/gV0ZcgGBAYH+A3RC2Abz+BXOy8AG//v/vE//8Q1L/zP3ifwmlAf5AOfS7qP+AHoWJ1VHsw9u5i0183bFoRcSbGn0YZ8aunxt8x/pfvEX2NhF6pxpAEwGmGYOLeijSJCjuEs1R/D/zSJjxpr/H/IBgdn/D4E7/wzVGTH/HgXFj/Hyn4v/GIM0DIH+QirPca//YR9/fyJgebX/PBRvq/Mh//CzoHY/wJ5BhsIBIV1BfwGACRcGCKtFSn+JgjmCAuIT+Yqmb+Iu+L+A+up72x6EaIGJfu8DQmwgJibaWYLRBw2f2LyfHBmc+Dm7J+n/xYOoVef8CzfH1/wUuWCXvbfwsaiiOJ/4QdQq4P8rFED38WqzqV3XPllU5/jwwnX0/5cHIbhuiP59NpQ2g/68Quuf5gHTzrgboj/uQ6hUFwn+QB1Cq4boj/Sw6hV/D4QH/qIdQqCoTz0AOoVfw2CBiRvDSaqt5P8MxsDNQ5Y/1d///h1Cr/SxEFvf5mIelf8gCpG4CAAB//gJQgX/gQAB/jIs5GP8GEchkBfwYAB/6SFbsgb/XiAAt/j4vt6gP89I9WAP/sIdLiv4hEp/9gDpdX8Jjb/+Kh0CoB/08oZCfyCCflw2Hf8mB1Cr+FQT8F/08RkEf2wCfmP1+pET9CTT70J0zdwfaLE5PrlU/6UL7sv4oBP/TpuhX+swT//Ey/5v/pIv4S/kwCfxH+kwn+TZ8VhkV+dBir+AYFZXsY4w3sYrhIj+cgT9JNM+4qT+GiaOiF8UKn/5eDn/7/x0P//EPPn+aFN01q4YRKZx4Hyxcc1n0SPB5wKEJNoCk3TNgE32EFK5pr0QeSLq7kExlgBkxoSBTl6jvDOJ3Rsf7IbkCh4isdYDwjz/CALsp/4mDH+PCDAa9shAM5ElqDAZoolGjYwx8xA90a4UDZ5IHqm+y31P2MIcOKIcCZ8QA/wkkhNdEBab2L7FpCBpXJz2TLW7NB7gIbURXmzQp7ZkOtTquyYhd/jon/47D+CqaIKSxD//xL3V/G1AmKRQEF+J3/DyTrtfy5Zey6AWn//lF+//YBFOlLoFp/f3oM4gQ9/lJRfv/hyAb6ADfCOoN0BT2Fy1c+a6c5t0XSbCMmNUwgKb5m3K780Jy5TwHM3fOL7HXjS40zOVvBkeO+LCLP3MYVNvU7sTCVA0WKY/4GgG6/olbgh+ADol0C0/mqAV/yARSC8wMBgHV/AkAi9n+KiOsD/JRL3V/EsArLnaR/lYl7p5/8eEvdIj/EsAKl3+KiXuniAOb/IBL3TvAPP/AsAqEAQj/AsAqSgRd/AL6BJ4Et/wKAIrYEyf/+OYFP//Dv0H//h36D+DCpX0sRJDXnyQe7Bm+LL3s+wn4oB7m8r8/EhQfHINPBplDFYOC9F28f/FIHEXfbO7fYxqmW16G43QZgzmzUREEt/Ycc88jJ5T/RDD75HbnteTtPDdev206nAFlTGdxwcmSC4R+i8Mi2KQ6F/w31cB0IBIJeRfwGACFcGqB/7KSzsd/LRxFGnLe//gJGb3oRMnKjnzxyv/kgJA3/igKG6t3ACoNAoE93IyJ2OS+SfB1goVDw+O2CApP/Qh190Akn+OiJf0Ehv/HxVN5dPcCI/XYzGa6JLzWNZ87fpk5V4DOAPKYB3IBysq8xDec/ZdUSbfrStFOqgH8gXf8JFLDu1Lqs3AtsQKXjUW4Sd/lBA9S6Jbhn+vEEzZInXwINQaqUcWXVHjCQ3DjY4DyPCL+Qs6bZn+G/MkPhAJBK5z+AwASlg1cK/wBGg7/+JmmJb/rAJRZNP9aWnqUayfnKNwfMN/NvRPbHkISa4tlJrPkRDC/oz76/46V9ykf/qSvuV1f78DQEnf/R4r7ldX+iFeevonmVQpQAAqKoYrNuHE8eX2alCAwHiSMXwpsgUTW2k/XWeP4b7XSsA//YjPsb/weawRS1ikVrEvTOh8eCPLhzyX/x5OryMDtfwL520VeX2Pi1dUik8u9iYmNi3fEwpUKNxrMAiOdBjNz3hHJ2WRVuM0ZFbLFck1I8SmGvpCfDFS9X+DktsOz/zUlth/BJuUiNI3hV8QUgoPpi81ErMDB5/wrQNwcI/2Bo6teB/A4R7TUyTya8lToTAEkoPM8wR7hSvUb2jPmLI45bvFci65p2t/w2O2iH8BIq5hfwGKw5cGrdWVKcUrvr4NQkkTIlp/B6agSsKJkmkDfzkO2/4+W/Xf8VBCe38B/VNWoE4R6S+zBwhkpR9ZqjWwyyWFY4bojWPURIjzA64gamv/iYEaZ+G2CCHqEAn/IWaDAT32lP8TEf3v+gCYDNNat5j2fH+CU3WxQngPtm8r7ndx02UwwtV3l8G+4YNw15EWYG+WRurENS+/xkpoGwz/AlKP//wYoO/yEQaWf4SBzCLX14izFuvMZOJ2S3LtPVuDWjrHU8PYSkIZluw1wDFO/gDnx22Ae4KCq2JuYSgFSUMNzwAvJgiMCv4I5RMb+aeUSi/2QTOPAP9/8on/DguIfZnLhaP+NFxD7KQPsAFehf6IF7vm+oj3j0Dj5xlaCq82mQtwFBO1ImkmsPlD2ukwJg9TFXwR+L8TMROSOhUUzw/G9/ZP4xoSrEogFmNEaOs2tE0p5W0GPKDIMs6DTQastvXRIEfWEa27ZBDKSi2tT/IBusIFFiRKTD818XlNH41wapL6mItdPqyuZ36Wt27ZtfEn6WIq/yG2UdbX4OUua7lQGx8MFnSpqNNTcHkfLfWBsI+qXHGUibqs6a9+jBC4kmE4Opa+X6u5O0v+QCwNjAUoOcHIsahJLVHFuj7/PASLYINVgPbnWAepqU7056M2giZlNkPgiZgju4hIBgM4wC3hMgNskWuwZG+I5DToVKQiy1XkjR8o8OR0Wrhq3RehY1Lm2HV+JwunrKSx13+0tp1E9Rj+JBQ3OcZl5EgfCOOn/DOVyAdXM/UO2gSH0NTgATa5LbhGqVnsenysc1MBsndhvPHHItORqKquuXch4zVpdijjXpho2NwOzbbWSgnz19IN+5cPCgFHpElol/gJHjRqAMnJ8D/EwhcP8g4YIUIBOBpJwYQPhMpsZ2d00CdR7KDUrPtH2iQ3kZOOhMAXINyYLVz0n8fYm/+fkn17g/z4hZS0H+VmHbb/EzUpxz6WEM0f4y+I4QrZ7Vw/isCD/zAIaKubAI/w7Zc+38PiDF/BkEb9siRcsxvsh+eXJWljEVJD6CS3N/8egL/8QZY0g02LaSvDZvqr2FRTnkjZZwXf54gomleWHzxE/znNpAx1IpM2QR+Hmmv3bFrQOz2nmGiODs0VHuuUvPOk6TBBOJBjf8jfRH8Pgff84hH4AiVluECv+Vh05f+EgKMG/loA/wb+Xm38Ehp5Q/ygdnafwmKD/z+4f9JB0cH8VgodiSu57Ke19ru1Un8k2uABVGRF3/5QEVA/4fAO/5yBv0TLP3X4f4rCQ/4vDY/4Jg1Lv1yCga4Fh3GTpy6O6RheeJ05uoUANU8XN9MKSPte1YbH+G1oMRhAJBLNX+AwATeg1R/R+UPOA8gg4+Fa/6arObl6qmFyaDDzz+rFHK/OxAIcFnruc8L/8BKV/PDKWcUTvE9BYdGaEvXfILxyIeJpdP0nnaiXKYAlmqXvB8Vm+VPnUylhX+Cikb2CBOuEA7D6vT+AwATGgn/R/xUZg5BK8KXzD+bn+Ijr1D+EwFiOYJdhHT/AS4F1j9LHGX3AHjoHczF4bP3VD9ZBwCAk7fVY+f4Cgc+nDm22dbzUSOVRZ+730gQzaYxuWTVqXEmc3j0IsDil3WD/GRS08W/8KKWngD/XBS08KggABgwQCAIOEgX+OZJUCBf8dJrgsUDPIB/HoA/EZP8OAP4RZ2n8OgD90+88eAD4Tdfn+Oilz5e/yAq84AbQabiccIwWfD0cZvEomywKIwSAEl0Ftlv52micKNhPf4CbEyK4LwcxevRt2W6sLeLyGPFdFsbB559R6zd0TYOesyDCnCX5KzPt16x/P/U+SVurtKhlSh386yyILTzQuDLsaHzrVNb042EEax3mAyx+jhxM6hvEC+GXIFZij9gJvQL/4fZnwQe/B75MVXBJC9Q5174bDNAKa72cFJrdbWhiTCrc/bRpNJ6B17FgibI1vNqrmzc/yzw2Mg/yuc/VrqzOT/KO8D15f5OW9PuxiAnzsRGn1tuuJaCcxdy12JtXXcsM+eVrjZ5xhmNhdcDOJJrmHgVonflJ+TJq6ZRH5Ch9Hdl67fLHv8FAseeL1LMs7ADArpGqMlQ08g+rTDsItIJ/gpN3BCP4JEd4MB0DmRP4TAKoC5AuTprlxTAf9cEovgFAf78MAl/9fHhtrB/FPJ8fxf/aAX3Qf5QURHf8yA96v+gBjqf/DxSFT/CYUt8x3NOfwGB/2mTz/4CLa0IeArNqimhlvlRCfNmNiTou/wbPLr/V9vxJeKn+HIbX4XvhcgucIlGPAwZooKhtSsH3FaZbWJSKB9rjKwmAoCkGzqFetiVIuGm25y7wt4QLUc04x+SZFpMZdC/KHh++KAotb9gf94/iQmqtx1AqSFkyVjlPWVNwiCK3S615UYp5nrMzOPDBogr8V456385/Kn3NK5ov/BBTJgnwgE/q+z/AYAJz/4yB7Ng0iaQz0S/8GAoaFlTgy1XT3OKlqFjnwwoQKXZ42ha25HPqBe39jby60RbuI/w2//CX8BkvX1CASCWgcGq7nTKLj0xP2Eu8Flx1HcePWA8D5kbnYt/1YxIZyQfz94hum+FgBJHIB4F9ZzMEUFbmUc3827AMG6IllB4v+KgaBouggh7hAJ4leGDC+yXf8YKhof/hAaBr+LHmP/PxLEv/CbMl/vwGga/ywhDrH+pCWlf+fqxP+JAYP+m6xPk/kKsTnQJB/hWsT/icHv88IA0DWgAcSfEQHxLSxrBPq4hPporx+s2FJ8wNQzYEYBDKoGMuM98o1ST6J/gJnaqTgyf0cHFdO3AvkHaqkULQCs1/dNOGJuNfnf/wYDQNWOgo9xtlVLx8AGh/AoNH/g4JU6f5hrGf4fC5f8IAll/D/ggEsv/xQS32f5KBLL/8VANCReAoB/kgmYV0FLCEySg1gMB2f4ErNvr/IibjMFfwCVd/1/DFZsafxLWbKP8CgI3+TAS0wFAAAB0AHW/gWs89YB2f4FrJAECCv4FrP/8EBDiP8B1n6QBKP8CgDTkBPx/Atai/AUP/b+nSPuwyRaH8v6dLvlVfIMU0H3+l2mZ6Ed9n1JYaTFWSIdtu9diGeEel5/QD/f+zT/rwGg6/h8Ef/5b2af4/Ae68/LTLDNM6ne94BnKJi0LSuO/8BDGr/yYl2n/4CQhxmqBTrqsd/4J1yNKjSPhi9O5SaOf9JBW1EHGBWhmw/iAeP11Nw85X+VLPr7/HCQ5x0f/sN9KISBcJYXPOKrSnGuCRNvdYEr0zwXf89Ke+B0OfwLaD6ySM8zBq5UOaI0KfFaIvPSCJcJwFXelruSoRK6arITpUoE9oBTVQoP+3csPML2HI670X+AluSYJ/wcL3oqP+kEo5cJ84esKPH/HxQNsoHxJZLinRmIUw9G37LSRp5ez75J4QYh1XZzj5QsZ7TuiDoUBHGvrRTmD2H171ufsGG1oa+hALHDhgpYUAr1dn8Ptio5/jpTwSU//2FGFv+YDYW7/IAzQ/rQr/AtkBwPSu1bZf1fVUxzyeT1/yN++kgZAj7yE+MDnJhfXzhwVl/4uE8dr3/Qi1acCJv8PXW/5/D91u5cvnLGcYe/Zfx2HyTB3yHfa2E0cysV6VjeplYyUuJj87G/cE1JMRnGGHpeprcqVhAXI1lQ7ih23IAYyjpSQ+HT/lAz/AujkGRZitFrNmUj0RyheIi8G3R/yYC34DgpPY1TwiBsYw28/xwuFAo0IBMzPyfwGACdcGEc1J/nZcKB//4Ho6P+wFwoFtzOAPy6MOcI+gXaf8xdG1Q3fnq3jJu58eFgRNpzhQuGU/h4Ff4f59BX5Q4H8ilR27E4LVMOC7ixnqPeMWFj0+bL1DgqBZnaVPQxbX/38QXfk+EGfwBwRW6cw3uvEZVmkh3Mpy845OfxAVPjRz9z+A+hRE+E2ZcZ3gOR6vccXbiwvh6f4CJQlwb/AStio8al41SFhdlkRF5ZMVFfpv7NbDYkkBYvInpoMYaVC2bN6iszo87fiEkLX8KkMKpYNbNbp+0fSRaU4VqzAM4LDEN/gIsPh5HZgv+SD91h4ssGVYc0WOsbJLH3Zf4BAWxrZib2UJpq0WgcUwt/gOJ1g4ejFsykHBmE1g6kILmz0k4ZiIidhKXHglYjbCzc8n7u5zzIWNNeFs5rk16VzPWHse7WaZ0wZgXqdkYWphbxzsHn/dLJ4XEpqov0TjKPOaCQSmauy79j5Ja1+5B5/HMIFVEoxLKQweyx6+sPNEQBqurv4vffyVMGs2Fw7begE7zPx9nQB0hB42Adh4U2hurYjryE6fk/xhiEinN5SGsAKB8trmQCgxnZpv9CAaCKHmVBBoqk2eVqrlThJK0LgEHxcoEGntI46ChL71K1BnCFhObiQG/gDmOUSASOxv/AQlPmAtcn4q1PS2vnBacaMM4eemmoz88gqkqXbri4RlXD2ssjVvvvp9/ANEq4BgrC9mTaLrl8xbbOjYEV/a3N/TEVugJUZsE9N0W4OrAlBRE8PkvifcHfUJpzzzkFgCbCALRO6RAq/VQbyWEy2yKBPmZ2h3yM56Cp8p3G+a1CoJDQp5yDfJmXLuz+nKF/gSxMUBoeRjugaEk/qUF/Pp5xwOCnsARC/sVdRunaER1A/wB7sFNMqoB7azxRdZBVBFYAErHLxMEgjdJuX1bqQUW1HS+yDq09THZP0rawJzYDS0ZgTMdG9vUmeb7xsKHLzOfaJMIjmOHCUskVp3lA5APy03JkLi7+NECwrlzkBx68+JtlIVBeqx2iW+CErpLR+PJQwnmClmPdf0gy6k81pfwB1cl889JT8OW01XqeLWNgsclhI+UbjOewm4h+L6Jt4VXaSxOBaVpcVzYekjpc951nREJzhALkqVHQc/hVStMVky9ylXp6h4GDbDr6yZydwFMWws0V6lhaiD3RTvC2uH5sPAkO7Eo0IiEdGiwTWaqUGXZjV4r6ej2hu4bOL68QZtvgbdsdBPProAy+H8N18hq4vRFQHYAZwgHHAFLEWL9+eYUU+ybtKJxyMDMpCxc39exhUfyZrrXoXwkd6y+2LiFjMoCTbFZzcDsEjhdeleeqEs7ScCWFj+p96Jmun+AmgIFuBLFOUa9BArJRXRyfNr1aMzJJZDXU7wrfhs2bDfTdf4A7s1XwV2tYo1prKuvUGy4tMl0ePRajzHSkbPp1m3lcrK1AXkpOa6VUvrFdMTQFbGmwEVvY45HNSKahHqrLanmN1G9EAB1Ns0K60pqiI0HKa9Jk7/TBiizYucRP+AauYNteCKeu68yJoPoFyZA4BAmAkxDlDMpZLqFgY9hlZTkIcZH8Ky6OqRC17ug7kskMlFQg/ZsIbqcPt1bOxBMI1I0qtd7OjNwG4mFPEz5y/ZoTSSEnFb7Dv8BMzxLz/gIsMDoBjMbmqWfEz/ATEFJ4daIMUF2gik247lEk3syzuPBUstzGKouw4xCLaMLxdl+NoRIQZQCwjNIVEfiVUvyKrew00LNbkvrK+LnOiH76cYSwoNIYV/CtIzZz+FluwDR3qj4H2eC9GL4E1PxXv2Owgp05Il4CEIO0oS6JDdZFlIewKNnBza9NRaC7ru5h/pLYmRfyoiCUPO38bbXAc1hgE2t6bNrbijLbL5hGOKgYVwuOtxBRZ3Oyj/gIouVGS4PIFIEhX+FMIMskzcfmp7/AQuVbo8wFOGqjN1s+QS5HkFxWEqkk74p9a8jXkrmZJ/F4CPEwH68XNb0AOd5AybbvQnmgnyyFJJ7sN5A9Wn8JwPdWMGMiuTcHmpurDuugrpVvSrA7jdBaSEY9a1tOPEvHt6au8obw9I6A/Aw5WP5PMgrHI/44B0KxP+Dt2H/EzZ+x/VGyw/wH1Ac6IRcM4RCBuuBpiRh38AigoA9d96pFMrcAp0JgJm2TyCsk1/jpWiaSP+pK0TXX/D28D2f88DKp5H+Xseh6/9+BlU8r/L3ED2fyBxA2DAyoD/yWkQH4Ah32hgSS35HhQsgXx/AbvzA8UIBKo2N/wGACEn+fBNtJ2/js3zAX14QAk2e4KNYGSCRkYewqhk30XmL+CrJGsE+f2jwbY5i9291ROjnDdaXsKLvyu2eICB76eM3pmTtCfrkInV5B4Iup5r/JCSn50d/fnIxuwQCVS2Qc4a4hQER2ZlGHJ3Ba8KDkH1Lz/bxxI38KXvoDWIcIBOKeiQYHC1v8LXvoAwn8hXvv8Wdlv8ZU/v8v2XrJdBDt/jBerRCmGUWk0+VhU/1sJ5J/wTsIebkyVWrbTte/tfMkuF8lzU9hf4CTYbLz506gvWsilEFjPlek/h32CSf6p9gu3+fvYKX/l72C43/VQZ9R/A3Y8Kfwd2PAxHYKVfcf4CRwbaEZOzSYirWwbBZTOH+crAj+H/Tr/GQPqL9/ugsTapH4GBQWQmjvuhMUvAlF9G17aBFOY2+A+F3c8j/44SV/+2EA6n22/4DAATk/yoiWAKohv7lqsLs5SsgLaymckhvF0WH+OFN2gYFgKz7Yo/RZNhuSj8RLK74DSx64ojq+vK/YRJFFjTRhAwNDQsCVoyeG0tbxEvfB6Zur6ulF9Ilj4P8FLrU4NCASoiHp/AYAJ+wYCxuP+TlGtL/HzBA7/CDyAQlmh/Ai/N0gLJWJEYeJJm59ynFH+AluZmpuM4KcXTZ5O35vhzVTaYD2hue8J9sry/pW+9GEIE9FpVKValvKoL/gjpdM/5p6U2u/2EOnu//+W5qv+bFQvhY/fz/0uuJDN1GMFpHbZaUwfNJJyRDYRIoY8CdT7ekUOvMh+SPtO1sz37MMDTbwBQO8xCCm+wxbQIj5DN7AZG15mJvYCbQORl4kZY1T10PjMkio8ixch8xD5Uu0LBEJ3awB9YpgvYcjH5iTKXxENQ8hcPcfGjM7UwxUY7nUK/4CVU/ZXFvy3NvQ2VmNd9e53lEgwxf3RAYVpHKV55+wEZV6kSW71QWL3w38yv5Hfx9iQmzwzjmcWQ2juKhQJ8QTi5YAdgAReznZZMj+IAN9RFKR60UuRLvTkZ610JxwsJTMW1JlS2+8XqMMYPl6dgP1apNXmsYe6lpwEq2mHzZA0ROMLO8CwYkbMnI/wEf3XZ1UoBY3r1WBElpBo4ZlJ15LStOSbSyQI2ePq4bO+KQ+ErKD6u96XfOxBNhRuQ1fwARYUC1tnRKZlZzL4ezTkbDSP9BfhM3Mw435H+Alx2fIIFM6lnZIa6XXhHXrNCrrlFsh08gyC+CjEncAsrS1USR9lUFOenvq93K25zFxydylAVTBADeodSjcD2Tk6pJCKP2U1yfVKKWlGcnSsnjwL5A7j/AUK7TMqH9wgEyogJBgVnVf4qJd+x76deQioj62uQOEoHxzkP/Pyp9W/kTmAa1y7lwOFeFgNWhDU8clPuHc05qiD86VsI4BNLz/xYXBJ/hME6/xgfru/wMCTf78HIOP9AAuCT/j4cg4/isEf/h8E//0sGhRD3yzsREhv0Z/jIzh4D2Ez7Ywyf89IIl/QD/w6+8BhB/r4W5J/gTX86QkoYdT1Mj7RO0ERfZ2wH5xuvRWYJEBUiuE0035gXEC4U/+HCHz7oQDqE7S/gMAE2/8qIInEu5mDzJnK5rigs1zWhJ6kg7LhDfw4Q+hS6WJ+AzxTkk6uyqF4NasCDYgDbg2+K/w0sP4OARStvW65Nc/8CgZoqu/4CQc3NjJkLHozdD6pKHZx+rhI5bbDBWrwCKlBC2p4708pgndR0+jrkkCWRKiMkf+1kEC3zr+sC/wggNq4DIDnmeQsxJcpzoMIAENdhLyhMdQzIFr3Zwf4UBZE+kaikafExw40+Lk+hkAOHXoXnLYhWX2TBm8WNGVAuOiiPT/hN8i+QgTggu+hAKvI3qDAk1/xUOa8wv+CCHs65B/8EI57AyZ0cyZApK5+Awk+W9ykMFbkVAQku8jp8pNhXlfI6siv8l27X+YA/1AABlD//Mjnsf9yRz2P8REQmv/pEc9gAaE7h/2gqsV/6aIjNQCgpLxdv9eKrFYGyf54Qbz/4SAo/8fI3in8YyeQMnf66QS88CRP9BKqhYAA/yWCh/xkCRrH/PFVQvHAZ3JVoDbAAD/wUg3n/4mZ8Xv8UG0SP8KE6X8JgHv8KhDATiUUmMAwsgH+MG1pSqj9MX0X2guneE8jP96J/3UwZnywH+FAmMBlkABlj/UgzKJ/g5Nnu6R4t5D6lwfG5krib+urrgmvnFp/QQ+uEtxVEOeF/Q2z9kT/HCt3Fd/7SCKVgMg+tPIwyEvbW0NjV7wne49BSUPCejCbClmDoGMrtn1E3/ZCoQQ//+Q21f+IIbarpomL3/hY2LxY/8gHR1khqjMLXIk9fHaPrmWgP+aPmff+AtygWtWPpA5FvmIhukn5S1pY3GfYG+HxoKeZ2BXC8Leg9hqj7biXj9414bHTDpCRoknWC8iGACgvxZIMIZ/FlHHnhkRLNaF1XvFnZXoUHz7oq5Yv0MLZ70CpaKy2xwe2xzFsDS+5I7xnQMz0mdsM6g+RMXucr8zs9QhrawGyftqmAXPvKljx+qfuwccH8P0SyiKc1pTKptbpDd7apz0ln+AgjzkkS7pq+ZaKkaOqsBex/h88zHAq6gmEz6Z/v/okMgoRQVyw5FHhb9H+APYD/AQqbWdYVmSsM0ua3n51unLuldmfZGLE2ov6uPegefbasAn2UvyJXxYNyKKS0xO2S1xmcMc40S4tuzqWGClZ5i5lkgyR7E44ohRLWb4imv4TsCjxImqZO5f4YMlLjXoH/4COSvrgrx+gXrYEC3abTDF+D6JVlxYYnR0y03U4JlYotpCdTnjRySdU4afm9DiLRKPqwsO7bYABVse3MWke8yE9MIr6nAeJ3wmQYo7EAlwK8Rdt52lHCrn522kbCpOJowDgbt+MtggUv/0MVgRAv+PEICI3/h+vcGPIZvtNCtXcVJDTGCRaRln4JZ6mohvIq1XfT/KxkapzBpGFJxMUMbTqcePNfIj+ZFTwk6tDtjH0eI3Rt0I3sL+H/wgc/yENG8UP//jo4w/0kdcYPH8LM01f/gpOA/Rmej/gMAErIMFAFv/JDRRjf5mJ6myEs/gzXd/hNo3/h1cWknb+BUUYogKUVlf8BBGa9bramcvDBPiDrI4tjdB5dspdKxwoWjdyoGs8k8kYIFyiM3Yb7fdx+VPJ9fjf0+vx2wQUWwgE4kvfBgcO9fwdBrAt/Dwawh/w/jiAwOLTcDVdQWV3+HoqasfGM35W6D7IT72mVED4sq0iHvI0PLjdsnKSw7xlWM8EAisJ7Oloh7u5FQ28GjOesbx9/lJEaK6SysVDyZqLZ+HbbFsKxkgEdx04PkQL4nrfLoEWhg7ro9z5a0okMTSt1xmP4C7XCKSQdxq+2lL/Bh6lmR/5GPUswETetqb3/L/XwTNT/AwAzf/+CZqf84BM1ICIggV2v3l/HJk4BO2fXMIB/p49Szz4TCff8BJdO+YCxxshWCc+mIH+Y2tqlZNgNCdEdotb7M/ZvDTpn8ObB/d/ghU3r6TBJJaYyT0VNJ4kA+LKnKOEd99YpLjafxua6LR6TwI2UW3+HYlJK/qmJS7v5+iUqnzl6gz7v5AnU/8cDZUPj/f5C//pI60PVf8VAysKZBAUpAgEfxPR/nwnsx/glgONLIpDQcUucNks59u57CUvKUZIvWweiIukTU98uWh87ELv4dmL0v+qZi/v/n6Yvrf5esBDv/0Qu7PdJz0jyorwYOEfm4gGWpPP8kI8PV1Pjw1LvCZTuRLvYREFWl8xXzKFsCT4oeFV8X3OKn8SMM/4CTtwn+A61VL/gOorPf4DABEv/GQJKe8L0tnxzHAfwK3fntxAoXswR29vKR2rQb2B71LKH4SXo5r0hm1J1jn+RqcE/PjhVthMP7/Mzv9JHwkcX/Asvej4NznvuIih1RLNsq/IP/gJZ+OuW7J9irjE5XUHs79Sq2nSUFc3EHQkPpF3XVCrJiO7u7AMatl8X/BAuFu2/+ML9WABKieGl6qELxRQ3LWGGPdJF0B9r1fd+w3Nvawmnc/3qf4DBJP9lLYRPSjc+3lsInQyf/gLYRKttvTz4pun2SL3IyKK1XrkFU/LFNQtpqLXanZqGUAp9bz3tQlSMbEG/4CCiZHwQPpk/wQQBESDs4sfGBx/rPE4B6SK/z8tfEy5OaoAV3fsJWoIJKc8CJY9PXDDVt3Ybhl8CqQpWlDZoEsSVZ02Y/KOF3yS9cC/2OyxF22/4SNTALEk21IikBcGf5+WxCff8ao/WfInALZLXwthG/hyVGVayV2DfJK4iCiTKDl08Q04ry4mTqs8lvkOEWkChX9NB05IyHpK+cFKxtLUWJ+5v4/Afx1+HVTkd+ixnD4kL6KMz4UUMCGn/xIbpGxr6ciMcpMoWNkUKPowebX48KV0xFGFnwH50gTxhoBVv4V7kfuuO+rf4kYkb/GCgs+k/+MWwiR1fJFcNobDAFtmvkZ3jSHqyiJyoczv6G5ZN+pMkKpr/wEvA8fwKJSf7EWwielG/28thE9ixP9AWwiS/onz9dQSu4uVnbbW21yyiwn3hbZhNvGyNpHMjrm+NSXIH3Qr7Yme63pNXrQHKHsB92WBppaS5XvIskC/icYJYv4/DZCUqRjYqouGk3wDhkEcmBDB+ZxBvsn3VFbyinKckjC7D28koJHaMNUd/AlzNkMb1ZQA0Pc9gB0sPOKwEG6CYj2iAD/JywWR0pbyBfn+Al2Y/1/gBWMNrfSPiKAO4Z2pn4ABlNyh/ayk+2gKiSatCaOn807rXgLSuLo1mQAH/AQe40D6eNG/4IU71U2EDqbjcP4DABJqCWjw/xQD7nwwXGkWWQ6EB/joW4r+f7/W0/9PHWyPSnmq2EAQbT+0AIgSwbIIYHmLTKuAwQ75CfULTxZFNrc2L/46LPy3vhAOpuNz+AwATlgwpsyP//FvD/8bXmpLhvJ6vf48B8QXrp/5bZRAXohwj//yy5J/r4qkH/iIFj/98D+aOgBEP+gAfzT/CBf5bMy92DBAbpC0MpSYa4H/16FizIDgKCUGuWasSQcWOna0XbH2BhY8dECDWFVWFcN+5EDTNwzsgNUH2lbLUm3DTNV/gpZck6nzgBdkwxvgA4P4DBn/8HAbYh/wYH826AYDAOz/BBbxD3f5wLeIZH/Jiy5DwAvQoCYBgN3/ggLy23P8iBeiHV/ggH7w3P8YA/m4p/koH83MH+BQEb/GBbxC3AL6AvoOAEADZANx/goLw6uAG7/wUFyHXCA9X+CgfzcLAhT/BQP5uUARj/goFQSUgJe/gUARXICZ///FvD///iXtX//xL2r/BJ8divY2tHdOT5xLQtPkfsI4fNO14OP9ePKy0HxoMWUXFp4A1KDMzp5FD0TSm1RRJp3Njeuj1f6IqQX9UOkZQlzChKN/xJjSqh7aSZAMxC2iUiISZwOBYZq6mGqNmPiIw0k0W7WKZNVfP8Jl/v+AxPS6EeGjBv4DABDD+A8OC4VZKedJeh0oa0QJAvXO9Dwa9uMKHB54x3JKuwf4GWUVgWfzbrgV9gI2oGkbEklItJp0aYbDvCK+nPwNe0HqqEfv/FiO4Nc/4SDXutHiEe0EnmwlYUo34+KwKn4BCfYhk3u9fkFBf4KxRWCgLmkTuJDkXr9d9E/5mWhRy53tqucA4x14DdFR8Id5mfuNM7UYIMI4vjcdjFGdVvjXGQpEX4c/8EAoNH+3EcSL+A8yhmMstFIxJ7GM5vRSTxuw1xmIqf1mpHLEH96S/VLaUJbdI/wYLvR01/ggNGB/+MjyQfwJ54t3xv3VJy8YsnMspl91YHnMfBOkP9J2q7UVGV8IgyFrt/4CFk2f8SNPKXdBgQK03+CGQJn+AwkBdQoGvDRTzDNXDpQlMjq8HSxEbGYYBCiYJR2v/gXOGfGXjcr21OkcQsuwKbFv2lQ/km8z4c4IEFKEAjgioP5vCGnKBKf4B0dlmuR5pr2b+dHQNFcOC1AcoshUHfIzkiGE+UI3A/hMDP7ID+BQhZqpJSJzDgBtdEzope8taW0jZZcUnTfCGHAX/OiBhgJoD/CCuUn0uqYpGIgKnLdMyetABILfbEQ4HQYH4Z76qe2bGKaMiM0fH+KAHq9c4IIfP8dCpRoFZm6UD/4mPqEv4TATynYhwEcibYS8RWke+w4YnBCwcCkswv8N/5PSQuLPx5p/8FMxlv8JCAB/uJGwj/xwpIluX//hv3L/Rw37kNkLJ/AvCTA1X2ZwS0p4DjSdm+Tw/+9zbXdj+K5GCa2n2QbV5rcW4vfwsMS91/BAxuGhSyEMKGW7OQMW73RokpdGTbHi0Mm+OzkQRKfwVu+wpCBUZ4EB6aAeIfj6mCsgec1gAVOrQsvo1dKEOhcXmbcwicedLF7eLn8Ek7bxNKDUBT9xI7/IIKwPnCAEzZJvBhDMgfwr5PgthaMTwUGxZ/hRpRx/jvyfDmSmKPKTZjImZIwYNw0oD/lSxrGa+u74hgyx9k6n5js/ovyc/4TBMv4YPcbCc0+XpAAD+S5jr/Mw0zv/n5nbP/ivyfz8xJTGn8NnVYvHSR+bFn8rgz/+JjxPD+Swd8FqGgsH+FSv8/D9nIB/roqBD/yoeJodMT+z+FhZn6mMeu+x5cQsEiMOj/gJFU8wtRu++cGjvBgehT/ATA8n/Co2f/twS2Rf4E9rpXpbpRvhfp8EMJ7cZqSsjvjv761hGES6U7KxIPPHZRz+3/HCmi1Wf4QQW9GL+XtCplAmNpsCA/fZ4FT87XAlfTZjkRUnJFQeCpQk+1gt4E0jlYP9kI1gAD/f+hRf0RoVcmAg/wUK9oJ4s/w/mlSNi27nEFCV4yfbS+57oIxUayiyihfimJutcEVAtLKu7iq/riBQDr2KE8aBEFSAo3kY2ESmPbPTJixBNle+9f2O5XKGPNL0g7bSsjcWfrD46DWdhFAvksFrxr+56nHUwZrfwDxFwcdk+RG+1wcpOOX6Uv+98/YTQmolUl3EHKzHojz0mjM3eJ7pTo4XBY586JfsAyAd3zhzv5hi2W2JmVhpIiCT3wpXl9VUK8kcZHh6pbxcs19yb3sTcxBhz8Qy7YUcaNf+QE62iofTUOuVBO6jV5YUIQnW17B4rilw+Z6OBn5MNoNCYfydbD8xcLTcwrz9K2XT2blpOLZM1l8YBg9agRYr7BMvxmvVk9qFYphfyJtHHr/ASY9ZVYhD6IoDoHnPDUReTxb/cWEywJwB6Naqc2D8YCvQ4kIwFvmq/vwm9XDD1lQnfGpA5s27CUVkvm6duYcQlpUbddPM++un/nEXwmgegyOv64F8ZMwlKAgET4tCwB7hetmKlZT9ufafE9Q/DiFLt9BnFQu5d/AuHL0O1k8vEGAm/lLgkPTqFx9Y4LDK0XHhkqGuhkrIh3Ewnb/+MlPp2b/wYHBMCPP/5BDxK/x8M3ff4+Xvqv4xkDz/PzCwwN/4+DGc1/+Ccbqkg4Vb+gOpNgxouUVQrjHxZPZonw64Yw+zKAAfUIK9NYB/xwg//K/+FEH/4CBsv/HAGc+WA8uoEVFDjsTQf3Mrt5NJNt3YhiJyLuD4Ywx0lNBLEgf76Qf/qr/Hydp0Nii6c5m7qP8I5GAVQVkR5wi7kEtYFXPq91cHay72yJIPPTg93vtf2UMl4TQ7U1OCjOjktnqRXMZETN7m+fN2McufzWcINQUEhN0OTlEkc4ZrFJwv0YY84p/noWkeAill/gXUTrhe4NQ9y1fTGzqCHTio1AyGoyW8ZVZeP0OTbvPu5AaY8f4TN7/9yI+JX8B021kkLsamd8srFi2BuBl86ifTmCB7D0lx+iAYE/9LLmBu1Jz4uFEE2v/MAogmnJ27hxrWOkLmseyWyAS3t1D29fwsTPgTo81my/Er6/wMVV+LaaV4oS4/pEtBuTad/d3HIOi4h/wHzN+P+hD6H4C3/4e5m87+H+ZvryMB+TTp04yfp1QYJQDxEBwEinHbZ7rgdry8+jMbPp5Lt3xzVhPlweQ7TOmRjSQxon7Knv0rC+ZW/ILLFnRnir/KCIfX02zcHvuIVVOABdm9kMoJ13ikySH9zZ81YSZsEY8TpSvynv+KlCCb/wUXRpDY3/tIujSxv9hF0aX+djV2r//xdGl/zwujSNYOhP+/F0aXPE254YRvjRfDEcbn2LjcFJ9+up9vSfzgCWvMdFNplPrwuDNqU5UPzQIbCGDaSL8P6t7z7Iyj8Rw8VX+Ak8x++Ey1yy+VZ/wEn92SFT7UZp7GSyaFOZW4cG+rKkOx7wjvccpY/l0IjbFNGd0yZ5/4CNufTO8AeFvI5gEqTlS04XMWz3jLoKxV+YmplCo53TLV1h8GQw/Hy2zmDmATQ0aY/tD7ScT6kAx423wXLoznynqBd6jIzsBmqMzNpL7u0ajkzkieAdQ4zET/ICe5YwytJZQvX/gIG2X8fCzdDbCNdKeZ06S1YeSainEOhBD1+wYUayr+H0bua/DS99m7IsJxl0t4JB+mfsmG7KF1kDNm7pjxP3D8aiZDnGVpw8no5xGLNc79l78ZB/4Ap1FbC9s5bwfFzSxGgJ/gSarIXDbN1cUHI93LTwFwmXjfLjOU4qs1PCgQd/U1vgjvq22dngamierArkKy1eP/wEQ+IthAalUrOKTRfBYGRMlwMj+HUhmTnWAnPy0iGl2w9eyIB7dNuOqSTPz7Z3+MohD4tv//HfKv+jDvlUAC4J//gXIcuowa5HYVKmazeKCAI9Xe/kJPuVSNxA9Y+P5KMKP0Vqb/P8dLAPU7CAY+WOf/AYAJsQYEuRf+tDiTbpIv4fQPhmfAC1fbl6fLS91waGC1ARI+e5LalKY0rj2Bfkpvck/gz+W1yIbRZMY0wQQh1OMzBWpl2QaUw0/2AcSbfw72opn9U9qPh/P3ajt/yPdqPh/Iu7pgIAACJ+wv0aL+CvuaeV5aLiyJ1Gjf30q8A+gKrn+RQif/woWfNkEC+X+XhUH3+EwEj/JQqD7/AfQW4qw1BnNmXcxEpxqOswLfeDMMlUrai7+XVdVQBXZTPKZPbsTdu2YordGV60ndulxNF7LBeah/wKs2K3+FAgW0/18ZCPfwHtl5aBNB91EHDBQ5rCmDKD7qr6BqlyMAVxihsZuH1xuX7fxP+JgjuX4a0SEAoFu6IMBMK/+JwKJMdMlISsNi7zciRceyCWs4MalA53+GGfd7b4nDv4D+HTRI8/5DNEuD+HjRLm/h80St1gxYy5cke7CrT7JKWYS3/BhiGVkV7w9mOVSxgSmgI8AOd7bDQgnvTKX/dCOFe0IapJ9zxTCHwI8vOY8y9PYPh/ykCGq58QrIkuIzkNqYlgRIqxnz0hgufdU3xlyQ12zg9AHrjtWf8O+QSb/VPkF4/zzeQXH/L3kF4/6IS7GunTaAZ9DnWmHPX+ALeDnTvtmRVDms5ps3G7a+4rTsS3XWde78/+HGiethAJ/4V/+AwARov5faJwPisJ3MzgSfv5VW+WneIjLBORBSom4ZJettKuxI5Faq7P//wbFT/drRO5CC9/hRKrAKIrxf/4fZRxyqBOQ7lDm6mwY7pmiwnojgHk9fe31hMZBqiggte5saZuCwSTycuVYrfNudE73oI1duYjGVnPxcIsbfC3cQSm1Fm8AoYCHVmETzDtt2baO/ChzkwtFeqU8qfvbBw64u4StPyO5ZWhkMebEabh+82JMmSF8sf15ZbzKI6ovwOkXCSXVA+tYl0bxjtpe+Z7PLSGPm0QSji82+a6BTeXqm1rqDNPQ6LG2S/FtcPCZQcsPGy2XW7W2jAvrwINvkeD6n1l/yADqfJGIWB7Vc6GW7j5iKDtHM/tn9O026GGsMeBTLhQCgzSSbkdY3UrABnZ0LzrFIJxxnHfP9GmajTpEqrjTxwUiw5wrvOmHc7WmItuNMh/GZ1N9h+IB/WdSLmzQ6CEpu+KzsDDMV5X7vwG93DOXYm3QknA22CZeLfmrcMDEAuAcQeSnjK2wEix+6R8XdLHpG+FyiCYwyvGQ89arUcR4ka3wdUhEIfm90uVYsqWIDfnjsn8ocpA0AHKOOpLUchFEUhJddoAmB/ChGf/gxNSg/yAEUeAr4ePcnjtv8DVTW3SjA5W48PtrAghCpy3q2fGOFiQ6gZy82A3whG9WTQ1pr/jgX22aD/aDIO0zAPwGijoWm5wpdF+uV7pnDd0KnsvLB5BIiR6GRIOwFNdfH+wCVfkA//8MhDf8WQO1XIs3f8KFQ+WTpQCW/5YFT2nr/AQtmeNj/+FAnCjF9dz/IC6bEDEzE0JV9ltnH4uVo8b7jmV6blZMPZoqKa0KS25Z1XH8AaPv0YoYjgKpXU6pB0h8WTS8aHGnVFQJ71yDsZ/lV8eQYBlggPDkbwjDEpZ5npPm2IixBdZ1AlHwOdh2SnWN5OoR9hxXLctCrDWIsdG+W1KHRpzAV5lQF4C/Bi5V5DmbSxNj8D2Mbwyujs5JPvEHRsyxjjlnMQQz2wF85ru55+kG47qSbNMGoAplTntdEE0OAcXNfBYibMJWRDQB25ZZ+1DBBn+QFHXXsikwC7gI8YoTU7qcboycS4Anpop5skPtDeDNFCTenjRWsPOBHHVx20lOU/rNjZZiERAS/X2iS+z30jI6kI3t1O6di/TKgIlNCDCRyrBVA6LhsXHTDEGKqSRRpeNStSsWGRSovetsf+AY/qiit0XjSiqc2d3lMNLNe+BZYLiWMkcBllODoVxPJeD92amDZftcfol2ORmKLAI9oTpv4QPFius1DFLKjslBh6+fyBSXmC7WIY+gIKAgeRUj6cx7pQPYwEr+BeiffcT9VZl8I/gD3Ekkl2NukHYlspGA2TQ62lQ+HjPW31hCSvH3/GDJyx4QgF/APhkgf8yH0whxYt8dPP8eLOBAW4f5QZNVOnkhLqVy2TOry4ttBh+zUqtxznjRb0DrItfZsuobB+0NYW/h1M5nv4LTOf81HHGVDGF+TFnFeAH8gJnLy5ip0qWMKS3SWdYcT197WhSqgiHG3GAl4Me21W3SDf4YSvLBaBM5QeozOXfLG/C8zhEgwJh885v9hHHZX8B+YXUjlJCLF/Vp0KKAN0fBRVJgB1KCSXfFGsztiIDCHH2oVF/jhgOke//2IVhe6gB9bfmUwGpc+yGcuophqiP+AhA/7/OASEFPC0SG6w4YfwNoA+kjFxTLZOns3oTqw+N39XaviJ2VIgAl5YrnML3+5cfNnhFZ/DJQOmmKj3IyRHOyamZrwF9YH//hJW4AsWj41taYkTuerFWlWHQLdv6dgBC/jzbtNjr/BSbs31BCI2G/UKhjiiP4VfciHdMcR4zjDHaNT/hwzmpWAMTrlHf+OAr5YBP72wfYUd2AVwxDpidC7Xiyz34yk7qdEI/jzh9FKfu/gFBjw6vWicHYnEyfL3jLLmq9iKG56jp/wEYdFVBho9rytubnE1IFdshDe3japVAAzxfMnlUvrAvslOT/gxrGoXOEAowWxP4DABM2DAU8WlOUw0olg1IcIzPPmKqe0KoC8EJ674gE3N/aXfEQALkB5Foscc3+Kx1r/EzRqp/nBfqif4GAhv9eCCRP8fvHN06Hpg8IqIcmBwbMxszGKEamiCMJgYGJyymSMcnJmhoLCaoxqcGJuyHJkcoZyTMZ6cGhomZM6Oh7/lIGfUJm09MzQ5OTU1ODc5N/4IAFG/+LAEJj3Y9MzUwNDAx/xEDQW/wKHyjgeIHaGDBNjWmtYM4TJmOfl5WiEJGmvcAr4hyRluahpQ849wl8utN2g04xSV8kMyI5hRO/yH+DGQAUEQv8KBF0gFJT/+sFBdnuBi3/HwPSQ9g3eDSUO4jXldT4WKbGgGgQg53SH5RoKtz2xZSJFlLfGuNI6xqui6y0fr40Z03pPycp/4CO7tUH/ATN9jLDIraPiDpPoXf5CZABQJp7eofktC+9boaJ4uEJnHO4+eix4T2n6hyMcWPCUL3YuZ57ts/hNgv7EX/CCOSn/CbAN/Iaa/b32EdhDNhhybz8eLiqQX3fhn37c7/wcyC6Wn8EIlo2wYEnpf8dAtwJ/DwLd/kBA2KySnKh+JUl3rVKpCTTBW9yJDYNHU7TJ6fe15c45lFE+9QCGqbLfD+QgUwdPpKTqbeavVZj2JMoG7a1a/Vg5v0vq/yQojG9RSUrJz9CsoszPC9wHJsJ7tF4f1T5jpZCHWovjdRAS0fILYf4CYhyjjfEz8+xGyVRF8HAbB36TA7v8IDJNxN/hQtpDAjAc/PIM4uf8Jgen++Ev5IAvAr/XxxdcMGIjqlBAK0SOGCSoiBWE4GKixawh7iO0I2pL7CfWgcPecMRllsbLTaW9hWSlbzDgi3a+0qFA/3QwliQiZsjDPVsl+FizvmveemyXMGqXOSroSYyBeXb2meFDLkiaDdKffPt+yQia1h59yCyMIxJhmRQHpA0/7M0nUCxMOl4H17MJ7rgVz88DpV951qDsLDyQ3KYGY+AQIM4f1EVRAJ9qNqrfU+6C4LeLoiDKpMmfFT1/s+ALWZhgJ6Ag5OryqAvsVLG0M5fCHLnfvKwQ8fD03z5M1C8TvfdB3Qvsp5yRuca7EaHjzYh+XYq2TkQc271Pi/ZfU9++8SiofCrCycNW1qE0/AQkAHWJPyLitDeK5xK9JDxQadVv95pRfNez8leSP16d1M2ZdFluWpIY42k7lle58wmqLyhsxpxyA4B6Fs9NgN6iyArK3KPanlgy/lG8d4KlQhkHM/+bKzd7SQBZVxULTN9TwG8L/cjfJFqvsmgThQFLXQohODDa8kW+TieuEFHQOwf7vv+sh8u8c/fpRkumzjF3uYwlz+AyP90Mng7yGnUB+AYiwFO39agMsvb+bgBpB/ACNugy40Hd1obE41+ZaRE+0DdE6WC6+MwIN7gh8H6o2XoBvDXYufJHv3dwX1/JdsWYslPoPpw4iPti0wa5hQdQnKoo9yt9lGaS4DZePz99/r44uu/gUrwi0yF4i78mCnzRTZnQDI5bzxBYdMqHtDIprnmFaJUAqDOP/HS9umnf9SXt0/L+HoZLz/54Kfwf4+Xadv4/hsvI/34Kfwfw/Lnfx/FJef8iRSXUYgeFYHQgDTWgAwTdI5hsbawXMEvc4EAZcHPNRSkGQ3v/joGvXvP9oMZazMBPINlG2kdKh60QZTzn7yK8jDg4jOXeF+vMDWmRx0fdou/4xii8v+/96z+iXYRz/Shn/go9iam7/IA1Hq8kmIfkxEibhKADI60WlLY/0rZeooIqTBQ24QVE2Jh/R1p5siV2iKCPpgpvh86aeJDDE1lyxy/TwgqGJ3Ciy8ZBmlh8CTpQR+J7CwBbmGr/gJFHPJC0MQVhgkAOe39TifoqVSvoLvBXOR2NA/BBoIJVuei97yF2kD48L75KlnPaEruSSmR5eBnq5bVhK97clw+9QXx4AvItxNkBnetFgYMx/pSg9dcMMZX5LODgiqwHuriJNqi1DxbgH/ASGKb6am3VfazxCXy/5AZaSSqrZCPLO87dS4BoWN9GtFsAxUsPEv0E66T0ADqF+bTSo9pU/cPb8lR02dlpjR5U53DDUDIyyhV3zLt4qkggqXTi6C9ks94Nr3cnUjgS9hfZ9KFZVH2dEFS1wuA8iJHGTCB6zopqxUJHn0OySUMUQO97W2nqtCPd9FFGqI9uuY5dCNNxHfyY79oeYGVeBFIvsFLa9nQ7PBpzGxswJscnfgPcyEwK7XpzzYn8q5qBC3918H1aCM0lnLhTmtA3lB0Zv/AR1eP/AiVmdsunChwthGwiGWAJNJ72cawe8JzTnGo00ErAFldxxlpy/46M987GEA7PcoP4EABIMCqs2/7oZ759Sx/i5fa/ukvjlvv+G/ttqrwKQYj72/iUYAQfuRyslfVgqwVIlN5ZiYH2zop37B7eVU3d2q9ogKFyEJP/HC7bIl/3ta//x+bg/whh6uKDPfwKhQJWIafE4VVszbc/RGxOz/lo+cL2mTjModkrFn3BcsIHbA3+Km5JGugLLWf+GGkan+uDi1oXBQYIAAMBBAkCB/9zKjnHw8/w4KSsV83XF/DgB/9dZB/HYA/FiKlWfy4APiTATfx2AP/wB+BksPpuYqnlNTgep8bt+XWTCeAye7BWDz7jrl2PIEhb4dyobu9LTsBHRoym83lbgKCk0i9SLPTvFwmQg4WPcEjiLapJCGjkatZJO1dLaonTph0gmymVSQeeD+eRVfW0O8DpI47FG74oiZJb/YlE7CUgkGX5VGyBNmnM1VcQj8hwuaFH8P+Vhc4fG7qqvB1gb96LCLShkBD8oFaGUH1goLsnXwUClvOT0q5zIBaGGbb1oGjFm5XV8DlhrJrGn0qoGz5Dh83zLT7MzngNqmlaHMtABW14yYDtJTXHIVkkFvymipm6S8gx2Q2y3lEm1eApSh3dyRrucSYWgWva6leDqKv78xmxq9P2/+CmnXT/GzL/yjwgEymrB/AYAIzwYRzI1/nZl/5//4E4L/7AZf+W66MQwvKKSYJd3YO8foxpBdyDkBROAWo3tbPe/jzE/7Nj+HgV/i/n0FfmOWmPAct2zKNjuq2OyApMN3QfJMSzlI6bZSzaBcThbJySr/fxVJuEKAt8wpk56vfZnFIEEGzT6WdE8AoKUlRdOMjaEz1DI4DnOur5xJNm3IFpUf+oqbmJiRRFjKI4mzbrRQRnOCgGeEGQKxM/+4KgI4xNWbudz8FY//PFoOBUAtL1YSZlAoNMhZB8KdtLHxl27cxYy/Haoq1mUadth1ltH/mz49S3N+jjZrIa84JK007rpuOcZki1qf2HYwC8V18cLWzJ6+hUiHFP132TmpNCE5qeKsLdJF8VGDbKP52uXtfgpwozuKr4SlKVLbD7ge8ZfZpZm2y9ZK4vGFK8kMWNresoXLxprFCZhy9IjhPHt+nllYOYcLGGz9xHqqAxkhySjWrOXiOAINNgnwXgeXiHxt7NZj5+Oucp0QQIBBrlvKJARgwLqkjcdegr5cg1SzDGtJK5N6dPUHDzw56iLUWlbzFuynmkSPT4vBYXRxbnOZuQSdOmEcdr7ujlOYoGarPPRx5oIxs0mKcAE72eof+AkFMHBBRYvomfcz3BNd70aLHPXpmSL31wdcaASdUQ5k5VcxKIr4DCENFIEHjb5QVhUXl1Q0pJJaA4ciLmZ8om/WTeSltg3nBl9dT/dP52I3boK97/wEH8YJCgcWKTIPZKifRi+ysY69wapNwNKzdN20Q6jPJtPVaEKoBHTyXfUBEEt961cURtJ3D01TGB1A3Zs5tIbRiADxiTYhFeKA66BdFfl2JbuStWRpT/Sk6nWmkB3lteHqQC8MgAVb6hEd5DigBNnCqACnhhgfdYVnSBKu9Jmf4Cb6k4zavvDW2iBpVx4uGAjhuGpTDikXOO6TCKpBG+2kihehSB1l+nbVzt0gZz6cMW9ouYoUAUl+ecskBYFfFekkDqFDO1EZ9yiRAzIeANT4liAdjbVh3VO/iTgV8/wEouSjNIh8jWgnM+HlMuho8F1ygwBk7OV3PY3ovC2hd2IQzG32Q2r/eF37hbGS+IO6qSqM4lttTsv7Ya+7TPrn0hYgjD9Fjbab640jh0MitdTWSU5rhWRUJiKjDMZRnRGQwfgkBY7kN1qCb//gIFGqDLBeVBbigXgr5jtG+ruSJBfcZCR9gOqVjH2UfOVtNJmG92kAGV1rFIVWHhBP9gH8qdKN5gYFO5B0gS3qFWTwXi8VsCHzUQg7YZLwCgCIOzBhApPkUUFAIH1rk4VZiijxpotP5pNLDcn4Al9DcLS/O0xvVz4cTUDd5tMFeiudaHXEujXdC7U9fV0eXk4S8w2c4x3o9TyC5XQQitkVl4O64mFD/Q0Y5bkIOsp3I4ONQfQQ3tLXhQstxZattiKabggPG5FdpIxio+EK8RFfuuAUzo6FftBhAdzNNeTxIuABT0ieP6M1Xi9AHt38n7CjKZ7Wj9zWPn+CxRw9gUfwzu+HxHr4mOYL0useIIgnvh/s3Qm5sHmuAcVxKTPafbMTj9Yj5nF2mD0jasVNb0rsvZZK+YDowll9RO81YlhsoVD5EE5EFb1NovIJ2jhIYB/gHr5n0DNfxhDCSF3Gzq4JBG5Xg7MpwT/YkmC6HZEgEgHDUqWozgCZtY3LO3otZBii9VuWeyFVwz2F2eGnfFCmRLSNhzxUSTTiBGLGlUfOP4yM/wEn5YIZ3dC2rjNsw7kEVQc5HnPWdg/DxN2JfEx/Sc8/z15mdbUtkt3lTJ5S7N0g/KRT+XVX0U0uD75yzaQm3jzdcvpsiq5CiwdSctMJsY2O2xb1W4YxjSIm9O03DPdhLgeTMx9Lhfo4T222PJ29Eebb45RTwEBOw8xAEJ9nSiBiXPyd2RfBUB/wB0VuOhYYFGFX/eTBI8GPn2nK2yabJrZ9d7FlJOPfZCAK9aE5kIgyMtXWusjsDfmxokljI7zCeH91hz/8AmF3CJTekhImJ1ogKjyiTfyGKvO9eKXkEidA0Fx6jsl8gPfcKc4xmEN3QiNSDBlpdUB/ghcIH6lKBNOW7wcTr6OQ9uvPOfso2gsTwcoHYARbS+nbT4E5fFv/HBfY6HfzSoSgYuaFfKumSAcQWPD0st3Bk9DtLHty+hZwNXperJ4bs5fw/2yf/+L7Hf/PA//q4wP+FFJfEfyXdi/h/GqiV3fjcZs5LAjRs1HFdEdLZsDOgKB8A3dz4eq3OUey5LMP8BE7T+IFzyKgd0KrTEhAEQrPEMtslKmx3qR62o6xQLX8bTpgg+Cx8O5SxcKqNTIvg45+gpft8fiVw7xF5SdkqIEBoxfDTai3YYjNM4dcCX2JBlzVT3Y5QACuPcIt7ZmMqqUKe08WiRBvKo69UpSzstXHW4LhXWubBOKejzQNTvK+OwfdT33QmCHJjxTq5z/zXclM98mzj3340GnMmK95DRyf5AMzXkvyFtyXkUVZRgK9wh4n75B72a9h22ekAx2TxRTdgG9lgQZOne9YmTPUq9jB80lO/O3tDCyssHV72JTqhOP3EJDiATdC2/H/8BMIZwi/wCPxLqZuTbe9xgidskU+OaPVF9FU9lImCGK5NBSchVSiTePk1cgV7jyscU53P0aay3MAxWYbsmQolwG+WbGKoJFuD2kwmX9l7RiMdwPdJ9SJ5hfPEFVLazKiGNtcecsf5wqmCBhx01WB6DdIRnNkOtU2ASYIYTYG8YH+OEQ+Bm/72p08TnNC69h5oXiyyT2O0+/El1ArCi/jeVdX97YJH/AQyc+oAGB/jhYhesv72T+ABQq2CUFq5UuTcjH7/wEsj9pSW/z0ZPTFpnC/wLTWqwG1CWpIwqnNZHCAs3pmDIFELFQBoYmPR2Hpxj0NMq/zH+MGPEuc/wUCxO//MNWBbqXq0Wj590NTXuopeBi8+pXO+UKjcl5xKxYZU0PVSkgLI9/8cLi8h3//4zbW/0gZtrFnPgfwJOpMVxhsthIQBSsqZ/AIl6iwhZbVlMeMWv+ucVLKFHzXaqTS2csqErWF4xQjnWwL8sUdPNdDD8QkeBfIL/gp2OJwP4KnEgyNSM/Fi9M8C/66XpngkWfR872n7/PSbUB4/58T+YwOko0Qz6r+P154vXtW5ZnPMV+W6FnjPr09PyItaeDhgiZkgKqF6skcHP4Ma9f4TBmD/dNkY1XXbmoLSpu01pQOKpWqpYxMr7NoNl06g2clcmv+LEEfn8JhLH8eg8YOmfy9ZKf7ACYOv+S58f/EDbUJLmRV/FYA//pgyk1/iUOj/j+5UBDMz9YL/1csyLcP6+nky0Ymj9+wF1pdX/gIsGYOoOJTFgZdbF1DYlwXTx7RsHKcB+udp0EPlm0CV27bVRq+WVPVSyr8Tb25YNzjMzf8gJzAOAsU0Cixx3cu9mMWOjlXOExJtsBCr/gowYUV/8KHXxv+vDZrYD/AAAAAAAAAAAAAAA=","commitment":"0x91e60a78a029703cc1262149912ce813e1093221871ff84b2a7fc11eacf66991591b54e9499efac736865a329f174d51","kzgProofContract":"0x8d01ecc98377fc7dfd4618d11a4fbfa1a955c8f5c96194cce89772105a5404114407c27d64ec14ca5c65b73ee5296576","kzgProofSidecar":"0x8ce9d1f6f8ce720d56b10aa400cbe892e3ea89044acd94a46420bd5ffc559604650a5d8c29c6dc884ab9291c050013d6","expectedX":"0x982163b45ae304eab83586c9557af511aad407c45ae4697e2a395ebcc5fec1c6","expectedY":"0x08864c68223e428f33dc1a816f60cbd2a3ce8f68ce2b47f61520a84c36b09419","snarkHash":"0x08faaac7476295925ea4317372bc6a7e5118af7de4c706474c00ba8fdb534a19","conflationOrder":{"startingBlockNumber":30388561,"upperBoundaries":[30388573,30388579,30388587,30388645,30388719,30388807,30388826,30388903,30388920,30388927,30388937,30388939,30388948,30388949,30388953,30388957,30388960,30388965,30388968,30389025]},"parentStateRootHash":"0x1eeff0077670b9e5150fc25b447a9cc10f833a6528f25e924b977a5c0cc871ee","finalStateRootHash":"0x331ffd7f618324b54339706420cb7f4361ee44801dd50fc901c65e013f8109b9","parentDataHash":"0x0195cbe6907cda0675bb2cc0e4b4d7f10ef7791024ec50a9b732c9729ab6ff8c","expectedShnarf":"0xb686e4f81a269c88420e67e8235e63faee8aab7b735520cd112737fd283bff4f","prevShnarf":"0x425962763f541e4f57f2c2a2e95588f2763bc9f769b55cd4ffdde2842a1e0f5c","proverVersion":"7.1.0","verifyingKeyShaSum":"0x213d4f5d80ac94278aef1a2ae0f73a0e069d5756b2346147da1aae310fe8bf1a","decompressionProof":"0x00735f895e774c3163c52e58be266c9a167c10e27b7e8c6ded206aad67f15f474c45c3b957b4ae65c0481c81a43ace6801a2d8715e7a05aa201e7482614d65b1d024eba2e84f65bd46f10ff143c6b810016a7d96f2515ff9ef1d4c28d18d667300d550dd64db2f36655f12d88f035402002260141c3e443f328186c8f96eff45f21b7399b48ca6d34c4d49710d896ea20020833761a86d7397e01823a394d3529e15fad78bfa24c4a3e4b89112a72839236fb930f5315640e7147f2e25237a9f00ce06f296cb387e4f1d75a9eef83c8192a95332213f7183ae642ac1bd67d79495bb55b87608fc87cf21f51f551fc87d00a4063c53df75d37a6c4ac35b466f60e4ce6452e3fe7a8e25456003fa436123606bd02faf3e636f4117e30b46428f82011529d161e0550ae53402e087453d96305c49c96193e371de223a72ac8065ebc1760e8cddaf2a947eb1cd1ca146bae500796ef3cba98e042c8f96dfeede407d265b50582ac2ad9505f87b38bba87b953f1a2a3dbf077853caf2896e0fb58300018155f4fd929b3cc293b13d0b603bba446b54cd52ebe99d248212beaf75071136b8ef1e3915ef56d62bca27d8c1a75300bc347358872fbf9b4fc4e270aa24851453d09f0e368ee0dc847337fbb17fd297b0cc6bc1ef6dde07e7c16bac87f62d00df7d32766974986575b0babf7dedd0d3e20c609b633a0c70aa488726ba28958b539a6dee6e0bdff436ddcc68255e64015c2f01448300ac2d9e4e4997cdea0605287cec8d0ef4082130d811d469d4f114ed739feff29629bb127284f0b699ef01a7994c6c05f7beb18a92d247a751305644bb57268a80a57a7a22594d7fb5e0c04f337306c6ab4d50415b4204ec1395006fde5d1cd578f9c86fbda0b93cdada8ac6da2977492c351f587726c9d2e53a10bc9431c61a68ce9a8e6e1838756b77008d273829ceef23d057bb662123c205b7b6d6471810b9582b810b47bbd11da6b86a6e808b14c66e1b8f96856931f0ec004ce494c78963a0cf469723ab55cee7a47de709e5e1890f97069627e5365db9c72f19988f54f35e6adfb38bf28f939100000007077b9324dfdba2eb72aacbb5644e642906fd4ebf708604418e12866827c936950b9b4394ecbd5ce9e40e69241c037bbd206fb2a866919fe4cf822eaab10f4bcb0156a7a292f0dd414a1b1ee042a53ba79cafbfe3636fce3cb510717ee9b30da103660bb21416f1fb38846a64de32cae9818a7593ce851d52caa4e5a31e1b5af40783e9282677924f838914638f0dafd09e4b3ef51761a6f776c08f24bc97c98c0308e3563798e900ea56036fdddfe85fb309185028fc2a43b095c86e06233dc9090c5e37e72eb3d029358fd346619a5eb14182e3fd44c6f261da1bde4cc01bea009d5334c2ecf60b23eed8d57e1950986368998ebe4ffd813cd53e7b550773fddef0441014b19b4980f09bedfdc8cbea0026a14ab2853a0819e743954fdd88ba15e3d29179084fb02308ccd597c2112e82f071bc8256d78d9cebe3e96f3ed92303724935e172bc4fc6e73bd11ee09817f0df06a1f49eac9aa96b83d423ff4b9e000000010188ac5fc9e2200b107424f81753a045eea189b4de8b6ee2cdf1ca572940514bf61a619600dc5b4502603a39aada7c5001795d59591430b287be06464581799f7e1694ef722d8375a01961faf00d7aab3bd0a3e982e36f721b5fde0474721ea8","debug":{"publicInput":"0x876c1a0c6af50d9c1e8b7a272c737fa78b425d98d68a5d932929dca2a9aab72"}} diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389026-30389504-response.json b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389026-30389504-response.json new file mode 100644 index 00000000000..6026e556a90 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389026-30389504-response.json @@ -0,0 +1 @@ +{"eip4844Enabled":true,"dataHash":"0x01f8ee078933b7265e4500cbe02c1150ef6476fd481dbb3d3b2a71b01c533c47","compressedData":"P/+DR4rvuKHEmsLIrjjTTOIiVEZYCKl5ZW7uf+DuIlo+qeAAgBFuMBAOgA9PUA75sA6T0A22QA57QAMW8AAQAAARp7epo3vnnW6ooJGzZXh+hfxTlW0JjW5t5hWU0iZyDes1VJuI/xUCT0V0Fkg/+EgSehe4tttFmDMtRgEFwbYAaPe8NDrDyAuQOE/wwA5n/5EDKo/8SBztyB/kQC3k/w7Bdmf+Fg+LPAf8Tdl65WtTLjexUAlU00AACXFwYGBACBwEICQAD/jUBkYP8fB5e/8NgXcXsf46DeDIvZ5zb+E0Af8uv+PAB/heJz+PAB/pnJj+HAN/qclT+HAP/szf4TwdoEwrZXWmO4NZjNcWboyqciyTOLtAI44a95CrStB8KY6Dwv66Ontf8yk/fLVpcu8Ey/fEWeqdqCLtTJ2RgLwGsLeN2e3YT4dlOzYFyetuUqLR4l494KcGyGA23v8ABwBQ85f3plg+UbxFQaYg2lsc4w4fAUCjzrcT90EZXEOUK/TTIsluCaT/D4E+BDZzQpWS687cwYdyvuCjq67fytQhewRLVOjm19QSQJYiJIAGhcESg8iNmyOcFO4SbQgnAD0WtaDUprFIeUL6fE24SWOZU1KhjlEe+9Lbtc2pWJaLGAtHi7OAP2Qr4df4CCXmRRWo8PBRcSYehO1C75ZD0uMKQcaOYTurn70H6GLhjn/wUDeul5z9iAi+yqBHRHU6fhGJ5+Q4obC/4OC9NQG/soQHZBZl/gQAEgwKsRpSL4CSNcVGtdRcuyMWPhpNyo+lBSqAuQEEoCY4/wwCsPi8TwAIbEZQWruY/kJ64HjbF192Cf5cBwHf8QBOSn+GAD0L+BIC1/hsPqae3q/hYAfCdY0trUMHBb+GhCj7d9n3W5/hP8gA7Ln696U8IKEmUFOD+BAdSFrnKYhQj/4MANpATMgQDQQoU/gMAEHYMD29yU2zo5KSaSgfFXpY2RKJGF8h4wKHggLjEfVFo0f4eElgr+HAOfuBv4cFNgvrwgCig59aMfDrSm4KV2xxaFK+u8FRq8SZyrgpVrn84xEYrg4z4SkYWhnfqRbS+ke2BHvhLpHfpybpCV2rQgq8HRkv4fFURjwGhJNcfIdQFNJ0JKQ2cQ2jlqJ2ST/wcEvgFbnCASNHGJCASNHlH+CAem3/JQA7//GgqBQf52C8Qf4PGPP8wBwVT/Bwz49r3UsTQGKRrR2+w94QtdVgH8ABz8jtf54C5RUU06fAAANp7epr0dMWLWJikRC8OkIDMEZ6Zs94y2vOGV5ftn16zGoIyQl45JNB/rY5QQnTxiN8dZ60lf1KE/k/8EA9pn8AAQef/ggDqu/hMAg/wYFyGrFNAoscd3LvZgxY6OVc4TGbbAQq/4IDbP1iEAn/hX/4DABG2CYnCU5pHse9cLDbTI+/hUt4CyzSDBvH0hwWzw/SrRBiAwNJsgAA7Id43lJXMFyazdefFvh4ZwL5Af8EBKaoZOJ8IBK/GxH8BgAjjBgTGtSja9YukZM2T/AAJBQsjHcO4At/wACzDbymNPAXIDSIq9yVX+egGJwAgP48BciTc9P8+BdhBB852B6oAGtQxf+PgP9g/j4DfQP8hA9sIE09vU5m81xskwE5TeLiJP/wCFzw01hapyXFr1TZ4C/c3YqG7SOu94DqJzw7nuhfmrm2KpjXNbpCfVKnyCR0EC5v8ClHSYloEpM3sjRi+b3LtyvqkCpjSZNKPNPCbAXIIyCJRK/4YA9vijMGjEsoiizR+LNhO48z73zuytxP49CBgFA/z4DNvxoDOdK7viYs3c77yOKDoh0ls/r8VckgfwACFF/VwySeuEMQeZX/AYJJ/g4NJrAF2ihVXMZ+WOzZaqDZIqWSSF7lzDnOrYMBNQna1G0pONDBkOGE2ZTBhZBgLOrwa31bYL8lDkkHjAJnoY5R/h4SEae3qcv4zKmg6M4x20jl5EYZ0k0na38XyKjTbggWyslzYMeqwIkTF/hsMCn9LaAP4+AAAQ/j4G//h8OlgP8qB55P8JgZaDh8WwlDCba7BECYdF+PAh/qY00LyeoAzahDq2YDSynF2cHIhBfA5CW6aYNaEWohoRTA8B/Qg0oP/gIDLV4AphWrX14f4+BIEFETliex2y2JWvbz1oaDsXLCtEQ5lX+ibZtDXCuElxH5Mct/BEfTK8R4bq2Mw0LNCnU4k0wX347evck99LTkAXQkB/j4B/AxC0OBhECKesT5vGS68nqKYkIPS7xCEUoIRgiItnsyqDIdSL8a17Nqam8bpaOrzMoN4HzHWxosP3xN/FRpFn2cP4/A/7lHMfmL0Fcei0gVMDt13uvbIG5xH9qnl+T9fSIE0/ZShhahJ/3dFMHiGOYdh6QFOnLaBb/AQCzesascozLPirBlS6p/kgQCf7jj5z1jf8aMOhff5kICWJdk/2pvOQNOgg0D/h8SkAyT+pBKRfH8iMe8+RRMDqx99rSajsIBnNL0gCLc0/gAjYwZSLuGwMc/4GEpAX+BD0hJa6IJJAVJWIEmXd2rS6LSP2/pdmAMNSOhqYOt82k2jVmMzEzMJkMmFj/moSkHYF6ANtHNP8BBU7wUJZuOcLOsHAVShpgwdEQfvVhWxn8OD9ZV3PX8fiUgF/Hw5X/D4M+g/KnoeiRALL0mcPIef8AAcQsTDPmQqAJLVq2zJlE5DfWry+MCGr2zMlfeyO48NKiofZ3jfT9300RzMBRZKEIpclNrrV8x+IyPGSf5YnvpjsfljGxUHpvdmQ0qg9/AALVEIYbEVhMfe+mBosHglZAnanR+IEeg11dcLGn3OFa067U1ZrqkD0rxbdvLoA/5IEEBAFp7ep3RkftAk3k5gzPvc+NRFFQiglMdbAr6l9E0KGufo3G+L74JvuojQh9kAfmlAO1C8y1Ux728xAvkN2f8ECBpmfCAd+Nmv8BgAh9BhtI1yk97Gnhewa9yARrplWP4HxM5KvkDMOD3iRUBkeXXIbSZrcIfv/HxRN/HwA8GkH8uhLX+Ggo52gka7VJlyts4kQAJbC902NoOoZ+X+OAN/+BgUb+PQT8IpONmWkP+PgGP/HwaJ53H+VglEHKnWlzojdfgmnFDBE85X1h1OUYDP49DvgcpDXxBP45MDv4jBA001/h0bHTZim/hokj0alKIAD+Kxhi/hMNW/h4mH/h49h/yACHTsH+fASIcE/isID/h9KT/IfS0/4yQIQ7+A1qCqqlVV/jZaqEP4EW4gEAAgABAAMAAIAA/5OEt/4/Lh/4+B3wOg/j49ORClfhDP+PhGcC/jiGC/4fDYORwfqm9/xkCIpFBg4BC+6fx8NVgX8fAb5//AmrV1xmnuFP+In5hb881G94BkkvnwHiAAAV/D4ZF/NoCfgUoMHAIX3X9Jgn4X958/SfOK1h6BTONagcEAtBIwwv+MAcan+dQT8zkOLhZEiUgI7+fwT/+EwlT+Fwn/+TgE//iQA7/jEcX/kACS/h9C/JHO3uy/+PiuPA/j5Fj/mwXXVmRbagl0tLsG/h41xyIgm9mM+zK/l5WqZFoe46z+AxhO/hIyj/h8DjIYVxT/RdE3sRMXiDl9FHBTWIwQ9W5LcA0kyh4QLSwje/zsF5o/y0F8EP8ABqpwwTGssj7aYchsSNKSEMzAxNyQFDbQ+Cyi/wAG3j1wjwOl/EACB/DwB74H8CHC0fv+fr5JfiyZngmRF8ovjp5ngBHg8QwTrsXIM3EbKnfigzECRsVdxPSVDmNhsemXofzM0ogH+Dgh7MW6T/JAH36CEn+TgPv3+GFKw/ZwGvaIEqHn0S4WbwjnBUhfSGbbLTyiEimGQKRhjfxHXWS3+PWon+ICSdAAP88AX38AsH8fEs8AAfz6AAIH8fQtX8cBpZgEVZm+lMx/BwakO4jL9n8FF0ma5fw87pgANI5+nfjuaL70WoervXFpCK4h/AA1t5tGZ5Yl4rI5/8Rz1ooa2Pa8wpQQql9rGWwCZlKV8SD1PpGFIDGWSGTj1+w0eb6W+ueGEI4n/ACYRgp8KbU0AkwdlrrRrrg802Upg5F9NP5h3i1rlvxLBAQZy1r/gIC68TQmrYjnjuNaDF7w96Y/+AEzsRYdPDQsfG6MBphZAd1ZsF9iuQYp1jQgove01nJ6NFoRfwSR9N25IQ0MfDC5y/CovZDBWZ8meGmGyNeM+LmnqBGdfuM9D/4fAN9mfjStN1IpFEUkEl4lz6F/W/obwuF/5VzXATC0DcZBj1kGljnOy1+lw3aUz1RVkb6gPo64z4oWkE9xxDmkIgaI43thx5yphejvhHexAoqrkooov+AB9cdToYUTioG6z4nPcoDGo7Nee6LeiFLygBUJSyHrr6LP8BBgvb+lIS7EbTPHsNTt/yq+esXoGSPl5agPUe7nMJZJ5ImMlZ9ws8MBnrIEP5Nvst5mxcBB8tJC/axsJ8Nl5EKpKifCGiP+Cg2ENZQQz6iz/ArWtJvfV7BrF+W1xCh7c1sHr8fDOp9OtZHyHxRN9JVIhPt13f4qCCG66C1Y7+A3/4b/wQEEN4tI8jXLOBUlsmBWKpELb+VmkZmM/zPKIgID/JEoieWKwf4CClQ9EnMb8qRXJAgtn9AACWtwIHAgQGAQgAAwn+cKURAAVrSf4eAH2n+XgB97/j0AfbtL+XAB8AAAS21h9YE8ukLOdfmLdaDJ1DS6CiVLVXz5Hx7yv9178mvRcXoqD4tisH+t+RvRk+1JO+nCoc5h3rG0nJW9ZfkuqxZ+H9ECvzgLMD1VMZBacCW9Jm0DGW/UT0gxq2Z0svFkhYyivU4VJYukAbe68PQ5fDo+s7Uzjr0r2PJyAm+mb86lL+H2z2FvfqWJ6sO56dMvAC6B+LoVWzrWvlqL5QsxZKj5mMlOyHh6V71AILrxqmvLBWY5rRg/pMzz2gySpLcnXMUZwUDQJlxeLYSqSIYEkOFOBRs+syQL70PKZqZpUdT6a6p+jMmDXgXesR/2uL0J+k6a49XQf4AMHPVG5AiKx1pwgJBC23yAMf/BQbwf/igL7pL4MEq52EAowWy4MBVdaUaDA0RiONnD9od7yp6pXQt6ijtXz/gOUILVASx/4+fQ6D+KxCQcPU2s3Upa2AHim4bsHkJGaE+/wEBvj/4YECyOXSAMyaPBTy+i2WrvQDbZvptOb/j1DwHnj+HR+wcIP4JX14XZGCvKifa5Mh+43bhEd0SX/REjYu2YOAmtFJ9SBpIvD2scbcWT8i55MVb2s8Phl8PLHDCyL+CJCpU/vaQqn2BBZfC9Mpjbfv08fuBaKyz68PU/jsvsAW1waP4FLUJvPcNEdA/V+A0qzfXxnU3f/PwP5Krru7SQl9YUKOfPEQBPDltrqMRnsuhgcDf3O+zujX03L/4bYIEY4QCdPkXgwEwMf/FgUUAiLR26XP8fT8P8FNCFRQ7OpUFwcj3dhslCdQOykIEaCYBVseCMeXEdAV8N6Qj+fLgueb+gzOtPdu2DwrOBy36xxp8BfIJ/ggKi7GKj+Ci/n+A7IRht0kqQK5MEA2KXA6ONJIUcqh5+T+A4kneXguaAArV3ISVY93yZ78aJ1pWTF3qrC7+dJIBpTcYRnzx0AT/x0HLmQHR/HrfIAVB/kAG4B/0kHP8fykXsgKB/HpiyAwH9Ll7IM6XIuVpGk/w4q+rLNWUv8/GfIMapQZCIkEg3CQep2WtgOhfOdotYIjUEd7+xmoTBiu6Bcme/VLwcAQNfosZf19nHLSYByoySTFC4Xa03UzfCZZLrhx+c1xDeitS0BWQwphIInoNOwOETq6OofwAmb9r8Ay3LFpfuU8l46P2hZP/xNQ5mdUiZX+o/NJRPhld6YnFGISSCkq2en/onUiGOCLgETUJ/gCDLE30MTRMIMREYLDQSGwt4o1uhk1FsgjzngVaz2dXORaP0EVhgE32uRboI8DSrTAwZ7/+H1Z0GCNuBJENYxI8WeSIv02isqgODE5BQOua6j8z1Q66qV0lKZ5/cVRkZAMylluc2Sw+kozftsLZ1pyf7O8CQ8Q7Fc9F3l2bGQ7NJ8BYXsfAGjatT0qNXYm+sLFgPAjLPhE2aKWezMXXwAkOisSIQbjoH1MkCoQiSCck6qjwe8y0MsBdjGfqtf9gbB+Wd/K9OtWAnuxd8e8ZaiigqxrObiu3lctZII3ciBFoGEB8eJ4UIP4D3ipzV/t6I9ph+F8riYUmDQa/gVDJsmImi1AJrgMbUHURVT+AIK40J6FXXPURhACLJuB4/QIWFzg5BkF0KQzQMgblEaOffpfNB/0eual/ggcIK/72rM4SAXJYXPOKrSnGuCRNvdYEr0zwXf48S9RHtXf4FSWbYOkkTW2MSXaMFBbYqDQkh9bZQSGfO4RSTzaLrN3aAYHdtBO26F98A/4AWPTE0GMqbrWdgLTSt8v4IBj0z+trMP/HMBpS/w+oikV8YFzhtGBZ7kLcTPooaSGOm4Bp/HakKBo2a+X8Cp9N5w5CAvbF2Mt6oY+w3LuIThLv2Kgyd5lJYRohyopFrgDx/ioIJvcf4EwpS7C/gMuXxP4DABNKClB3Px8EE4fxq31tngV4EY4fwKpL4wjX1yiBNkA2wYHB9Xo+taUJvmpRQ6VnbwVTpGTAnsmk5vSsIOli2iIiSKoZwSlWVXm8MwOv+CgcwgshAJbawf+AwAQOgwH7J/8nCHe2LX3iXN2zgWZ7++7RgfwAxpPtDmk4Q1/xLXFZWnCpsCYvb9G2/9p1Gmx6M7BIQbnVUM9QgkeJ0h/jq0+1NvySbgHNEz0B9Gfr6FZmdDqbyFP+EAx26c/va+t0TJ8LitjuDw5R0+EhjwejMiMOSr/jtdvE8TmasBZAfCn1MedBnMQL/zcbuFMVzMXH7+CAWn6/vbEXASgiNMPM4PcIS7neHxNQ05uGu0p/jxjZD2p4/gVg9zfRVedzgIfuYEG6plNWLwT7pcmn84E8lBYXxIXT4+uly/8VAoiUGCVc9/Ah/PAXxN/wqfzXGJ/IR/L/FmET/GR+L/DzP8OdH+DAdXlp7Tq0wxn/wEI60+gsxzB0ka+vU6ICPiwMxIY0RMfPZ0HtH5Uw2CutJdCs6otoqiPRZJaK755nUuKa6gF8OkFzhAEwgFBN3N/AYAJ3QTEpSn8Jtw0MNitRY155cIiSUhn+GAJoo/hMBVVuoUFgsBW3pe4nHNvQHpJfYyeWaaslCm5sGAls12wg/wKNTEEIlr/qfWZ5vlrQsnpSMhpjS/ggStcT+9g0NVEN/ctVhdnKVkBbWUzkkN4uiweP48QMDBsmPyBW4HXAan/AQbvInTdiv5YiOCdXzZydIoC/Z31RCS90IwsSQ23IhwXBU3JFe/SOW0hKQTn72xg5A4/f+MEBwUIAUTpb0FS3X+KAdFCGG4BUpY3O/wkEl8fwJpj7+eKq+iQYHix4aIsKcmMVqVTL96MDWP7+N3IQgFkCb7w+Z0BMKYMXrMStV3DXAQyJLGX417H8Dn+Y2NfzSf5938vn+YAgIAAgP9/n+f85H+aJXQmx00boifwa+j2GVU/wS+liCh/4CC4IAj+fz/M/m8Evc7b1XHzHObRbzEbYioHT7zzOK6DTnrV0TkS7uxJOapSCjYQ5Rj2CsAdXnhgwN9UOmRY2r+ALyaaSUMHL3VTv1VWPQJCc4hG8PZoAaWy8x7sUACenEWajlqPsjjXjdlYiwmTxGtu59JvVB23+8IpQHCQLHVrbhZ3UxS+sOyZKP1ZxCoP4AjVgmyaO8Nrj3D2h0dISspnxBJcw74zN+uMr3hpnglqAeuAsmcw4WGjcBEddoizX1QwnSrZAmQcT9exRbk3+H51EGWeV/iys45WWxmPlD/0yh+iq7SBtny/Ebp3z2Tgp4ZmpZhDTdztwSzS7E9QQDdpGJk6vWv2yH+mu/0EbgS6Kda9RDCy8sPJC3p2yRDLqW7rY+dDoTKLjq1rcIm6XtAqjkGHMOVqsPWsm165GB7Gn1XSY59X4TqI6JwWtnXN/bps+VSS+4ecr2AJGcdIIrU9Ks6ejFm/udXnE8zPJuAvpew/0DHSsH5dzB0mugTCAzi1ymNXqenq9+irPXBJisPSWVLlwP8VA7EtdBU+/P8MBBGQEB/1oCCMjAIAgAGDAgOBBIR/OTgPFQr3/DhINxUc/fw4HgRUiKsH8eAD8uD6H8eAD81k4H8eAD84UCR/D+68mglAnI7RxLI7Zi6c2mpjrcBqvF3QV8YuYeSXLEMuagV7FdSoRzZTd9so5T5aijoDwrqGzzi7593QzZKI/nuTWCPXbQchW2/9WQx8oQ85GciBvhKf4CC3i/EDsGCzxYjnVk563Irsg+gnsHJM/VH79gi/gDOeeeDXQpibFIsX/GrvIp0+BX+H4C4EHpHcHqO+Nz3xV8jTxjQCJcGINz+AM+PEd4efQnsT9kLU9q5PFcsWcGUEAOcHd09mfxSeZU11HHE1mbyjj8hW0KXaK1EafzS1wByrDbNBmb6iIFwNsOKJGvMYFUqIJEmU98GaMILz3VHXs0AbyVtczXHEUuFR1rIft5DlkM8WeZZNuDBf+BMpkL0cAYieetYSZuAwjRqAJub0ooMb+CE5Io/v82u/kY2uATVMVf4Fl7O+UJ26r0f03G4kdEUHEziE9XNYA8dEKCx8ZtpQu3VanZKskCt2+ncMngPgtwtyG3ChIZqIZX+B/or+CEf7+GPoqlwt09S5wgP4FmWPfPFKRvGhG9SvNHC0QKeHrCISSFoiqO3SsxXUZg46j+F3/HAIGE6f3+fVfyMfVAQ64xf8C/F89dx5uFBh6ltL1IHlokg4dUx7FV+EU4TquPktOJOJC8AS28nI15T2CTyUssX/AFeh1NKx2b5JeiwF8gVxBc4QPMIBfXhA/wGACREGBB2D/ioRkDBPL64r5fmbkChf8HCMIH/Kk2b/AwDN/H2/wHf4ICRpf8LBdQ38cx038QVjP8SNZyQgv8Waw58ZQiriAs91zx/CWqofunEnPw3/DPvvzOuOnZs4/xy45hZHiwZSUf56CDZgD+EEKn+Ewn0vs4aMeOnUkRxkaD3DZes+PwZT0epYEQJVNErGIBvV9b+G3BThMAqz+GSgdNMVHuRkiOdk1MzXgL6wP4IGW3b+9mF7TL1q+WdpTuIFas4XuX/D3MNnWan+PLOQrA0n+BLEjrBk4BoIis1jKHFyEKnAaANQdVZjA4g1hol6e6g+cdQoD8TEVBI5TrSYCzgbTt4vbA3cV1DWA/8HCKQKyoQgFA0avPwGACMX8N2JFJ/IViQB8wN08IDBgVHMQ3eVBsrhcnU4k9Jx8kUaTq8wtxAZUwG/x9Ykfw8rtAeB/HyD1wf3/YhH+TgiRP+JbEiLjGB1/wsIle4/8MX6hnpofu8MeVcfwGwR1Qkkun/BW2TqNad/C8Fd/kINUHAK8lnlEG/h9+uwfQmQSOdUSD4yx5sAhgFwQDTQmZ1r92GP8BCMSZ6zDAgiv8Af1IRburLQs08W2S3HTjcKvIL9Wicr6sA0NqawnDXQHsncUcoV3bNEhGNZ0mRntdbp6TRwWCGERWNwNA/UsT8Vgm+CcQBobGazzPlu1Sekgu4KVuJbh63Ucst9/3hw8tJahDu1uMnz8XUonOBP9L9IrsETaRkMi9Nsa7I5RLU3m5lbzJnpIov+y15/AGgqzfBaqP1kq1Y4g3Mu9pi8Cs3RfgsR8gt/D7aoDOk2AL6KnIHlMeg9+vtYmXVaj7wYOtP20H8AAbBrGi8M73bqxv5ECmiiPjlMlQRhPtSNoFiTdnmbNyTAZNH2ZCPG910PMTTUsF1BNBfA/fyQbcEANK/IwgOLbYSK/UKUDryM1Igy8EWSyxyep+sLNInntXT1x2/FHJ/PIbe3DfMITPd5ipYdAixYJFgmalui7fHdWBaMCaD2srjr23d9+f4CBhT5jN4mq91OlvrKJ9lrOiAEY+4QRxpufsT77BhBWy4M11kzkHv4FFDwroYvTJnmkvTL1AgHYNPUaa5HZmYWXiUHZIDWUs5krR2L/FgMxCCSfIwgGHQOB/kgGYh/jnm2/h7K6Af8YBXPwxAfBJoZeLd88NxyhMF/X8PgT4h/BFGD1iV8lMq+78+85GsUyQCLqERQw7Cj69OQTvojAMd42Q2cn66Pp49M1OhDNfCuuYBc1IUuHEZanx2wQIZQgEbWJtBge33/50BN66j/HQJvWDlk/Dlof68IB0rYiXuVlcZ15Inqc7xPDixddnv/Wg3wKhQt3zqW79cwkthVkLr2iLleZBwnm4NB5WRJrtel9Dl0IPog96c09L9/j4E3r/gkEoGnbHenxenggFH0me24MJ6sw6mwUi9cUtsTReRzMhPmiZv4YvG4/+BGCv+BLxv+fUwTAP5bTEwYIBwEDBgkCBAD+crxuJm7pw/hwAfnan+HKJhiZ+kyP5cAH6KO4P4dAH8np/h0A/077+HAP+kjBb+E/7q6WcSKl9yN3xQOH9LvSWeTeyfq8kBwQZttoctPqAw+DjcQ1pY8y3ShNGuvPMxdr8Lyaj8jOlgnBLK9HFwzfAPBFmnWLdp+RBN5vx8H5NYtg2FnT114USGoWDq3K8sYPmWrMTDJXPZQ8zkZG1X2dd1sUVUl+Q2hv/z1VjOQnmCqvNL/h/+KYdQhA++ZJSrpTNWMjFEHdfLGv3Za5U3cKDbfrT/AQJG3YMogKF9g+yvPMz2rMyjhdihfsXLa/pYgsBNlHO9i644cCJff1ew2M4XyUkukU5T4UL0VsgBOULhnvvl4XDcIRKM5ic8/dmHI38AD0Ez5xb9sus0yxl7Y1OriJjEdbdNbVYQFlwOfwQ3UdY8sPt9qRMxcFose5k7oOpsiNk+rPAEc697Wg2vCnmn1TX/jYF8zBI50hAR0Lag/gMAEr4MBuLz/IQL5n/HFP6Qgv+Vga1lJpzge5fmJyLxc4UnK/PCl8Es8Av+KwB/+H6gn/HwPif/F/l4Bemt8w768ejik/x4fkqtC/fw6M/hxx/H1b5/kIMuw/w8DZ3Arl9Ch8gRfBgtxSwgFnqppBgUwLfzQEg/wmBGf3+Eg/NPhIP8CZRn+OgVH9Q/6kCo/+v+/AVH9Q/14Co/+v+ggFR//gNHvIZORObSEuRovzhRcQYwXHS+h3nCDqoYh4GzAv4AmnOt76X4FbzNF0bfYZGpFABDXd7MdLvsD9SH+CCRGt/62nijdNeGDE8vYE3gGRKY3Tk3A8zoTz/isIvJQZMs6dXD9E8Mw0sjCTypXCU2xf/jwJvQHrR/gVN/Q4pDz0ChHBkNys9SkLLYCEMMA3qIKY/4RxYzFv+AEBIlO2F8vJ5f6Xycy8oTDyv3kkJoH9mTwYf/gf/IV67+JU3/8kL/IaAG4zhKZ9GuOVgsoVIQg2zyleyv3ceOcjgCDbWDQjrzORr+X/8gBAf87Ag3X9//5H8n/5Ed5X8N/BTMvOM7/D5iKDAtk4AMN8FEo5BEXDY3+AhXxLxsIdNFVlIiM8XcpECkDvrVai73kjR86hxZs2sIjxwUE3MRQF0rK9+iixpW2I2L9LqhEjFIoL4qzA+/1IPoTVyXLvTIQQewurF1lM/JJHenRbvaDDq3RKBWxw2Xjw5XN7L3h7jJCai8Mk46R+wSDw9bh49o8ILFmUnzoDO9tqkpJjhU44dLinQRDI1+WdU+8hUxoc6EvZFLoikTYFpsKruZ41fcdzys+GHZH/wEH85CycSr5zf/w+BvoyKBDDTaZTIOyJTWL9uucrkv4D4MkxPTnEfeXFyNIl5sQUBBAgv3q+V5BhhuwW5TxMH7amILnYQCzVcZeO06a0ORkicmdQe034dIF+bwBzGWcBac0vOg0Lgj1cc19+vvAtpn5B0fnj5LCQtuP31zs9Hn7j+PquD4AT0dbln677btByhOreXrIir42G+jfIhxrxjYjDuN8eoj1Fc953x6CK1gxOpLW59YOkx/mdbMbRLmrP9JTucNLveahNg38AWS0pYMjL/ArAelt7RQNY1O0i8jJCw0f4P9nQNyH1v8HJUwppljCOG4gQlEnyOHE6yzKa1KPspjM5Jp09eBb/BN4pIf3/bVfyNbVAZI6if8CsM+n4tW51QId4icYSh/I5KQD6utQ3EzaasOrUqGyUGL47wvU9/p3mJgM7tY+t26txwIyZYDD5X/BFftYf1tedVhUlyHG+FkgD/sG0/IZvbQ9EhrtfxZnHCkUR2EbgLz+OGoH7Dm0Zjz7xn8eZxwAEOO9/ArZepzUgha01SO0OBtoWH1d5qS3ufjkzZMgVb3PN1iEvEGVzTTosQ+sAsgsQhSSGEJap9u14PwqfwRGNjf97YZTgAhk1ZPjj/AGH9PC6LXkC1VgPJgUS4fx4XXtjDwfwK5fq/CFFfmJv6sU3RjYGMKOcj1VYJBmURCMQT9wcRFyNpRd0STvRO+M+6F0KfwWGMDEkp6+fvuH8C30ID/EV9CPSlLg+BT/CN9D/CYCrB/+Agr3/ohklul1bO8ayHo46HtvcWkO5AS5yAYCLDJEzu9L+BNHXraxncZ0g8qJN2D8qnA8hydXey5oD/AQLGBL7iWr/u0VHtJ9UJ/CZ4UCfIVaf4IFMl9WEAwXFKP4DAAQxgxiOpZT+EzzeAuQok6a5cUwH+XCHAJ/j/0/A/48fL/+H/V8IoP4+zi/4/Wdv4Q2qf8TA5xH8ObuXmn/xRy4f8XgLf8grqwNiIzN+QGwNzlfcq3BRYs5OqrSI5etlHxZyykK5UoZhKukRmMmA/+lgT0n+KUiNWRceqiUmwE0Bvf4+MFP8gA9Tn+QhXuz/SQVuUpX13Lg0ujsYnXFZnAfoGpX6lbOn/FYA//D4YH/j4HFN/jIKf4P49RlAREc/MRFf+PXTID9gN/AaD+Ewx/+ExBP+EwQPgANSwANn4AMk4ADmaYXzrWJYkUt9+x6z0PxqI6J/KcXFrc79l8a8Vj7AZjpW1Y4pZUbqIKcHmusiTg7gJieSzlmsRc7ujTt+C90SWmHAAA4Gnt7IOw/gkKi+ADg/gMPpBgMA5RKKywg+AH+HEsT/IQWGb/D/GGKfw+KFgIB3AwGAeQGAcoIAegAQH+KAr84HCv4eF3AGXBQAg/l8DJ/h/21/gYDJBQBAAQBvgCA/xMKbc/wIBUQH+SAiBf+OgXL/HwIvl/AoLlEQKAUAIBM4BAf4mDmoP4EBUcD/DAkEv/C2bZYK1phwwjGXN+vfqBgYmCgJABgZoAAQV/Dq6eBgaQCgKABgZpfVAIRa1Dg/b/AEUeKg+HQNxOa6gd74QEAOAEBuQKAsH+QIMGaA4/+BQEYFASD/BgYM0AwH8B5avUAfwJsNoAC6ADBgMGDgBAA3AD/AjkwgAR8BJoEmg4AQATCBM/8CgCPOAXsBewOAEAGFAYgAAAMAEAGIAYmAAAQAEAGfgaQAAH8AYA00gbl/AhCGBwwHH/wKANWAHefwKAI3gHi/yfpD/wXijf8hkH/+HhDb0UfPCec3O4/abxICHxsPmNbaGl/hHAg7GNpzRSZON5/hxijwPhg+F6C5wiUY8DBmigqG1KwH3VpltYlIoH2uMrggLWAaB8nbnWq9mj0lXZsIYTCl7wyko5PUyn6HbiW/Bf1CMB1ugUNj6a80zVIRpjSizXMKBxhUqvHo0wAOwrfDNg9y4jla7wFUGk2omHtWbhTyo6Y1iK2twOB+E2CAQSEArJRxYKDopT/EwZQrDH9USkTAARBkahhQ06qkek9h7M/TVaxDlSg7Bgh22n+CgokSy1iLPgjun6243EaJdGx0dbtu8v4ojozvdVqSrXgqgPDht68383DzT7HKKb/NQ/Dq6iXJg4VMYNfkgUB4mwL5D1f+Di0lBSib/hS0lhuvrwx3bmLkPJf5ctJAD+BQGb/Twc9VPxbvKByB/FZSx/GJTx/LVpIAdNxuS2yOfxzzSARlzQ4BPn8e2kgaAYusHS6N615oRBor/BhIB/CYUJ/BlFTZKja7aJTxhocD8T40ipeB3pMOP+Igm1b+DVVX/GgYEuWfOAhNeRuMv4nBc/4D80WnuDv8EAvqX8DgXZQD+mRlRn8gIXTcKZB9X5llYRjnaTG9Ao6enP4rWif4fA8/4/bqfzGAPeF/EwA//D4Kn/FoT/zFnuqhIJ/LpYoFf4QDAFv4J6EePELRosGxv0hS2X4d3iCf30RP5lv4+7qKD/RwgUcf5CC01/4PBU/4kBY/4/MmEHUmQq3JbsB3CC0BUzIjXRyxoXB4ubnXHKc1elQW8QL8ATfKFmr5PeOCUx4a1pmjJsMKXTj/5GAj/AQmuM7qUAJAWuSjf8eehP+GhDIRNrIdF2m5swTLBHQdzesvwUq98RD+Pvoj+Cms3aqrfD+/8CgWuf415tzCyJ/C4LeFyLo4BorLfwk2G4Qp71Pp/AYPN0JADvX8GAT8f0HtWOi4wvhIq02iBc8eP1aKKe/wZbkY/aSb3VroESRQQS35KyQxkb2Af/DolX3/f+egtzcAARh9Zj/CAPZM3Jad8QqoFAN8LTtHAnXjWCZj23H8vJXnAf5CBctv4dF9/4Lcjg8VtOL+Tgx/+OoS3+BQH87nW78gel3AVRVyRjkBaKDH7/4WVwKrjXuPbnR/it1t/jIS/z4MhDQ5ECfeo66MhPzNtr2CJhTAueTKM/hyIp7tC0/wNBjPXFlH9/xDXbBo46UW3stEZ43fwU8HAaWwXI3/BBMZmSJ2UEKmegB/O4L//A4D+rPzG1f4OEwQf4/Bf/4xET8AIP4JV4APoDDtGYQRUXvoJa4+UTB5STh0NSIj+Hx2DgP89Aw2TB/Jb9b/JpBH/kgFP8oP8+A2HwB/HIN//DoIfhasc5Pwr26n/nwGbZ/izDn/jIi/Af4+Jo/5fh7KrzyAJ2tGBcanJgamv+JA+n/+AwBkxNzMi/w0Gj3nBmZmKbmpoZmBsaGhyZmBibmpF/hYIddMDVjYzY0NGMwaXBEzZKVE86NWJlYzEwMjN4eEZETzRrTv8PBD0RubmJycnCaG77/GMXvP4FmUPlAl0PPGTal+J+hI8NJcVFv9f4KDKAtT/gr6RoZX/FwXqp/B6H1/Cf1x/DUcf0alKIfwNN07wMaD/1ZyGIJM8XuiZ7axBQ1qPg/ECkJoYDAzEVnDOT9z/CgFV/Af52qfxF+dgZy2Ii+ExP8C/nfXBdPBGf4CBZqvxo3e8EhFF5tI3hxelfoTth/b5q0hAjtuXlw//GACarT//HgTVb/HwLp5/GNLAPCn8E6HUpVZzZ7j6l3gZQPhZtqDpF6T1wLGhjPcnMVqcI4dubKzhjqYJT2mPSbEHKFkKDa0duy03JZX8NZ9/Ac3avQYCXgcp/iYG1O/xIApDn8Pm/v+FheataRLOi9deWbv4JSVZhe1qB39CzE9KTcSnEAMqvWhcq/EPWiEkcxiQHCJgQ9mL+EwKb5Csw//IFApwdHeP8WBm7IVSf1EbhEH+VgdUn/Ggp0I/BLOLg4ENrlt/wsAH/ywbhJ1h1BVhZ5xumX4IktoQfOey5LR0iCjeBH1Z3TtjrGz7o8NIEXh2uiGJSwdWZpPfsGq/MuWHSpJU8OujQ6kKPd4jn8em4X8GEQX8JhaH8jG4TveEPcvPD/BKhpCK1YfShf/BNgd/CILeGUQwf8mt/Cfphp+Yxif+ChVef+DqO3/MQhc7/DyTd3noh/k4PnS/iM3Cqo2HAQAanNzmjoF9MfYwusQ9Qtx/n43CIP8fA6kP8EVKIQIUXQrxSb/ycGP/xvd4fwQA/gLLgQX8DlvyY4MHqLowDZCnlK0NtRhGjX4laEH81m4X8PlJ7KqHkjqRxjSeuHYMP0RySCalvKAf+PgYXOA/hEL4/h8Mf/mU9C/hEB/Av8UQOIT/Ftmd/iYHol/os2i/x8Dqvf5YEPJ/5lNoiD+NgYf/ggio/hkI/gcD3kzAR/z0D0QgTB/OxtF/E4n//BkNvZrXMABTzc3A5nWR2HwFfSghoS8z/H2gh/n4HSpqP8IAhDScFxmYHBkbf4kFjwjguMjk3Njf+DjeIzNTU1OTCNDc1NTQwNDMwMP4MN4DdkMTc5OTBhQUZOTmRLTjE6NM5YjhkNDU4c1dGRUc3ME/+ETeAyMDAwMTZ9/hzYfxbEuzDtWFMGm+27AFrOd3zU/gCAXPGf47YIKPv5D/xB3+B7/EsP4f/xDg1q4jtOeJbPFmPQ7zr0qxBJMOptFnnRXLgafG/NDyRCrhNsSvLg3GftG5s0jrg87y14fJ53+sDQkuQouyRlLO/4l/xP4D72aB2l1hiAmMLtuCQdbeat6QST82Vgek59U7xO5HjjnmLbxEgcrNDTtVCZibP+ORhGzPf2N+nP/BARNvd/+MCJtx1Hc0/YxK/kOtJwUCXiJG9cWbCnwgOAOz7TawxCks+SGnt65//ZQRNv1z3+3gibfUyv/4ARNvp5YRzBrk+j3iSEq+KKADqdqCxCqsulp2R1TNrL2cESKT8Dpe6/vczjf79g2ZNsWL59sbDi9dGiQam4ZrhqZbGr/n4Ik3xZIPKQhnPFK5AAKuSNQ/2rIUwS6Bi8oLw6wHgsutcX0DsieFeujUytReJBSIpj1YZ04YBj1HsHLvHyZNpcw7YS/z8EUbmRnlgydkciNsz1oawhAIaouCJoBTOK0at6TKqhs8qZjto0lB0FgpSspMGKw47vkQT/ANh8SD1V+E8nEgFbp3yHYFP8fgP6DyrmoznbhndJfLkEWHd0gMbninkORTn3cRoCQ/ohMq6sqNp1pxaFWbPQMqBfDV80nmW1mgFnlgGFJDR+Cw5XtH+SAhlKNY/11Yhv81Wo8JwbSSW4vCG7SRH5A06CDjL/xgRNvXUG4hbbQNmsRkN+PbPTbsYSls0Xjrfsx95EC51JWwaz/AwlJ/sQCJt+ue/28ETb+E3/0AIm364J/NUbF2YEvRxERBwag1Bf1nGnla69X6eufR/lqNSgHjcsSNVlxs3abjk/chpdyFGUgUv7MNfbW4RFBRV1olC/j8PktguxwwOBDlITg2MkJKtP/rCQ0fUl1L1B9GQevSMQaMrQE0YP/ryFaBY2oYTprS0BLKE2huimLLV1DFPopHNCXv/iT9S/xMKC0f4MCPstjIYQCf2d5/gMAEgYN+uLaUSw83J3HzNc9tWiQ5+8V7ztDsxcYgKQOXAEe/gzEq/xMKDdfwJ7nUe3V4ygJdDLd728SrD/gIYMG+o+7RCoGXAn4Rss43j08TPsN/h3/H1H+qf9/2/n7/fff5e/3/b+Rf9+zg5vJuWOw73hXb6UuryYXiHBCkxuX0Kn/eX+ABFJ7L3P+AMvPE76+TIej9S5HloFfOPWrD+5C4QQKC/jtgg5WhAKoB9r+PS9ACV/4Hi9DP/x8EzAwJ5vazsnw60qCJvVtmkmujNoovJhiAYQ1FZDhpmQq5Eni7lo0CmKHZqJG+e/IViY+n9Zsp9ndry2qpSQPmz+Z/j4H9dHP4JqMrtJ7ZpRzDeL6zX+ABWviI7vI6QcDYED8JCylLaW4Qd15bUAif8WBMDcd/kYJgb84c/tK/+af7QDjiMs3KHvMB/zb98CVsC3cadzoZ01nfg/gAZeVTkfS+D9Hgv+Yf7T//wP77/Rf9pGbPei/wUGGFr/EAI+DQ4e+huOGOeS7lpg4+lypaIoPjRkSeomLp44oJ0p9pou8EbdW8WmXrr19/+Ahle+NjpCjfewyDlYhkwZ6d2ki4GY2VkRkE6JkTMZajiq40Wb6NGxEMCwvEc8nHRgCqsEWlYWcwMqdo1Q/Qv08xFVkDU1blVHWyvP9Y3Qkh2awfK5Wax/biZFzwhmSW8f4A3rBd2zUXYXBxjICMCP4A3uNv9nKXqsEmEbPOHxjwlfJOVqKp70G+9XTFjllGieRshX9VQiydluLYDMUq/h/+0X1stlcpnWyIVXKN+My9nmcLZPt8zaiKGTQb3Fwk11HsPASLMhlDt87fDTj7DbqRNimapZnZAk6DZ2/Fvm+w9ofb/YCFlLG9jJ+WA80Du5t7wqlhDCSs22E4dTWK0eVPvacUzHwXS/kBMqyRek83pPatG/f4AJ2QdxhknW4Qkd8tpD3D5WmaFXpH2g/wBDObU9gCqaqkDHrzmk/7DrFsNOpXydwpDRQnxzUPj1DLC7iOsgsnB21OKG7OdNPJZT5H10U5PRAZKRYAACNPb0d/jboWtuII9MwgJAjfw2e0ESixZlgSzGMNcOxfQ45kCRinK5iL/wAUpP5VJzm/jCLtANRHLC2ZcgXyBW/+CgbN2/woC4SV/8K3f3uUlBWRVH+vgXCMA/gUBm//8C4R/wcyLXp45COXzP8dXfxuv8fmH/6eBcI/4TCfBDbgD/WqijjGJNamjlxxuJOs9eZpeQR1i+KgLgsHH0/hzo3McByG2v1xbo9c6ZZsDb0YUOwwC87TL/Bgvq3FQgErvmb/AYACa0Fp2P+Tgx37/EwWPH/DYx0W30offwqBC/IWpf4KBmq7+By+eDgwxBuP8WBZIgWS/5eCyRP8vA6U/8BgVH8H2Sd/nwL3vCkD+KwL/+L/actWByM/hwAfIlwt/zwEjELM72X+SAujf+OCGgJBP6hiC/8fCZPoR/k4NYB/hQGzz4fBw/5DiC3L7ym/qaoobRCtn1FMCvA9hMxUlVIaGezF3IuxUjqrIlVCH4Yu7vEnNmnTz6oaW++CBOMCubKS30vetxaPfT0+Rv+Slfz/GAV5r/Il+cAABWJ2n8GXXxfOQIv/w4C/rjYK/4sGb9P5liC+3wWv89B2Pn8gRBf8fJfAA4H8Pi0f8GmRAF8fycF//x4h+fwGA/t36IN/8ENBWslnIsGcGVT8DrqvxfG8a7x83q/zXD1/4wDlp/4m9oP8RAPRx/RUFX/J44H/N9yd/F4l//II9H/Tdyd/Egd//L6BDVP+RgtpI3NjAuOTAzNjYy/hG5T/gMAbODc0MzYy/gOgczc2MTAzMTf+DoGwzMDE1MTRjNnNXMUxWb2hSOjEyGRjMGJjYWRuSkUzN1pl/hWBsNDL+Mrk0AUsD+GLTegP4FtN/tX/9oD1HQn8E2m+GeLIp6KsseRhvkSMbkClqomj/AH0WW1x25EBEHpURExYPXsH8Or9ql/VK/b7/z8vxsN/Ly/b7/yKv29p0PVtpXaaMm+b5Flr74Xn7Op9iHrGI3NgKFMUnTJMSH/+KhGnaOgsta/yAFtEMn+hAtogg88iBugYD/owFrj/xIFtEDQCDA4ECAIQBhP+rAYlr/HQDqXGAlw/w4APpJNs/w4AfqlUvfx6AP+X8OVYcUrU9N/A4BfrUY4fw4Bvrcl/D43uCQC7RdiyHyVKMxd8U3XB5c3tA4Znv+UaGGit5H3zXFX6qY+gaZe+qLevfA2tw6M2AOa6gdFzonlPb9m2XTihZjdmvSLNsizrrHcjxbmDfaQH8APizQmu34c7QAa/XEkFYsY3IzWtutCCIx3S7Z4SCQD8JfQ1eQTpxZgz15chWq9W7v+f8QRyoLdmaXLPN6klAPn+epFOjbA+hWpwnOCpodrEPG7kRtGTCnj+wDNu6lAI6Wz6buf+AgwJ4JSTGvLantGHf71JvxxdG9fMgiBhlwhcrBWpV1L0IhyDBUf0PAXF0IsQXZZld+G/JPoSXT9Ij1D8DPHJt3iOU8BulPB5orH9/wEDr7upAvgU6hNf8E5DZ4/K3Vk6oafC0ltdim/DY0Bs3TY836EoPIjVPvAWKwsz2fw0l6RkIBMjt1fwGACecFlB38T5jn8UpegPi/bfS8uU3toE/wsl6BEl/Aab3G/wUl6CHbv/wsl6BDL/F6XKf5SChO/4JAqP4+WRARva3g2/jlo8B4D/VQUo9/EoG8g8erDv8eJegi3Vn8fJen8PTBoMieWITfw4laHd6cwXzPgKHQNcR/DsfB8838faUH8fRtoFQf66CGzAEg/it2cP8yBSb/8GUGO5yBCNlXXbIMlHfrToika7cfwA0zB/n8zQAfgJOISWZGRrMJwao12Da2AsmIZzyOZVL/L0TjQfwvPLiB/rwLx2/mwJf/jICf/gyqRZzPrLnWxYl8f5fGKNLLK6vaUQ0Z/l8A//geB9AQInEJDopbiB0hH0GNd7qIXh4i2JUSRS/7LCX/8gBWEX8JJaju7u7uIiP8Kw0YPkCH3/wKAP/4KCkUvoAqe14fzeT22aL/gB6APm5ufbD5KL+HF5Pd/wUCy2+69mY3FOzZRjNwXhmCTdMsCjBP6Ifs7xhiA4QZ6UUKJLErUsup3prpNpbSOU3Pb8YzOd9czfwER1OL/tAKMnaQF2M07fE/PXNhITUw4rPKSF0Zb+AKgDZHXabMmpmpblv++Isv9kBRk4D/f8dT/REdT6Nn+FBAFiEggwSv8gBRI5Joaetp8WZJfDYHnBV+10drbdRhezQVziyodvwo5V2U7pwTVbOxv3U3ADxHEPpevTD3Rxr5cHTOwnUQ7DPzHNmldMw4eyumhOh3B7n3SAHZatmsawNQNHvKI8i9sYBS2NjYQXuY5QE0Y/4CG6drhGda/i8AWVhUAkJA6Yj648vxzpC5dSlbPTNbU+BtLFBgrg+8wmc1wHb0r2ShkKuc/gCzMK8kwdlmrTD6vLY5JNpABevh54mMSRMS2x+GUpZQ1q+0KbdNuP4fjqTmq313Ne7ZJXE5XVziPQKHJ+hcDvb4CsF7ISPxLPaTL0014Il8x4TeNwj7FPBlO8sS1cdLGDIYv8BCepfSTWdOGM6Ez7xPNB0V6v8A7p+ugJCQAQ5oUBwJhsgv7HsdisnhWmWZQQOKaH2vzjjtT6Jj2KKVQlXcQhzLW9rYJQ69xWwh/uWdoU1lOwwWiL2U0WD+5lI0EfWoy5QKgcZLli/Pk4/kosK8/hz3zDucFfPx/AEZCR/sXxtNC8eevEWacAm7oJ7+QwmWByVCkLzTnWJPo8hed8x2xBB/vp61/hAI2f8v/2BIcanvsrLY2MGHvnsGDM3Z3n0Bx29U38eb26Nq1/8C2jP0/LMrrZWYBaPoTfCItu+aky7ewec75Yw5HYhYnDkhKUT8UYzebPzgIKDmmcGR8w63ssh2BnyyrfkGlIJTKoQDRzvB/wQIMFf+JhAVH/BwcHYDMn+DhAVf+OeuQF/x8ISqkH8ulQYIj/+GhTngFDQIz6L5qx3ElMS/15Q1lKL2/YJnrM/jhQjH1XJkFiIINXEIHpkdAAXQkLkpx6acFLiZpY2YO3MW3KRmvAZYs6ERuUAA7YsItJPk3G8ta/IHZhGNjKwjBzOwI+wEMh8Oi4pTE6KSqyAfNhGCzgadw2Guxu2wkNGNRGT2MPXBBPEt2pMplngEEKP3Izubgz+4AVqkqt1vROKk9OW2OP8BB0VTOZpP05WikXPJeVC55GULXplQyL6Yv0+EzaHyB0U3KbKhAbo+75mBjJYCC3F7zstQUXD4zfkonzwA1SKuFgUiPaAyZS6STv+Ahn34m6YIkiJg+awPLLhq1Ioi/Fd+8vg3yir5Zn3pXeB3V82T4w4BBk9NITtMGNfIYgnZOpmHFv8wtdtOZsgWmQXy+ibLIreyT47Fy/+AAui/gDGmv4fDIf4AxqwMPREBYYmmAXYeynvpkI5HBkhZCBMAB1RFC0SzvHKrSC9ELinz8kysUaCUySBx8NjL8QzQr7feCrxXtw2GB/FyrsqqZ4Rop+FtT7K87cRWwMYTYyieN4NgAJSDnIdxo1h1mtq7/NNwgAdEtj3OC9QoPhV4wGJ4SQ0rzVDdlYsxcnQbO31ttCNorCzbzj8QgpiIyMYgElyF2QALAAqcC4AuQk/2JSXBv+fzKdSXBfV2cb71ELTbMnOGnO6t24lZGvEro/xs8/5Y0oPoWVEghQwnR61uqAEPOoxFu0SGQACylqxl8vSy72W3JDh0PS8Fbofe0m+oJj9A9GrGAzvWMZCB2miqfqzgZXwzUtwJeWYwe5I1qfgGWeH4RSW0c18e2AENPMGyJszQt4G92XxW7rE5Od1YVDSFeAF9ghyfgjrxuUOVGqwFkyi1x9EbgLNsW44OW+iEnfmP8BA6cPMvJ5UkfgvV0YCHFVQFbbNeuiXc+QU2CiSr9fQygKsifmoQ13gR+KucbTEt143pZw7StgWZ2zoPTEoIi0240lGX0juPTrZxqw/wCM+uoCH8GeOsm3xg1H38+Ahw7dmzHNKcGv9xOeEMQTKOSrXNA+jfXsE55Cr/NMzOdTfszE1PdckqhMoK4xVsRylNr+AM4nY6iDvpwAQLnBeUU6VJ6+qyoTN3IwNuf2nQZTg2RSBUB65ickvv8BDYv4FVFl8aZdkz4rGJBEJtQn7+n2ycP8AZ4qWpbiEjkfixQpbxP8B6CX1/lIOpFDA/g0f8JB1IoiOxbWAABOIcVIE9Jl3ZDEZNDtPP6lrxrISgsvAIAqgCTygym6r9/ADEVstA98XkEVkYl7+IZAEmhgmUL9zMeVf8CxwrZLfwPwRn+Dha4D+BwJP+CAJsFdP4GlLgHjDe+Qdxy0xqe7SN0LhbfdOXMnXodbl9uEKvFR2A0Ej67XRlhQuvu537KbuIA5mtQTOculKavH8uPkmOQHO+LzCg3LTxTJ3ncSRiEmTqtVtWJ3WKUFBg0anRCL3vwA/lxOh1GjJLjFBPAf82jEK+9vljaYdwphrsvJNHCzRl/TA0aJ1AVd+rHiE0D1GY6t5iwewt4OII9+/Etw1I63+SweQxuLxwnodYRWVnORwmy8d8iC5lcNe/BUve74miPiIidr6E5ljigQlPWMwoAGmPyM3N3qWRK3ej/MtjbPA/aY2nds4TGILZQ8qI0vOEJR07/m7bruaSrzsOLbOWfsYNwt2bwDEkCSnkr+ILYDziQF5/jbnXae3d6P4Ib/OwKyj6tcjKsWljKHj7wIVQoIQ7nntvkRW5hW5TL8NfKTjsmHuStScumvcc6DiPq5L4RQYNzDY/gghgov72IYPc4c7FJv32pI2dqpIkGi2++/SWPf48R11+/Af4E1euwOs31hgK1lxSUBORN+A6yAWJ3EEARp4dwi6V9Quf6fHvtzqj6UcCa4SaWTe7N4WEwhCoPDrG/ga10/w0ETMzv8LWuiKP0aywfzDa6fwIAzf3/ayn8c2unqGwMks5v8d2uk8IfwAWZyf8n2un8JhPg/xM6H+ToRU/EvwCE5+SrfQPyDYYuBSGPE0RWIwku2M/+OBIkB/gUL0EpB4sVhliFaol8S7SmqGal+qp8fdSxrzPj0TO5rNHSp038xp61fxDVmSJ5x7IkN/gAwLHFWNB6f/BwOLJARfPwgGXjjr/AYAI+wYJLL3+LBcxYDxIKtVsv/RAHfm/xzPpfwYaDEyCHmNCbg7SVmOg0TN5lwkD0rHT/HwsO/w/LZAn+Dhv6f/FQl6H24onsd5T/D3kE8fw/NrV3GsY/RVJu5XilE4FIeSq8N+xuky+nrREQmUmyt1xj/hgGC9KWL4c0yVGqfYIVdNKpwfZwnf1V7/j6c2DRZLYHGQZWlrLrP2ub6/jrWrctP33E1vPe93BHnLAQkkHTf9u1H/nsGfDvKyZWWyfDv+PwZ+yVnlAPQfv4cJq0zJV+gEURXhRiROuqhrzLA6PohA6f8MB5eMuFIpy6FyPmyzEGnz/APdxD9QAnOix/H4M+KiyWwO8bK0t1n7X19/xdul/4CA0YLsILZaKRan7b7iNBLc+xH3riD3gFz5/Y4cXkKcQrVSda5hzdKQSXXqN7HcUMvO/ZxBNjGQqZ1AvH/DQx9wQYCIuH+Khj7kkG/bOYbwAgMD4JNykRpG8KviCkFH0xHmolZgYPP/Bw4MDl/wcAk/ws+P/4eBlb+S3bhvxs7TsrMe4sJjQV4CFOCg/hW7oAYOEf7A0dWvA/gOKE/hQT/AEz/gUT/oP4KE/wJWkJSLFynNEInjd9XrqUpM/SDi1LJpseAuf4dD+wABhaeQ/OO//j8P7tB/omKpjUxyDZyuIojeXIXEPFG6X+coOO8tq9x1hYX+DENnehM2c9C1cfMxNqMo96jctxG+Lor+QAltLYHGKjZXP19rm+vv+F/M/sCT7LXBnJSy39j2qLANoqxDPPGKVm5bjtooTbkF5hVYEWXPdZxrD3QOG/aXtIh4rbQ540+tGf8IA5t4F/eyasugMrzEwG2sYeWWm2zORby452k0L/Hjmapjiv/ApXqU5XBb9fL62bPRmYd0ouPGikn27DrKKOm6KiU0rD2kGQ+6Ut4IFAMnz1dJ9PBfQtOgVNsSBcf4KBX+lT/bQK/12FtDMCiuqDGiJOqXtEygDUOlTsMeyVsvSEWvRUWR/B/zD3PCsWWCHP8B7I8dCASiT+X8BgAn3Bg3CZ/wpFlw/+DBAC9uQQF/i+LS/izyX/gYBm/j/XHb25lSW5jaFY2/jTXBwMg/ywCfH/xWsL/4kD1nP4QuAbM3tv45grp8pXNXwAD+Xdcf/EQhPImHmZMYAiB3zlefpTw6SKmTGjTcf+VgUCT/h9db/h8Bf/izRn/kQGDAAD+PYlIA/hq0rfQAAGkAQf4TgBtDjDbACoP4fAuf8EDso38Jvwv8JwLP+MA6tVPcvqfP4YEev4FQFzHnWJXyLqBrh3/l4xLWbN+XUFYXMyaZke+mx/qwYt0USUoiSNrGsPMtJRcwrMEIq99XzgeiLqG/4FuyVIIlMEY0UUW+UekfoAxn80//d/erXd3hRaNvFhqID+CKC3sNb+2/Ks2XOfpQ/Ge7DnuoFkOM0ZskoYCATAGYKy3JV1ZP4UFhvCC5whV/yME4JfqEoKBGMQHzDN7xQ//CBcsB2w/glN7PS8bRQLvdI1elE2qqUcO0DNIwNbC4PNajivhR5PVmvS2/4dO9qH+CzvaOGv6oO9gc0/y/lDW6QLq588MJyGggqnYVVf57O9gABhbHxY3X4/hyO9rX+H9c2c9TvNuBxOkeCglPTK661HmhXfHloMFEcSIjI46YE0P8MDkHXHz42gGZT/ptDWTKwXPuCdqo9g5fwhGqzjGyP8Bne2vtfx2d7ZUp+9ubRqM+qnu1bPL2nlRRK1Alfz0DPv0DYfi+ZWcB/j4E0Zvv4f3bYF3FgiKGH4ILP3P11jvYqKYHjuGit+/gFljDjBISTxzlA7/DAzvxwfBTsYCOIUzIMuXRJLE0Kn4mzP8gnezA4u8bK3X8LmTR/EJ3sgKMSV4+48YAfwzSeBq+dI/uUTcDoXhXjZ5HvpPPHGFd/gIa5r/xcEsjeUIBf4CCWrgD/GwSyN/owdLD84Zo1zEt/yMGiu/w3SMi38P0nX+Kg9Uv+BwQPz/iAEDz9iBLpTV/DNk2+yUVUbW9/DtQ5DaP+SAl0rKQudfD+EA3ZWYDwy3SI38fa0lbH4bYICh4QCb9zygwGB9P8gB8Yb/CUpyKjda4fwBDDr38Gyqf8JgNP8EtLqJD1ixwQc8SjdMh6eGBNoY3t7FrqtDlUpoxse2Tgbq4MsiOvFUElDMcaL0u5xxNzVrvnvDDf8gqpBBRPCAT1/gIWc2gwY8qs/FAvSBDArp97zAAXIKSObdWBfB5Nm4LAjOKc3RSDopTRUh/XOa0cF2hxOl+MLwmxvhiv+lgzXn+EwLH+HZmU2Dj+PpuUXYmZjYjVhZTA5M2Q5ZTcyZmU0MTFhZDj/dAM8X/y+QO7FuAZ2nyQ2Hifhj+Bh2ayHDSlr9/FYA//Jsz4/wKEzf66B+jwCEDormf/5LEd8Buu+/ZQDK4VCTVGQG4qM1764m/huwWA34R1v4opKC/1Bu2XKVgO+L8EQpYUL6xuuX3bf4rAH/8gDl8/+WAh6QFcX3Miv4/Bn/4fqJC6PwveJURdCjVIRUEcwSEs+zOsxqUvzUqA467+HQIk2DgG/hMA0/hMTQ/hM+2/hMEK/hO6xZmfxF7k28XqmLQ80yKK0NsEaYXmVAQFBf8ICuO/8Ev9yRaJR4ZhYjOweey+VG5MK6ixJ+y3aDzgdhLNLNfSwBiDKR/fShMJX35oBLXlW4WZdPc/b+kL/gwHrSJP8MA9aTp/Cry92/jD4mYYf7AB60v4FAZv//A9aX8cvL3Xibik6D38dvL0LPX3d/iGB60hRth0DopVn/DAWgH/CYUJ/rIHrS6DkWtZ6J/ykA9aXXjtPcH/4IEbTf4HAuwtPz/4AnrqWpJcw9cHvKJ4VZGJbiTUuvrW6/izwH/h8Dz/80D1pdByI2ZSof/+B6xL/TQPWl/B4Kn/EgLH/oAG0PJl6p1vndknLqqI8Zb4SJHhEGEZHXga3KZfYFo3lnS1YKlZ8Ow/iwm2DM6/sTEUH23U/R8XKZcmg/4Aq34nZGa375SD/qQPWluTk7Os/+BQVHS89WFyOtg/hcFvC6vKEXsj/xMFP/4edkumv/swHK0vDs//8D1pf4YB60jgN3/5ODH/465rv4FAf//qA9aQpcLnAfwR9/GVu0TlOOd3+8getL+BgH///wPWl//4HrQv45Bv/4dEPwtwCJZAWusn/7kCZB/xIKP/y/ywVf+SAHrSNTgxMv8VA9aRuYGhx/h4HrTNDA5ODc5MzE4NjM4Mk1NTUx/wwD1pmRucmRgcMhm5mJikrBsxmZ0yHBwymbBnJmkrKcjObixIn+KAbPIxODj/MQJkLqfwLb5qR/CstNVhFeuNeiARlzrXZR/wBt2Ku1TzsIFg6fvWPKDnNmu801e/PNjxqLhaMNYxWEnM10yycYAXyC/4IFtn1VIhAJh5iW/gMAEnYMCGrb/IA3o7/DUwD81lvCDJfPeHkqXZtBoMLbxkASMMIW+sSXANYro8kq1b/joFLwgqRW/h/tu89bCG+7T28picMVju8iwPbAw+YzluyhkuBZwN7ne0mWi5WiYP6Dqwp7/H0CR/Hv52Awv1/DuMv08H8PAr6uNsSOOtNm+y4lLczePAj0TErP0FpHz3ZhtNnnC/vpc8R/H4C/6lWMLEKn0E9jEszBLUNcraYDaQrYThz6mD9HzWjKCUdTFNQ1djMwBa4u1ZUDcjtf4CF3MsTMMqa6k9Wd80JOuR0uwf+Hy2xqrzB8dt9F2GN3SxeJzWLN7QPv07+h7AIecs8wnP9lAz/Vw5B3/EQWuBfxkArgf8Ng37T29YcAACAAX+rBqXr+MwFcG/iL5QAv8yDnhH+Rhz2D+MwFcB/IgCL/41ArwKxKAGFU7eIJZCi74hTI0qHifuFP1HSFUYUclbPj3aZpP/HAU3b2K30Up31msnjFIOvPD4SPr3d0QztYaXC61EPlBBJYJzo7ylEdpnIRdde9iYlFoLqe/RfI5x/4CHBy/4HqPW7+9vZREGheSw9nLf2ox0jUVFztlYVGU/89BlmkDrs38C6mbEgb1w0eMbn3Q9W7ZrFk0rbxAYQta0polbf4CHB+JzNxlfw63Gd5mfJcqhkuSPF2U/+AAC0NTlMQxry/wcNf2gQjCAWRdWv8BgAl9BgT4zyh94exdHHXwxtiZ52CEemimcAbs10BcggI490rgf78Dp4kA/iv23/hMD0/2sFCW/4iCa9pph/14HURCZqRbwot/uuGoIoSFioIXai4XE7WJSXxlReGn0s3gCwDEb5AT0+I2UXocASnG2JbcZmLH+iUGJvvZS2uMxfmWOMeL/AAdNSZCGlWNIhwH89rljNpV0Na/6oVk84VuNFPGC9G2CwkAWbIARV1CXRZzxXGeB+44JZrwYnuYMLOoHnXgmCNxa6ys0Cxu+arSmXvpWjOkVJijs6Gdt9tIKwc/fkZMuINPSF0Su4NYsFZ9s1Og6g2L42PEFbZPmKt+5JcDgoctbMtrrICt/gIew9V9DxVs0CgsITOzxDpB931FggFF506E9yXhErEpApsNSg48YUpfN7yarqzWaLK4Z7A4ftifeyqHb8ABqjVIkADN5D7i2sAZX5yC2jTi9YVzZlMG+mwQOSPqSYPFyH1E/T3/KR/V+AqhHKj5+MToc7u1Tysso24Zk4OpMS648jZQF9jXi866P9T7U5/YqxcFAkvPmfv0+OhIH8+RjzOEqEQeF3uLjJdCCohWhsA5PQDGSHWEzvK0pJEsq3HSuFkiBQVQNq5aK2dZQra7iAIwKxXQAUkpVfuD8yu8oHKYEVW4bigSiujlifbIsqzvVeqyO8FjeqWHsfTwbgJycUVmE6i6YXM/H1GkKLUO/gsc2AmkIAtJvTUyFNWQKkPsUrE1AMx7ISWFgjDZ20qF39uQ92hYHEaMnjB8HFhmUiURm0R4XEAn7Cz+Af0F/POy33TH/AQbPzkZWpP9RpghCJ/wALABtySKardTRdZsyGrjzX7vmMR4LXYlHzNQHeKA1boUxN4RFIebqOlTpUhOnG6dsNL/+Ag63IFg6MNakhw2j+ABOYUZYmoo22t7ND+BeaNnk5dg0NSnVGUDJA8wCuMH2gKvEJV69HK4LO7Mb01dPYVyKtxmjIrZYrkmseCUw19IT4YqXq/gbI2D7T/BQ00wR4aMAQYDDUH8T5GxEoH4Q2ypTlt3XxWNqafnPbLYc/4sC8Wx+DVN7TCkNA/gHw62/kh5masLQ+CECYAWlk8AkqpNSO2uNrQxSunzSfAbTxl/9M9vvD8Wbs92FJib+ARdHE9VAka53/Bwb+0j0D/koOvUZP9CAkVgHysaaXnYLyNUxyoWaBnnQ3q8KuwxZR/ExYXvlebYTglb/nwOvUoP8iCm/n+jg563//wdepPxYh1/4qDr1JVr7v8KCDPo9uw38MbvZq1tXV5k0F/BoCf0OFTkf+FB1Ivi0nyh/h8sdVrIoM+z0ilmizkCmQyTbMbeKEqPN148/gAm6cbiYFrd3nEm0AxGtn/qcl7Y5aHIcwUCnRUulnih6Y51wgsVlSgokJgLCRCsmSBrbOZV04rvM4zt73W4VIlD+64MpYjWWdvTvkuophoW/sNYM75zN5PFwzzRkCKTM+AJvr4EIMpxIKm8xex4dLJ4WOKjKUCKZUwULG9+v9Ad4uAybpL2GAs2zs9EcyG1W6Hz/AQqs0/FNjyWGtjP2PYTKrSg7so0fY1K0HNaon/5ADP5kigQT76CKlyPUvnqmkjHg/M83FA65KnZ5FeXg4kcGqJhFdGexQMp2o6MKzR/Db2NYSMZL0qNLyMl3vuyohk0pBQvYKv1z2bRbbkWlhvTpUYhED7UeQOwyUU9ygLCQ+F4TojDl6x6HDkkJarBO0CUCJolMxBksokW35+rlIcNsXyC9CN7o0lmCPBipRjcaB1ojKchli8h6GA6JZyKHCF20dP8MNvo5GsZyFGgHGZgf6/sZn8PxL1gMBo15BASbR2Dm/gR/IX+zRVFfJSzhudPsGNUkPCy0P2z4hcz7a2kZffMK3O9aS/4oVJKINTamnf+AC3M753f3KCwULYW1/wcGItjU/gIJi9/yQHJVUn+gg5KqBLf8+ByVX+RAxDv/OwaCsBAB/rwOSqgP4+VusD+lxK//CwzEH/C4K4Z3FN/jASTHZENl3OkwX7+G6s4Ep+zQ/n4YOAAYqBEiM499D92ZwY9MBjJVQQPgNCpUvi6s7Rkwu2d/KQkWB/t1xNE2dbnLz4DSSLZ+bwWIpljz77L7kenrl4ZS3Aa5sGl5c+MvkOKDWDe18gh7Qkv9+9WJW8/gA42KysoRbmItMKL4NFEUzeusRkj+AnWtOn4sk2Zs5kv4MulM+S7OVviumKo4ZniIargM0Qk5BI8lViHageuy/Sz0qVnOY4kakhtoECuueNgWcO1QVWzCTmCePwYelIgEHjv2PdMY5irytP4gF7wEV0KkmjOiEiHmJQYFkYyu0w57hHGrdaDfnuiq288eNudflRwMI56eeQMM7erLLyuBVIeEZ9bTSo/TDdJmaIzcvi6DZCCfSFXUXqq9VJM8Iy6Ya5rPr0B79gwX3wZcUfc+ioo8MiT4p8rak8Lax3cquoLbK1PNEz1Bo/wEO9TfPrASK6DqHsKGz/AB6I0AI+JwjqbEATmL2qSuatRuBnfL7s8uoKALE2+amzjJy90GipbFDoGDm7KyrUYZp8sjE/KuVrikQhX8C9ZcG1Za50YeCHmgxpQyKiytDOoWNkwKgREHxOzmAKJWOCJuWYzrvMFj4JQUdfDZaDBscyW5I2f+Cg4RwA4QCVonZ/gMAE4oMBS2OU/hM3y/yoPgJ/42FYCf4F7ceHqUMe7dRhlJaxVu5G9wOQlkS4dQLVIB+jQhUYNJETj/bBM6nAv4ARKga0iHj41G+1Y484WtRf3P8GAibpF/kGByUtV7eYLldv9gA5KX8CgM3//gclL+OW9rTj4owa0I/jlDtAevvUno0/08DkpfwmE+TxjZfW4SnZfS7dvaxMDn/0LB1p4CQ1taf48rDzrRYV/jgGFqb/4F+bIvPcTrKI2dSosCQtm5SMB9D7kg0Chldpvw4WNjJsisYrJnwd60AnhivQdc1cadUGx0DRew4wC+QP3/gQOh/gciuCIMCVydlP8TCzAcOaZyRMXFSAXIHiCmhH5X8eu+H8GuOH8JgEz/CDdCwxABtiKdvwLSFoy1a44D/LHegaN9Pqi8/Qfwsjiy9QYdCE/wY+IMjoMr6y+/jjCsTn5IjEF9m8vHXnz6uDtAP4GPB63/PgX8AFqjk6ubp/nAhfh/i1OD/lER1P5aI9P4lCP+D+LxDX+EwyNPvQcPKUa6Nqxkw+H6rvjINghDJEARfnfJ94xlTxg6wj/IAX6j2TJoi3/QXu7NIQxKGQ3EAcHRaVDyb/vcfwCZpQSUIIYkQygF/Co+q3QXOBAJCATFhJn8BgAirBahl/MI+pONzSqKtKjZaf+uV0AgQRpFKqafvw7eWhImp5uBgIlz9Hf4qGObKG7hAJ/Z4DzgPycF/lW2LdBfdZf5bmFbq1v7YDl2ayGTfQOoomwgPOCey6i5qT7x4N0SYXMBtw7CgHPqgk93ToUUFgXzSBbosLT7nOH3xhzBCyro4CHM6JMGNdmb9C2WD+BPJ1h/gggOo/+GfJ3cFRVy/XGA/gX1uqPquG5ZpbZaQyI+phyYBInmnOcrCYIFwRi7unKHRWRMAf8cBdWEj/ezPFUotO4A5DVJunxCmGb4I1tLwhO3l/Ha/kChi78/wL9/Vg+epCQZy0C30lnGHdi998rhBG81K3hFRK10pE0imSteX8JhIF/hAfr4YWqm/gMAEr/5lK5LDjTVZ1dVKZBY81OHAjc8gpysSDQ0NtECCvuyb2ZCMWPX+GPv+i/gQrd/90HSDkB/hIEBus5IjZWjgnPh7nNCW+9rlhXV/D1XChW1/0CFlN8wBjXXPAZzlikkh1vUy9Lgsi7WPIXOysLk/wYI7wA0TQgGgTzb/AYAI7QYH2Mv+chHeAA/hwfRae4Iy/8fBCC9mjBzuFzCHB2nQkplRqi3wxmdOkaN0hQdl1yB3ZMAIGGaHrVhWnwMVUwlG9LuhHSEMmCX5d+I6CuBZYI00/EMP+PgYbEcwP4T4c/jtggoohAJvXIj/PQSJcBzf8PAf3zDj0IO5rKAY/1Lpqe0QbgvZB/T9W9+TOf9o3mEBhJFRC0Gptrag3aD5YCpUyhiG6HTCLz4tJHWhL4uCCU3FH4MVcSMfrB1Y8f5QCRLuy/B6NCZY0HpLhlnFmhyf8BBjLCS84kSztK23aG/tQasdpVqDRxRuXTAgLhEiLOsAJEqqJWNqkll/8DBGAShf6QEgEQD+HgjDN/h8IwrMtIcxUkMJJZx4vk1qelCQzWZtjUfDizTWtYW9OeyRCG6ZLiZueyLmTtsmki2E4pQTY+pQro4EwdBHzmhw+6P+H2QgbwExFGlZxZz7mtlLKO5eDAavlPRxCwL5Af8DCZNgJCAXE8wp/AYAJOwYHylkoIbFxtUTaYOkTAHwgjvvNOAklbCEOD0bTxIjAAFyA5jBPUED/Bn8b/jwdbf/hMB5/isAf/ywB9Y38DAU38cZ/Q9FZuc5T23/pYMyS/hPQu/x4Goqc/8lgHzn/jFp3/wQNYyv7ngb2ANInI1yg9+vcvAYbOz7ZBi3zn1cl2J0IBLo9bfwGTDEg21zFFsCPs6DqiIb/0IbFmu0QiQ4F7zycQk37/BRMPix9Pvi5XoFyAkIcQk4+0M6V5vbDAgeGC4aEXm5xO1NhS/4qIC0hH/BAknEG6/hCAI8NGD/AYAJlQYoR8UpDql4wqCPvAJFFOpPbiUA1ONRLk1AXIiCBRXHm3+SBs5T/OQNUX/BhyXi3rieo9elZlM42cjbRSMZIAIS4X/FYA//mAEDPABBg/jm2cCOBsvSBAv4dpD/rV/lyPfAs4y/l7H8/h66cDMD/XgX8Z/n4J0MP4QVOf8wCMDQFnfxR1L/6YEYdv5EA/wr/HQjDt/FwG8/5GEYdv4fCA/9RCMOwT/noRh2/iAEDRdZkvxT/GgiGzJJdIEsc5f76EYdgIA/z4JccwH+fAcJMF/0sGBIf5+EK0r/MQUAoACT//AAlS//wGJVgP8FAAf8Vt/v+DhfMn8wkIxLAYB/pYKyLBf8+Dyj/8Wb4n+Yhn6cG/10ItrfwCaqzOM/8AEVlv4SG3/8UETHID6f6eC+V/8fA52YJHNJO+v9PAn4H/FZBX/iYfc4/z4DpL/x2Cvkht++f+LgVVUKi/vFW/J8fL+DKC3/MBAph/lwQsF/j8F//h8KgN7rHSRJTI205UvecvaWW+qfoKwt/z8IR2fysIPgLJAY4RrAJVRH/JhGbb/FAjXsTiH8nAn3+QhF7rTPuKk/jkjD/kQD///SCNOwBiU17Gf5YCNpP4xOw/4g8ICGByhnyDfNWQaxP+xGkIELwy6C5p7n1WCM3/aUV7U+KP5bNg/5fAe0B5lfqkpztfn+TliGVjbZlGek/iPZS4JPpW+zScIfj0sdMHebC7LHQy7TBYjzVVDAChD+AM+ZsrjlNjeVqYTfbwPrwdv/HwRvAkj9Vrv74FyTB08Q1g5/BPWjvEmxrNXIIv3ub25YLwQT/AvQ/kw09vxywiZDKzFHL79sZx7Oa8s5n/jHFfwAdvN5cE6PGCJKx/ioTHuBBP/BQmPcpwgEpD9G/wGACS0GBtN5/iwTHuA+T/QgmPdgfx4gIEm58fw8AD/z/DwB/9f76DQHQL/PwmTdt/68Eybvj/SgV47/SYEv/P8/Ab+//LwN/9f6CDfrv4DZR82eK5n46Axg8ei0QAE8r0lwfw0sAF4bk+H5XfHal6Ay3BSOjbWHf1YbuvO+EauuuXbe43kgH/BAVTsgf+MCqdgrBt7wWU1K+PdNGRscbmLuob8wRmxXt7/BBmdVY5rfp2nt7Mv/ZQmhd2ZX+x4TQu1v3/4Amhd6YTN242dB+QuoOgEYXDHTYwe3G6VgOsbKa90GJyDSsWG4HJCDU2lOdqFGyLUfNCYpwk9KDmcAiGUCM5Gt3d/ob/H5Bvna35bWINdRJDPuTNpff5u6QRYUtHJbB5VwqKuoX546AXPx7oySb4kBXzw8eTiBU3GtNc5z6ARQpJVp17UHwLGT/n4TQuoMZ1KWIa9mFLbArA88BAvWLXLH2X+vJ0pqhTl++mrh8YxNa2tWSWXfYC/H5xlLvuyLGmGBHzdZzW4c0UhvQIqx+o/P8fgP6KGJsifRLR11TZSZ+Q48Q2YwLhRYY2/i5gCZGLWO0BBhvlhnMIUPTUS/R+mZ5SlO2kow0fhtzqBI+OVsrrOvKE3+SBMMnjY7DKF/hIVAEJN3BERi4mX/jSQT5A06CC2z/xgmhcVeIQOgup5sZYCpTeXPhAd2opG5RnvlXE+lH95+7vWYx/AMJSf7EE0LuzK/28JoXeV5/0AKp2A44cpX3l8hQZ6kCSDHGM5e2gYvl1hekbrovCVHoccT0ZBszJyPTBHf8CRIHyJBfLIKACuL34H9T3oSllONgAtE/j8NkWXMvdDD9B8nIQ1HjaTQuK+pR78G2w4th6vZuJwbXRuKfaHt4H8ZvBaFt9oeiCcJX79RAO5niJvN03Z1KHSB+BJ/yQI0O/44AQMpv/vabnMbTgEJA6CenlPk7u9iivoSzENP88BK9YEGbu6/wL4tc+fYAlyDY2g4uPmruXDfGADWszVXDPrBcSQEYUw7KKub3+Kg57gG/gTbcV7D/CAhw74/w13WA3+hAPs4agJl5xgjCgyUov2VGZqZ5NfvZ5jYNAjc30Uipfs62pjoZn+PQa8E/h4HPBcAB/zwHofgfB/f/dZ/D+Of/Bb3WUA4i/wUzCiY+hw3+Wg+zjBg0gr/wsH2cUQB/FnfGV9I1z3+FBZUayN13+GExY4OhiRrhkVsf4baT1rsenA/y0IZZvaUff4SHk8xBOlJ6D/zoH38YFM2wTt/HnD4Og1/DCMTOuCu2xxKd/w2LHh1sOFP+MAgoL/FA5AkrXKnJKp/BWvoIc9B/locgSeXqd/glpAA84lwP8YCje3dWFIGyGcl/4cCVe190zP4Kz7qb2k97+H+NwatyMViC5bQCBoPqraiulBrKELOT01YVfv40wQC/p4Eo6xzATuEn1UwJQ0AV9IleogSgA6ne0fRcUvyXWuCTezPSTr+2nKxdxsvI/xn45nrSeo9VYv5C8KyXvg9EKJ3LjinkNI+rew53wfPDrf+/msSQhlE/sbAlVnvMhTES0e0/4AimnbDTW01DoqowDHV1olrxfHNzxuCdSQhohsKvZKOK01QfvBV8DEeVHQ9X6MH8zyrudGoNNuPqa4vxSnphrOsWBW/4fA3yC33tkwy47z1HFzkB0VIyV6R7lcN5wO0wBkdIX4BVBDTCQvug7Gt5zKBNyIcZP9QriNQZUhQEwPGqzlzxM06gOAnEYT8Z38LC0gWKuKswyL8Pg0DCe71SZ69qM7+AAulWDaIoTyOi8vC03zSN2rvoCG4iIYc8YGo9wJ4VIlK4XZoLFiuSBOuZNDTJwnybxy9KHSmSMYhOcGLwsZiL1MGGpkbjn73QH24m3L2baEmGCzcZ0X6upGNmlGkBmtLBlMpGiarBFf4F97OspQT8X/wENj40cld17PRZ0CqON93MUpSL12qkMj7B+lQnsceZNC4/1Wq2QeYSNJwm6Olz87Nq78wNwYDH+SAqoMAiZYRuP9hBVQf8CAM39/8GH+dAqoPJgSu8bD/OwVUGCc5AEfyfwYfwmE+Lagxd6+LVH8ADBv3K4k5c5gH+ffVIbCyVkYf+92SoXQp1Kd/DfuqNfwL3o8cGdfX4rormfMYHJh5zGQNUBVXLQVK7iPqGgfkMaxX/gCF0VP4XuDyn+Q7g+c/h2quCQf+H7g9VWyNifrYVdpWMkpNhg145LSj9zBT3bs+kE6z6NvcN693xD6TBcJEp+sSTO076wpdXSZacUgJTxD7GCJZJbq2hj/JQWbb/Afn1yRAsaclyWbmn1x7OR2CITkJPu38AT+VYBfTZwqzrszLNGJFqIaao+MFgWLFrzHb2eK9am9gm/4CHJ7vhsggJDgQClFdjgsqt/ycGX5BoLT6aJCiVvH1FTyNiQ5ZEYlBoC/5QEnXez2o1YAItMhihpJ4FwirV/B6oFxk99InHeIkMSwU/OvFz4/x0KXXqn/UhS6/7/34KXX9/8vMoH3/ogM3L+0GG5hItYdkwPITT3Z+pn8OHj/3UHRcTQh0EMat5C2dfQTcmVg9GoTotjbsBTTrTsYL+KJ3Mz/CAjL0qf7QBKW6BP///CMvX/rhKW4R9t1BDXH1Kf4dXqAiJ/38CJAa6tTCmmYeIL+DxRhMvHHRrSM8/rOoz9DOun1vy/5pnDBVu8csMa2EA+Z4f0sQju4z0RF/DEMlEQBR85boocWQOx/8pBQ/I8fiWqUH5xcCPlisq7UxTa+aKomlPm7HpYcmLLQu4+Z1HzXlv5mgniOo/IqgRK9kQ9bDE1kVhr5y6CZA9TsCnybI0K8DsZopBPLbUfx+OyytNA7NQiE6Kn6rKxEewtzh2a0jRxVqu0IB9RQtqTqCH23+AiVgQuF2zzgTBAXGu/h8yGe87A45mhmq1rESzmr4ALpBm63Mdg5vvC3QHlp5f/NioMRGPAiI0cpNHmXqnO4V/MnBlECjkSV4FwHOUV4hXF3aSdxERxos+chZr3vVqchc5UGVV0BanWQeu9B6IaSfeTLfSTBvMtpBxQwNdsvOvkXMJb5n3oQLmS+sP/AQRllDzIP1pD36F0GqSXTecL/gzdGaHHhiMZb8N7PDQr2UqWSgy+7j52Qy8veZvFsqviBphRMnRPwHym4Uf44eBX1kjgiU9/gIble/wQKg/9ocUndJV8opSRf+AgQp2cderjeWzBkRguq9kxyZwel0K2DOCcf8WCpiUf/kGFTEtFQ1gR0AWBWjuuVXq0zww3wDrLq/gbgFASD+Ce8ZgwLUYP464Bev+HiPYp/x8GQa2TT0+ErcmAcD5DhExApywdQ0v1W03OVcew409hU7Z7AACmxdas+wbe3EQvxeFGGq611qJZtO6YMz/Xlhaf8RBg/xBwC/wIMcjRYr0U4Oi6yCGbsICIDpFGm4gKVUWgev0PWroOl4wu8yP+Kh9D9kv4Fze/H+Cs3sBFAL+Fvhtk/w8HQR/wn81/w43wHiPv3RQaOqbj7LiflalLPbxeoWxGRfVs14XloERZz5yQwbon+CAuGPtHcTyLI/aX4w7SZXIYfEOYCkqlW2kUSC8DwupfHz5GxiBP8TCsPAD8gfd/goVkus4QVp90HgwUaHCULh3uITuo168JNMSaIxh7q+rKh2SAuQPE/yAFAJVaAXX+qgzfML7TonFrmcUv8bCQ++WZ9OZTXX8O7IH2W/6zAcH8g/8sE/5f7mCdkv8LBP+TmCL/IQ+Pi4zKNmMyZ7AEFhgwcKn//gn/L/FwTskAmGXL5PDG2n+PgnZL/FhHYSBQn/AvCj5fwUJqhSW6/wtwohPJ/N/Cj/D/Fj/GPGiBIB/komGX/jMpPLnBQrHMk/x4HKQuv8ttswCe0L1/PvAognB/V/Cj/j4PWe/jIFj/jgBb/2IKO0Yn+gg9boCdACIQDg/hO3N/hMHv/hMkh4ADU4ADZ+ADe4ADgCTJAZKwC+/zklUt2yXwkElRp/vHEUS6VZAjPZEsE/dKA7C4P9KpAdcoy/xb0AICcCyZMManW7fMb4zi+2gibUm/wUHrcXclcACdNIP8AHB/AYM+DAYBv/8OD1tOgGAwDsAwDfBAA7gCA0lNoAZKZXTU8leOxoNM+8mpcF8b/IgetpIgoAEH+Rg9GLgAT2WDAMBuwUAYAMB3P8hB6LgD1f4IG1sNw/xcHouAQp/kobX3Mf4FARv8LDa+/8B4JvIAdH+Cg9FwbgF9AX0HACABsgG4/wQHocANwA3f+CA9DgB4QHq/wQB6HAEFgQp/Ali8BFAEY/wIANASkBL38CgCK8BM/9j4MiA0k3aG4x/1YGEF/z8DBi/xkQRgP8f4av89Iuf8vgPSgkIvydpOB2ZMDZSSrwW+KbHI0me/+Dfhb3oEqGIKMyI1gs032Yql3Kp5aSnlF09pMGSX/TiTL7XpywzQFy4w47f/w/io/48Fi+lX+qV0r8/n5dK+f5ewi/z+QF0r/HAgQaSX//gpPv/RgUn2ArLPafwJga9pCTxQ/2Baolj8hfMdn79C99cIMbcI/DEkXPlMz2NnxXkp/joWratIQDrhgGP4EABIMCryX/ewtW0Ff56Fq2u09/ha8WekBFpxuc/gbdQHJjsPROSNz+IM0d6nf4BMaS8SkJjJxWgkLm+xHIDCOmJAu//BQLNk9sB//AYAIkQT2GSj/8xTNsO8b9WAYpQNvO5r/gIQ/b1y0Ghkhr9PZEJID/GAJhsVQgE4xQ5AYG7b/8dPE07/DzxMd/w+8TLD/E2uHwYQxrLnwqeNKSNO3helvTc/7SUPa5uXmzEPLq8w/qszLUGZB7JoGVfEMZVBYCSENwcXAEOxgdbgtXRN/5QDdZe0lTMUFEMAloJxVIhI9mo2ZPLs0m63UtM3BbdSkmXWH6F6/h0K4tYQDbBOqP4EABIMC4rv+SArj+P6oX+MgrhB/halNEreM39gcr7+HqXMSxv8XEF6/+PhivaDBKuj/48Cht/6kCP20z/CIIt+K0IBK/IQfwGACFMGBMab/Tb+Z+/z6/mIgB/L3IG/v8iv5hS6nqD9TXci6cXxsgg23NAOymdoxP4Fng15ADvKetxg+L6ngptRjG3WTMmb3mY2g92/BYauH8EXod9z/sEPTOIzxghkG2A8ErejnFz/AQj4KqPmgIAP8eGrBzKDN/AsK4Wpi/PPMjwxla9avWYBQXWfPA0fl7+CbotwMe7HC7NqM4MA9wUFVsTcwlAqShhueAF5MERgV/A8K4NSP/NMK4i/zBCuAAD/nYRbc/v+Fc/04KtS09SKfwV2MDP98dv/P8K5+OjPBPo8MLffOLlcj2G+NzPlmBKQWQANozzMD4S08d/YhHnu1uaH5Rp3QPJtsrsFVkRBYwGRc80hKBtgftPENbVQDBxEX+AhnV516UGR6MDLXl+TNXzSaZCiPGo+T1T/gBcpNvDMH/gSkE++JIVXl9DmD5CieLRQEaQLPxbDgBrwGxwJvrUtAf72B9tzBSgeCL7OCteY9BVDXDZgO6B2/8LluRyN6mhsT+ACHA5qURLFMccDbMA2ZxB6ZJxzy9XfTrqvXcSmyS7gL+ILeNUKdo3OkZ3oabBuYFXK3/AQOO4SpcDOryfbpN+d/sgVLMmOY52bApHuvJZBGj1sxp7S8bdvjKM/wARx+B/WbakDfrgpNRFnMlRdehJLnKK2RXkTH7UWxhV23xFnnI5NjIWZvi+Vbi4Lz5f5cMI3p4Tz4GIQG67KFcVuhFqS2CCwA1Xo0KkI0hgXaOMB2TMMo8BKpIBTvwOwplskOT/k9WeeWJbo8mRVpnCiPxBwzJB+D0E5moCcdKc4gHCO9GJ8bzJi7v4RcH8CwrhlUIBp0uLKEKtOr6a+WQKzYdlHbFsQk9rSaHw6YCj5uqVzwq2Oh41/Ar/n5/wU/5iOwof4qFprVrMpS03Ygm5DuTPy8LTW/ZMj09lW44E/uoEscs3G9HwdZ01/FYA//CVnMMH8u77IWwf6wBRNv4GBRv8+AomzJ/jgFE2/iUBj/54C01oSyf54BRNv4iBAwN7h1j+HHEmNuSD//wKJt/pgWmLf4edLP8fCmhf8ZFJ4l/Aa7Vf/4CFz4/8LAonX+YgUQ2Af8EEPZYEAAQACf8cBRNv8+Bh1QIAf58D3C/89C09j/EolOBf7wBQlkQZqUiW7X3/iwKEt/D6mV/j4FCW/zICoN/8sgr4tZffJQOkZ/ZYK//J0Qh/oIFSW42OJ/5MCoS3+LBa23+MkDL+XQT8CqjVF/n8E//hMOz/xULW2gP8h4J//f8EJ/VEEJ/GJqH/IMEIHeWg9mY7m+CzyG70jfmD0iaeAiVZXBrUHJJ+uymtbq9/LcEJ/H4D2URCgkIeU4FBi164mg6Q1uGJ0EgCaeFfhe/D6eR4F4TZ/C1SeGfwDHmPp/BV5KFw5R/Cp5ict9TuDbS5XIX/GB5jpieJGCCKyvwBFyYSnApwpjq658N/FYA//D52j/KJ5iEfyyeY/wDAo38e+vIP/Dp5j/EoDH/Tx5iDfx6eY/xECBgRoEmvzDhdUfxgD//4Xjf/ks8x/lI8RDP84AwIX8DgH/+DheNP/1ELxt/yHbfAH+fBAF3+PTtH+JREf+eTtH+RBUv++w7R/iIwxP2nF/v8wx/v+y2/qOy2/jEpD/kGy20FBifWjV2Sf+aF/luhhCqc+PYjqLXY3NOeyYdurRgZ7/luy2/I/AewmCyzP1f/8BBuoDV9WgaULM2B4DUHoiUMAR+/XgXpeof1HP/Avb4eY+LatCGoOlOfLvdAkwFNpUzzV2vDz0gUWHjgO/5B1vaoTG7CWWg7UZbPAP/hQbYd/gWBZWVD/2gEWlNQGwZcI1QgCJVd0mw+apd9KfPjn5Nu40uAlk+PCU9GEHQV/sAItK/z0HvQ//+CLSv9OBFpVSA4X8FXCaQQxOi/5yFKlgFhVwUEJ/4ffeRCFNTOGGbFMVKYb5wHlDPZl5o/bunIUKWOvR+OOoJMD2PhcrFtdSHhV5YYuUIGvJpEKUJ584kGybE3ef61lyNIr8IV0JJWiTjYGw5VCSki/rK7GQ9lQ2D8tBCbEUPy3qv2DYdyqlYnX3qBo/EDZdYWe36B2S14AVmJQ4WNW1JmY39JRFq+48G4UJlL2cALIoOM3BDFbigcd6Sht3Fqd4paQfaCE+Y4g5X/AQnBgYmLAitT6C9Csik8dF+NiU4BIkt5DrLT/kAPfhY1Xn2KNITUEKCW/Aarbkj/5te+ifScYImRRosxkvtNnF4M6ES3Vsl5Kzd5Yvgjg/QP41lLKo6a+wmJYQPClU9sOb0ASiETFjyjT7Z16YLW6NEmCeeRYNdB0OUwLAJDCMxzntOAtOqmPfuPlMSQ51rKh48ePEPD0JLihGrBqBWxtG1uRYu51QFbg+ULLdnFdX0FqJu0mhFomVfSPI2eqfapBMo5j2JNRVMh0PAETltjrjLNILgqfzhIgf4A4wL3B/wERW5GN6fwLNcpzvGFBGEMLnN5voQN7H1d8366FaWNCDzDWoj8zFUtQczn8K9u4T5/wL27+v8Fdu4elVn8L9u+J/N/S7/KpX2AFNPnvPJQ1UShZvyWgx+3L/FYA//mIjIDCSDPjmRqEqCHtq7/HgcAr9r38uYIQLZY/8+9u4LAf2/2788RAsf9Ndu4Gyf6EExMhkCqwEg/ift3/hXNH1ND+A+3T4OgA4nM7gdmXNG8ySbyqfDwXrwyJgsd5slEgluM57lMZyXN+Uj2bbSBAQl8wk/wZMcgCHSq6W1L47gZUVMna0j1ZXGrRv+Be3d70I7xoEHamvgA6AWyxwYDAOH+h+3jiQYDAO0DAOEEAO8AgKrz1TQiMejpjICzdErbnHcYwIPMP4i7eJIFACD+E8IX/Ew5AuBANyBAOaCgCA651fzfBAfFB8Ff8KYYH+KgbD0BAeoDAe4DAfAG/wUSh4AYBpgYA+QIA+gAVgQEFgoBAfxH3/6ElGRC/TNoGBIYKAUAGBMJ/EPgCBOgKAYAGBMn8MeAM8CgHB/EXgDd/wKAjAoBQc+DhMF7+BEcblAOn+BfAFvAX4Bfv4D8AbMBuf8FEZgvzAnfwAACABD4EdfwGCbAAEdASB/Ang4BIAEh/wL4AtYBOn8C+DkqBT38C+DlkBfwL4bgW4BdH8C+F3QBeP/6AYyJ/jEeD/yEDZE4ClqsbpvrBw9tVmtIQc8SWR0svHCAg55HHvoh3R4t/E/y32b/y8A9oPOh9SaVTaBNI5yzpQGIl+/H+YzcVN3JgC4wjtfIxNrtDVwPQlYhqpi+7lk7us3UHBxC206eysWQ5SfEXMbAedpn8P9m57X1v3eOuyDQzEFGy9yQ1nK7+MowdLpjteWC9zJsB/AA78i/wLh1KWWCMGX/jGtRFcwPNw0ADrAnWoeJy771Z09mt/NTn0jV+ARHa/2lmnkYkSMXQqeLG5qcIWvgF8WsFzhDLCASR/AGgfBX8BgAgnBgo6ESn+JiIDOGFkXtzXtouISJooG2/ivBnP4TAdv4h9pXWW/yAGc09rAbapXBl38A2ZnlEmwn/ADhSqQNJ+MGrh4HbYcacFeqZi/D1o/+LAmz3+BOoRHoj/aAw4k1gNdQpkYazsUaCuPtd1KMcsCKl7KcoGKTjafBdUmPaUajH+wBR0n/PAw4mAsH9/n+X8m4NlCPK38FMYQIUP+QA+2wGchkb18WJXqfzaLYVhzfO5BSDvZeqKfrTqfl+Sc3w4hd/wEU9C2QJ2nJMb9pUEF77gNCfcFDo0nLZPYyy5vN/gIXYsfQNM5Sp1dR5+BpQinXZ6J9CJbgNkZ8ejZL3dNE8zWMEf/BoLoxWpP4ALxvopG+feEFA0DZgS5GAAAoiaVZLfUdQI+0oGphdepPEIKRllhRGDVjRZUO/NyS7d6FRwVU24VYrdEOsNCkxhICFRLaej9jiS2Pck2ZqMcFdbPVT9hYKkVOjxi4u7+H0JJOs7y14+witGomDybobnBP3jYR86TNm/pElywTsKUh+QLfG1B1yerPL4XB1DuTa5OYvn3JaUsiQbsqq0kaWS6SIuBSk/DygYQ2wEOfGNNtpI+TE3rbBzsKcKvfksTN2a1k3VHOPCnoxFzySyrexzfFvh2eTCpTFTOd6knsxkwGdiI1FzRfODwN4oALX+tTfDRLChdZh38VEBQNdgr3PjDhtwPaVPJz/SGdCousMpieUOi3BGXhn60WETuIM27xF8MmLA/hMcy/wUErg6BCCoFck/8BgAhhBgRGWSn+JiAlL/EgLGor77BJMy9GwLG8pycsJIJ/4CFT/gkR/xgC1PFRrsUVaTNxJgdxpk2owRhHF8bjsYozqt8a4yFIi/DhBBd9CATYhvcGB15xCgqaO8xNMY2VlBGUSOhF5EwgVfUQwws1oz8yAFwgwlHHf+GBd0m3oXkxAMi01C1+QkglxcW85aZc3mJjX2I4M06c2duMAsAgCH+DQAH/BhNa809vayZ8vFKQE4pXFsCFjc5Jo5cLVYoJBlDhwy2ObKfiCI2WOrB3pEJSNcUKe/YJNU35gZVnoxtBf4KCPIyGEAnfLiP4DABJOCtsv/JxCAef5QHlhQH/ARRawqHzg8+Wg4q+B/CgEJ8j2j/goJbSz8GCI28LBjOrif4oCW0oC5HqX/LwS2Z/lAivv/goCn/N9CcDw7gf7WE4Hv4fBH/8oCWvQD9AuqNqqBH3WOyfxos/oI7v5GNnLGK1P+GAlFZKy2xDISU1SKfUu5yEAyzEy4Ev9f/HgwwIyC8pKx/+lglfr/Ewk835g2UsaHC/PJrPo8jywi5yxfgleoDLYT5mp4WE4KaaRt/pYQRv/z8EscX8Jh/f+OhKkqORP8+BMiAD/H08n/n4lmC/xIPUwf6yAlfr/EwKrf/FYy5/D4lH/FCgJ/FAcH/n4FUnBiQMMH/PxwE36HdV7Wn8ViU/84AgY9HIf8PLNCx/H1St/5gPZ8v8yEZm//GA9n7/Ew8s9/5YPZ+/h8Gf/98Hs/fzmEv/yFg9n7/ExP9p//4PZ+/if/T/x8KDLfw+Rf/w+2Sf4+FHrv+ERhtUa/4YL3MZwwXSnmCH/PQhsf2W/+OhtUkQdM+PkSH+HkWIRQP49KvwBg/jwpf/24KFBfw+D//5gE6XgduSP+XSV8P/6IKFBBn8PT4od/DykMFwH8OAqf8QgsfPoAMevWqO8+NDTV8O+o8jZ6N9uXUlLRQxVhbA+5D33IKtNecrDaK2j+u1un0HuM2i686yuSLL/dWMwPT60dXBwxFTc2k/z0JpfhX+1BQoL+AzxY8/w0DLAD5rL/wuC3kMUSVDPTL5/CeAoYuk5u/7MFCgvH7/h4E//18KFBf58EtAn/IA/S1/DxsGEIH8PjYf+OBMh/+QQz/+PjL8DIP4TxUOLwEGo/wcRLraMO9GNOePlmOAVzz6+GbKYqxWQ9/K6E0/wJA/xAU5k/4mDgkP4R4Tv4URG/4pAP/4otqYSc9pk/hvzzHvULc/8MArjL7KXXsw6vbFaYa5IOjSh5xBgQfe/8VgD/+PhSv7+LCUcC92OFq8GS/isdH/gUDQae3LWn+CRsMyLCzbrXFRTv8BBgr955NdNJQmNLcwc3JPeYJxQF38AShEY3kHdUg5imkh8Q82oshr6rkrvgDTKkm7zjzdZ81xdNPLqNuz/Hw+P/HLbOO3JGxlSGL/wN1AgK5CMn36GzXuOe17AlX450Qy8cvP/+vBN9//DQ7C7hdshkhRBFN8s+TXFn3eeA+vVzP8NuG7T29qX/HzeMQf5WDcyP/Hwo8EY4OLXfwUEtnX1Tcov/vARfT/gYB//woKPBAF/7oUeCCtxyZf8JD9PFLdIEbI/Rx/vAUeC/gcB//woIjwQH//4UeC/j9/y/lA0j/5wKPBfw6L9krKZxPQdIJlz92EZM/4/G2/5CiQqX/IQo8F/igkH7NTM5OP8PCjvhkG5ocmxgcG5yamZqbnJmbGxmc/4aEZL2JmMzE0MzhJaEoQnFLVkM6NzliNGQ1Yzg3M2hHbDRuRf8UCczhoanL7zylGV8eB/jYMD1tf81Bgeu8HJdUrCk6+h9hmSnFkFQfIZw+tn8K7Xi0qll5oUiiMf4sBagvkBb4MEq6SEA0CecSDAbAI/xcNInon+hAWoX/LASYJ/n4aRPXcdAA8Cp0Iyt1CxDtWwL6fVjU0e/j32EHf8eA+sw3efw+LRf4KKSXJSAH/b3qkkk205VRGVSlehlZDG/wQKSAp/+RAr4wGWMojH60aP8WAvYTZBBRXCAT1W7sF7Qn+TiDNn/MQcdsRhR/hAb0v7WjGBLTrEupBSufU8QAIZLHRa9+KpJtSbhVL0f7Cgkllur+FI5v5A/4DA70GECW60FP4DABB2DDTeMv81A5VB6ODSB9EaB/jYeTk/woRMb/5gCgh/4Hj3f4ZxdC65bpBHoD+HnDx3/+sDlUfw+C//8+ByqP8UFWw39lDg5/D4M//mYKPHPKIK/Cv+AxFeBsyBuoIyo8xPYYICMvdDH8csb7z0py3/BQPIa1mFq7MXaWjHa0WT23vUEJAwc5AlxC2FW9QMbqSL3mGOQf4qDTzwj+BPCZVSYQCZTcBf4DABAyDCOYg/yAUik/w0ilt10YhheUUkwS7uwd4/QY0gu8HICicAtRva2e9/HmJ/2f46Hu24HF4v98Bp5/TDHLTHgOW7Z0bHdVsdkBSYbug+SYlnKR02ylm0C4nC2TJJV/nwJmu/j08sAYRvKScOgRBjAEimtv2JC9+aHQD+AC7hzB5q0KkTmLOGP1XvKHn+HgV/j/n0FfnGbOch2RDylpCX4biAfLhtltJaphtrJJ/ElVM4SdF/rGO7+frPMsCzepPyOyipk0Cq21xZZJ87LI8vqkq1wuKzhhNGwdsTcCETueMEOI+cCcvrPlBVX4vXvrhdClxg1Ea+XXTkG2kgRQARlp00x+ic1pB3r43Om37POG1cMVRLfl2ihuP2/TKgF1ifK+cxvtFQWDDoacFGVCIX0X0r5PQSRwCuixfjUVQ+fcRJGM0/kCmDfqF4X1ESXePeR4GtJAEpEqNIKHJhlhspzDpwbEA0j3SxNpAW0S37qpzE5ESa3zjPSBUxn5Edh5kMoM54bFqyAzN9jWVa7kaK1//AQ9kLxUMKoEBL8RRNOtuWMEUaoxBvZpdm8ktcwb26u16X1sRXLfKdP/VxNSlLcgCrfYvPNjrQ99QD3Yna71bvlNn0qPHwkO0gbpCCLyQiwMG1ZP3GMvkULnZPqIiSmVvARo/4CJC1Iyxn4I6l56ksLmtRy7XAmujWiNy5X3NTtIVpiDxl/XycYw5hfP1vOD3qmMBAajrAoa/rFLdLcHMJ7TKQ0Z3HuMsY4hH0LV8anFSDzA857vodgWWTD3tp+gToF278aoa2fzGVIlpM8YWPj4jYBODVZzK0GZUzaMa9EWQsuUAaD9GrAJWRML4nE0VTU89gcYWkoVat2KHo9xKadLitYGPxrs3zkPC+DmKU1hbMAYSMWZWpFjOlFEK4O5Anml0bJiUTFwAJw5dzYz1bpCBcmS1InbPgJT/ZJaNQor0CLDxvZ2fL/gDCRJSZ1yaChOUqxxojExf3vsOcspUfTQrrjXyTL3sDGDGGU/2HWpzz3jRsiBlMFbUWBLaKQxSlQ6H/fWD7zkCNgv14ipXHhHdABcj0yTBsF2l/8BBFxOppj0xTVlGoQGEymawHgJRvCjOS1B/gIQDsIebYEmZy02Yjes3dOnjDLfCyd7Bwd5a6YqEIJ8H8yW3hf8BDxcgKsMpEPA1ZHf4CAhnW6wfsJdde8tLQks4hvP4DyRL4S3rrtLv4A6t9cwpB58qCrtrloClrwkKtznD1E6tlk99/jnRkGIo8ms+lQEEoaFKOVYJQaSkhwejB7kTh1NFh7ws5AvZPWFMrHlu1g4OoUxdM2kfz52gvsTrco0cteEq2qh9Ux3ttjYI2BAHR9wKNztQAKaGImaP/CzKpaKW48h9/d2zui7s8wUKj7OAGR+T1HNQXGAGboxwS/n0lTWON262WptyHdB8EJpXlAF3MBxUhqwGurLbqxh468WVZrG1grK8UVT4lCIAicGfxqdEX14pYHZn8kRSFJdq74Sx+2X9P4ZUeD3WIgkqZufD8Scy1TbFKUFKR1GAZouC7A+g35kvJm/7d3ZSeUiobU8lLhViqfFqqBaIR+KFLh5/clv73vJ4/E8OHcqmEbP6QntW1zf8BB39CjciqY8Hek1ABasMOwCu1LQ9nJYqyXTATNUzw2fwB39rHUsRev9uC61bLUNzegYs31QuypQ9DMr4n7gFf1Wmjvr7slktlsZJpDRXuNV8V5gRJt4JHXUMTdgZ6lChfu0ngsrQ2EBUjhQvESU0oTpQn0S6aWb+kjJJ0dBOlYrIyNIvt5FIk3P5Vh9ShYMP5Ol+29qaT3pmjs9DXT8iAF9SkPcStLAFAG+7hGp7bMO06SC89MODmiAABLBIFuAT/7gORFXKyuQkOQA6d31mp1tsoHMGuuZdbcHAjrx1jgtcApiD5YG9XyXTVEQVjQ0ODLtsKlUyxo9ZOUuoqen5BZ+zmh9tlNFOgs+MyKhGKdOfnQawxkTpEDNrATeK+h1DSZwBp82io7KWUWm50v0MmsthQE2ko7xNYqqND9e1Rjj8+LR7B1qu31ze45KbzIcGJKb+TSBDXKMC5w8Mlx2dvAATc+5bi/wBq7V7IAeTGgLEGi0CyV+jAx6LTH8Zy29X3/J8Ga1DItjkqMnVEJjeMQMMwQGmHnZqKdCFgdFNg8zicKj/jgjqN8liEAkaOYP4EABIMBhwz/Fw0ohYgak24iquf6FcUl4pagnJ8gGtcjBKCPwi1Od1g+SBQxej58EkR/roNUZ/h2xaarOu/x8OXdja3G+5P6Ad8aWJA6vEku1Zuua56ndMTaAbH47ofEQOpwDniGMmq9+ZjsBshabgCRFb/wDimcUiCgGlvhX0PT37szvL/tgGdagUzyJ9ZN3dm1bzU2InB0T5+32X/PQu1O3KP8C7hreEEsxOiHIYB87h8PHPKuHbIvE7eTFdmFrUrjavg+HAXlvWBcwYmzs1b6/NbqVJ2cbxQEP/d/g4bImGZ//aQRFun/7GAZ1oB//8DOtfzlt2/4qCUPopSeoH+CUt+VwLuQ/38GaHesaJoJ39JO/IShQnip4j/1WJevvY4BK1ych5bKg7+cTOh02lHnYmUiKQPpTJ9s+eUlQdteJPTjK4ETA2UMfvhrrmbcxKxyG4jCe0xV4Wo618LkQgv/SmPnEm+mfbsnFCwlbVH1A8zLwEe6n1+JipluiDfYgL2/C3jbwxgAjItYLBFBctD9dh/AawPdGnwto6uQVlGi8DJtLRId51+BbPBSEHBjX4MaSRHbXzyHyhdRhTlb5EgsVtAhauiHS80fuGDQv8P7ppasimmhwGIl0H1UpCE+UGELnaSoggUs/EgQ7C9LB06tIiXCeotIffNc/8Ak1xMMH0hkOS9ESzrFqdBTVPfjvTswhkk4JMDOUQIPsRO8JiRIlwjWHK02bmqT8JIXSofwD4S0RZFkkM1D7lMz+CBPCNFEZgdXbwe1BzCIJ2W68ZvI1QLwE3EJULg90zuxymeV3k07fQaFgcABkEx9EA92jD9nRkgAUOpVGAYaMF8ndTIjI6NvG9up5vK4cDzMq5I1ag71UXPPYH+MBkXH0hAPnJIv/mAyLjArp/2EDC1X+PhUMP/JAyLj/DAIH6wgHJw4T/3wPwuMf4fwqvyCVd9z76IKO3H7DI+7fjfngyXfggVWPicRC99+/Xt/UO+V2AL/xURlY10Frl3/Ch5XoB/1wMimDgEABAMDgYCAQJ/uYcBoiu1C/xwHzwxYT0/w4q8RYYWEn8eAD4fP9HzDgD+L1Rf/LgA+MX/8PvyoJ6j4KxQSo/hfm9kN0EPZ7E3hH7CQ2ZT1aBPi+DgNnP0jcTePQs2mGyHkg/oFdoq4h8dMdlJbemkX9lfD3XXc7o4SZY/0c+4z3OVhGrzBw8hQOjxpCiSebXvWa+AhVFMhw2jay1OouROR1vdRkhau9DWjf53loPIG7+SZW7lhHl+uX/D9CqjPmB/AF4z6JGR0EQ2rCTY+K803qTYpWPZ0aXUFdNmlyDB67YWDEgD8Y1SnTAGow7UFTcWBzCGUJyAPHkapss4lluD7djyNq8PXmNmv9dhsSkSCNe/KS+0y53rwqgd9dVOXKXzjg3+IfCkQ5TV83ikIWSIXmiMoWThkVWVe4cAI3mUTRf8Es5TuP1TrsAHVjndvnzDjdoaj8SBPdJ16ddDPuKsnSBLMJqA6IIpAEEtzrGL7Uu3tzsMXsHtVev8WX+CBLw3ZfYMFHPb8xUCFdCc/5mEVG/5iBCuv4TAbv8iAhXV1/kgdfo7X8zRzoFnm/1HQsU+mP52jmk0ffRGFtltuqdXIn1DeZKvr+BT5U/gNq2QoQJ4nRa/gMAEY4JSCJRENMsffbkgH1UUEU5mtxREzWjFcOHByvrwwjoMv8FDR8nbB88pbReYwFMz3btN9K9EoZZdamzJIDcN1ifgEU0A5Bohf44Gzj/v/9INnHhnwngOGI7gSctIm1jAp4La2Ys/+eBTpQCiCK//gFYhoavRL7ZE3Cjw9dLJNY/GpUz+GDFikaB3Kur0PXbT7X8akfPbPI2XEgne+CyhqGK0S6Q+opN/jzggT3hAKICykgwFX5v9GENpP8cs6KC/RSZsCr0fTXQdSwEhWZMyQOOHJMex1Hqa5e9xAbIP9ieiulAxeEusrmvv/Eu+wW/QsIWgit3ItkzI/NWDohvEiN/5GHbE/8TCxo4F8giv/gIBBWgJyfXb+AwAR/gwJ0hf8VFwEIdIdtHKIABcghJRoIf1X8fj9cG1jFs9UbpJVXxNukalbxI7AMt9GPvEe3gidsNJJm/X/+PhZAEdYeElrfOSsomanP4B/3iqWMKO9dDXRxrhUbkJZgO9NySd/mv2l/h82G/ywLJgh0PNj/wuAB/lYH/c/1IOfTh/+CA17ZkZXBvc2l0b3L/MBG+7/n4QDzf4sdhP+MCBpv8DBk3/uhA00JUoOb5P8hDz2XbMUIE6wmNF3aIR5VFyTwqKuV4mUHpeQmNtqgCLx715hi8fwAHg3WPAerj8Y9PJNZ1RpA8RBAxSBt8WLf/AZEMwf4yAS0EMNKEwr5H+MSIb+EwG3+IQ0h2NP4gIhlvxNipfXgDzxliFk1OxCj3oYoh0v8FBuPoX/hINSn/wQSVB/6+DcH3tnNRa+rIoNpDj772lVZ5ORGecAxHf/a7+AObJ78jyPFTe/b8O/xYC3ffwIRQLKp/tIEvLAvu8FbO0Atz7feOFpZ8FHSRQ7ezDCEOp+x/UVThjKHqK/9hAl5YCAgH/84dLe/nJTd/yINMK5WtKl/8Pi0QMjwawLkEVNCbgVREh9ar5EHEYWad7p7B81zuWu308uGefa6GY8LoUHvOYVrKqRioxw1daFhBoe1Vzf+Aig0/rNrvGJa9XQCtCFK6I84B+Twa0+RAIsrQRtNUEKqAOT5Y/hUfHPUubIu5jcVaQOkmHIdFXdpj9vOAr8AHr2o30jg36Xpnd04lUz2gJliG5JisvtcjRyK8PwIPS3K9mr6CT5OqSE2PHuOCuCvhfIbQnM8BNarnHOaT4I7XlgQgLNPYyjsKf5LJ3+H1H0naAAystwhswUMP8BELV5XzXAEMMdvSBh0VjPHyzOIM6owHbnYeMMtCuNKq4b43TkTVj2IxUuHWCCPUNRul/aDLaU0oRcaZmjqnc1C2R30fc6rYROVGU3G3ge/ZWMV9ut5pQcrc6TMIAiMYqvGlxt6abYoGsYoq4NzTQS1pLMYrcOpEWJVPy2bNzxupLJ7oqPfaj1lKkiv53U+S9pg93rQUoU5t6/1HnxbwMIuwAX8rQEHeGyKDWkFjpNeO28YGNyItS6/f8CmmFAi5rIAi3noX9IG3/FhkIouzJg9ga2O4xZkEKVl+tix4FyLgBakIScL7cHNIkHrYYfEF9lYv8D1qLMfxrWoz9vgjgYNxQD2plxEhW1CD1IDOnp/Cu8SiyosKACkJX+EAorOugXBDs9Kt6McnOJvBMC2RbkRYfQRqcwJBuHkPB1xy6Dg/hsCtZ/5rArf4Dlg7ps1QikvQAlWiZ4La4OPXHdloFkEKDQ/aV8TAfvDW7PY3/HAQcpf/3sdGQunHdi6oMGNae56CbN7MogAUFm9f54G+2A7ZxcAMXp3GQsqMFbhdQmbJWqHX9lAY9347YIGo/9DCz9gJ/jwKisdT/Hwrg3g14jSJCpRVrN5S/NjcvRfs7qfJ6HOa89JtjGfwAfjvy4El5m2vDOdqVDZ9mWTIS/mFXh6WgF+2NQfAdVn3+AitjyQhscy/ygFRW9t6gH5vo1aXz3vgEsIZTYd7BuLBQ7KHUtUDNPJOl6qwSfZ/jofO1V/+pD52v+Pggom/18EQcGf+vB87X+HwN//JgVALgsYUc5OdHDjdGQWAvGjduM2mP3RU3fI1dk4JrMq8pn+AhMpqHOeJdi6pcsDL6JFMLcJw2xYyqKb47YIFG4QCOKu2gwN28f46DOQ7zh4M52P8fBV84WXME64scny5PL9a9htJx5MzfTnUXdzOcKNBRyHokISbli/M0/IdrX+jpueM+4RO3aVMukJu0MuvMr5l7WCmjBH+Shquv+A61DF2nTDWwUDZhsIVnApuS69KsVxtjt5wGqZtyvZmfgtmvM/hvJ6tv5qHKEcSHhALM6ST7v3D07r2y/yjdCjZwL4toLnCIGJ/yMCx/jr9KEbl4/8YrIn8JgO/8RLImv/EIVHkhygtkbjZEPCMIvG1ygTwvhZ3ltXdMu71KE/RqhvasVVzBhrWb/zpEOD5iVrPxaQOuvWVRhf4MClnQqFhANBCZ7+AwASngwPb7v46F4Zn/h4Xm4f4fCrFP/AQMnxOq7yzuP2X40oqvyBErQZWDxweihXYy/wKeKGvd2+qpg+CqLgaf5tv6nugaqEKEK8JA39gNPMThuOER8NH8QSD/8C2/OaU9BCIN9eQ4SEhvYBau6PU4C9VVysSYV9mj7wEf8ALCy1m17kI5nwudLNlRrOhXsVmadRz084/4CEzM22CCH2EAoUgK4MBFSmUkgHM7nfq3wTzWWaxWldEtYFtfHm/z0L1CAD2ja06MdiFmc+RBpZtY/qkZl/ixdm/lIZlNf5eGZf4fA3/8lD9DfboB6OG2YCq2DKGN8IrUr1m6I3HFiHF1gqMt54SJV/gIPjJEkn/hve3t/5qMZf4TDG/kKroIIfv4FDHAd3k/8XC+czCf/UF8+QB69/D1BkzzRv4XAD/9XCH9Ierezu3g7FZClyC5V7x8rXInEBS+8ECN9yksRLd5T8NimaeW6bFJfhKPwoKE0O3zr/wAUC3ORkfacmWfaFGPV6WdnZDf+lBfM5/hMVH/kdjXdNHxtZLJGP4JzBx3wNHnQ0AD+GQW9uhA895zv/hMH0Pij9/2YL58n6L/9YXz5/hQck/glC7DxHBhm+/k4Mf/jjb8/gcB//wsMVeCl6yjmMavRHW1Q/CzYUA3PzYPxyf7WF8+evHjSs8eYLp1OS/YpToGyzZ4HeLz4cbM0qXf/8rC+fP8DBfH8Phj/+zBXI3+EAH///wvnzPxBWO/wmGv/9WF8+f4rGH/4HLHf4ZsMm1pUNZquP+2Avnz/E4n//z4Xz5oP8gC+fJh/iYYrVMjgzNDb/DgIawSJkYmpiaGRmbmpycHJqcf4YGK0mQ3MmE4ZWM1VGd4SHEcXI4OjkzODFhZmQ3V1loSkQ3VV//FAvnianBi+/wTUJ2p2Zek9qzqPPupYUhK9C6128V0Guj1cEfzcBQC8qPinWf/DqfSswgEr9X8Bp9PU4wYExbn9NJ9O6Afz8n0nHPLqfTugH+iA03Dt2let2zVMIo5UjVZf7J7vhltACNVAMi7oTrwQYsQP/MvSf4TnXv8FGAKZ9CA7joR/8BgAkxAYCTaX+KB1/KHBuFTUIpMYP8RDr+f8JgK4vbFA8pkE9LQaY1cyE+ABzF5bdEGmRKUC0eXS/aMzEEuADuJa6RiAMEQIUBgrqPGvh3b3uGl8gVdBG5L/goKEEH3yglP0p+jdLB7dDrIiox1XnHNn18+A54C5AoT/ARdjfoMf8fXjfPhgXt0FKCY72zAYD6tCdKL1aZamHRhpCL/SwIo+EszcV7KG4pZ2dCWN5pbrxMvpHP+GAc6eFiVWDpzyoEcyc1Q0atyM8DbDYab+DNEjNVuBw4lkT1jGHYhhgnyGR2hz+Az6V/x8FoHf4+FRwaTap5UiLYlpicsjOb9l7GhR+vD8wwFQMazaHEips8HdyiiBmkZCvo3PpBUH/lYKDlrZjqzxcbi43dhWl54yz+HODWdX/DAJKF/iYSRb/wwVS+gqpPA1PKug5KkUwFOja2bxbJdf/BkvBDU63deT/U/aO1nsdIWMpLs82fwYBfnjk3/QS9Ceq1RMhPHxrP0lOootT/AY5Xz7pbTWcVoBxv1Pvh9BI9PTbO/4YBQ3vpEQjbpBaN3TqvPk3MGk1kH4CPs/8YD2Kv+CB3Gyl/gQLE/+cHmwnbvT2kPeMUHHyUSx5nvGFtNpxN7dw4B1mcq8dncLmsxRTP/w2ud3H8vPMgCJFOroqVWPXwAAYG2HrhZVUDNsFOTlDRIrHVsR7WaQ3xYuf8PSd/+bgcDH+EwG3/PwOBj8wTkn8Wh5nBSNbQcM5yy4RJOkMOzIQpkLNz6UtmRCWoD27ozXX+OARAFKhAJlNWD+AwARngwjmWv87AiAP//AQKh/2ACIAuFxf9fb6s7NOA3hBAgY/NumKr5G7vg+PJGcrjfBJOf+qn8PAr/J/PoK/PZxsvhZh4z2j50l+O7h/IzB3Btil4qXoMEjto7x9DOGlFP+/gRAEIyKligj4U8r3ME0AIVRrEa4AuS71gK6AW+bidI2ETdsoNnqT4s5/xjE3bHc8FH1FID3llmgZEKlO1GH0HY40ykIPkWrPTd8fjDCsztB4TfSOVXIluRvH4XTUxtI/qVq+gTHkrSe/Azs/CudQORvJ1clUNM8uzj2RASWYXwXPKU64dpBrIcphypkaBW6Tuht3sCAtuvUNBVHfR3wjuYxPWq8UA/skFwDSnQfLtJmdzU0RsM2tpV5qo+dQolqbvPdXSMFOtlwGIIqxJjMwDgrlgq0+WGY4mGwk6x2Rd+QRt5gKIPraKAvXpWYQNzk5zbFy4i0rFJRuSJuj3Po684oSHpfEPaofP/QJBSe3GSGmDlkKVDYeU3pDWeoUyn9fV2juHDUIG3729cKoDrOQTCke6xlY+Vhu3dEgCPsZr8Q1IRjdZYbaZOAUqCkbT7rKINsyicjMeTMVPezn2UKy4cf203etEMsI2k94DC+ye9C7C8DtMlqBxIN+mp70dYpEUKnrr/OAiSX2oKtzA1hG3Wv82Z/xWy22UCelZXlTbd9z3T9dvckrbc8ElIItpZwAsLrDnFoov1atIYN3kwCUvsMkXRSY9UMB2/AiLAnvZkZRpsU6++NoIK0E3Kax0iTLTFkeb/AQp6JAEsh4KIDPGFUgF5HUYFwccIw/olTrFE4sT30YfdsAWWNjEFUd0Y/wC+E5X+AgobtWLofyGvCYLTH5E/DRfAeAPz+ddzUJg902vDGajKWm56YcV7k7E3CQBSrNq8v9I+7CnMvoA1qQGU5+Z/SIiHd3ZRelgBy387hL8e1yuPTW5OZekn4C+LyqzxmqNZhmI2uhCYAD3qqWg6NY516Iu2qX9VpGkh9wHQmYtqeetHcv9lnkwkrU4D6XYXPF2HF+RwJOmZtmDmkehzo+Vw+yeXV80RGBM/k7lS8oOZDSBCYD9/dlNpos4MqSZpY09rbPagCmzCSTRBQjG1DAx924Dpy0B8TudwpuAwCkG5IffCZoOEijnxXlAJ75vIodUzD2jJ3vjrlJMMympiAn8VMGMvoIcmsadJ76ehNTg+oQwuOuM/qV19s3tUGgSpaTdJbXRDNb4KOqlW8BEBkviT24WccQVJt9vwt/UHlxsLsfuKx4Lr9yV/Bwl7mz1wS1ncOWsVhRgKx8Ch0yI8Cx/I5Qge6C0V7n7dQcVkrgVzrLkhwYWnIZAukgonufrQu6CkYER3GRUTCiNvw3gThOVyFkrraf2wZ4Vip9HtRZQzlgC/WlNryRmaEi9t4uuY6ZFFr6M378s1+KGTtioMnBozLvw7WIL2Sysv9vYLAWnX5v1P7qDygVWT0I1eSnPmitszNhHMitUrooElfOJRg7HdqhfNteSShxHBXBrXJAKwf4Pd26iVumyZHzziH8/XKEUrF5PiNnqW95DEMZFvXCRdzTP4Aibc2pmvstgnKxEnj9B8I7Iea+lDI0sstGsQP7KPeIwiOtNcZEtZl63JFDXPjgQPnioTJOaj/wEIcDKW+gu2E4ZYj1IhPa10QInZuQQh1W+HPPIZDYP6MVlpNpvK8OiGyWYE27hl7JPKmoY/ZhCv0pGPD6qI0ug/CL4WHr5XuC5J0or0jt8hUHVqSZjuNe68PeTav6kAg8txWDmn8j9LcKXXQ0E+sXr8s8cW68M0doiBFWFD6/K8lHyvePx47JLk7HLgjipOJrB/hR8cXjMChFfA/KfoM6EdoPrcNvp0we/HZbpLEgJyHb3x8MKfrfDSlemV4AgS2jahGT+97TJkOP+Ahp0SAKfXbpYnxFCcZZP8BF+h8nOO5UenoQwPUc0igQ/wA6O7jRqiO/rFHIYgsZKynOC2cO3ZpUSeCCDFWazFYalfFAybKLEKrhgDYsfmdoo4A9TzJ2+8519smqOzfqIjJ+ZIh1iR0i1a6/wLrtdBixJphLKcQD/lDHG7nnfWSiEaMCpsSiBNOo6/9huYGUJWHzjtI4S612AbwnfrsbEbq/e7KX8DwAsZCAVHI+H8BgAn+AjA7eAWU5/yMTaN/xpYLhh6Dfw+AP8A1UO3njKgEP7f5EOf8R2jz1HK/gc6GcVmsmx++5El/kAWwfanTHCPrSWW09g/r26xk/ITUk79hnFGGA8mQcv20TB1p/gXcI6x7vAT1/TiscrKbT8vxHLQr5X9IYmZ54d3htO14N68CPv8WFLZz/ggkDfCyE7hAJadM/+AwATYgwF2bf8XFtnLJ/h4sbOcGG+pFDU8RLLjPr4F8hhG8QSL7a/ysH3dfzla6cHzHgOgAACIknSfCLcFJn3bGxa+LyVGTKjP2l+2BQxczmDUrrcD9snwYIgW/56Dm0P4D+Hu+gg9bjwFa58Ei5AVA6F3zfeETcGk4n4Xs0gCq5NTJyRr/HA4adYwgFM0OD/AGACZn8v4HAHa70bclEXS9Qo5Lpx/gIHlsVO11NwTN6MLOgIn1ngT59+6A/9gEBi4CAgH9/4HH/FBg5kqwf4KkDMg8F0s/h+9YuE/f3uO6eNL7n69pjvjIuBusc8T5ruwPaf9YQqqQu0u9haPvd0p4UNG/gCnijoAtL0bFtKrjzM9Njg7y+BLGJtSIpCqQcQl5Zt7MFSVRsP1lCNuFY0gkdyirpjOhoxZZ582PGs32GjJuyXHFBQky24FlNwGPNXy6s4qg7GYVhq94ycC37hy0YxKJKQkYH2VQC4D1kduhvNLILBC4GBBLr49i5pwjKMruq8b2AoceP1sOBG+GotYAouHFFfPioQ952ravHD+H8Dgd4t0ZPg+tIHMNuUKYpGxKPQZCm4D6NzIihjdGNcHO+w3R/Ns+bPA4xoqFbK7GICE4Jm6iIi27zOgzwCmaxoL+JHOIvokI6a8sP43M8ElgJT4pWYrNwhSRjA5BWT2oVdXpm2457OqWa01MR8O/G6DXNHs10Br7jG1iqmLqlsj+AKxabNnjESeZGL07tlCfiN8pTDqpyLdI7vqHlcNntk3ezekSEUWXwHVyultAMR1Qx7koIWqx43ybVzqRVeb7z9qJmcIO/4F9afc/IW2lTHSmS9Vv+bb+bHGlFtP8Gis10ag++ADiC5hgRv/GAx8iZP/jA5Rqqz3HQBeuEnaW2pM9lgcF6ZdZQQJ4hX+CR+tUQU0tR9D+AwST/ZRDKT2+3+3iGUn03f+gByjWbm8b2NinjwuWVFaGmye1jlIyWwbp+cA0QOdfZuTi/rbeH6WaNDNRLrPLVemRZRFypeCOPPufhsuYemmc+W9U3/+fiHS4F1S7yOB8glZpIEEv9GIh4ZTezo3vqMRFXVa6VLMOF4REqc9bL0Cu08XRvhfMlOWJiObBvfllIPesSCCMBzGp1O/yUQyk/wH+yffdDYnykOB513a22urq6y4SGYN51MkUKScB6KIy3IF6IS5uIOhIfSLuuqVZMR3d3YBjVsvi/4CA7Upu/8YHalPHe5ZYq3zTn4+AoC7QXFnrg3BDOeFhcHNVuIbAMjMUP/wMHFf7EIhee32/28RC87Id/8AiF5S4cpanjGCBJI0aD9Dy6+y0Soc8kCWP3lno4nIDcFuEifxPfhy1UUtsuUODJAxTlFrU43gPOgAsvnvABa9l2P8/BGFOAIM6KbEez0dgHgT3OgtegUxYY9NnIC5pAosFjNGkTOG/5y7i8CcqetJylKnZcgOSECU7x9qgyWVR6qKTpuSc0JJwv8fgP7Ypt/rKmb4EBPoH9j/bvVFYD30VtQLnNBQCwHSi1UPioxofssHvfhs3AJo/nDqGS/6D8izp51tsl08aQaAkJb3f8fiZQ5pR2ojdRIUldok0Q6ChQeh5QEgOHxfTkVd2ppthry3q6zerRhWuHepP3GDdcqx0yG7xRIzi6jRdzJ+uF5aKznH8SCZX8KsRwV/goTBgS4QCUYN0P4DABHuDBwos/hZiOCv8IFunP8dfu4V/DzGcz/kHgbhGaa8QEHrzFesfS3OkJY5mJtSs2InJLvjhgZN8fw8xnG3/PrGdB47e2RiM/S9ITBdp9ZPItfm6WVKg0haLa5W8Awd1FTNWNn8PAf7QK3VkYhKIpvVgcjC12Fj7gXAH/ABeCqI+n5PpIV6LPkZF05H8PAf+5/H4H/UMzAixnSLWADiodVFgQzbZSjUEO7/wBfEcRlDBbAfg2kh7P8Ogf8MQUC4iMsKbGa+PXC4VzpLZnGDBuHHIpcGtvJSFDuoJNEif8PA/7TQDeKFpWSwbdCf0Lon5qYqVZgSQSdfKnKPxOPLip9omp/DwP/u/z6D/xMv7sgfVG6KEqI7bIB1EaOIMo0MkasHwhIsJ80NUL4W9kp/DwX+1APPJKdx5W8YyVwwAkCpUPkhkoSZP8S5C0DRynMpYclv/w8F/738+hf8X6NXYAuBKWiiFURmkwllDVibmJqo5e+5kt+SW7mFPoSIYX8PB/7WXaUw/LrlDt2ndfKAA0BYu7EX3bNguZPJpSmAjWapwJn/Dwf/v/z6H/x7f7xsh4hYX9scuSM73mrck1Ok/9cQsTTXhnybJTMPXX7p/Dwn+2DQ3KvQ2RhuTkVtsEcZ1lM9kpc0aZXqnRGBwTrj4/wBmIbE//Dwn/wfz6Jz8js7X0caL6yXj/a+6oTnyPAf4CJZCbCkNLA1ao0aAiD0K3HTIr+Hhf9tzOAPy6MOcI+gXaf8xdG1Q3fnq3jJsufHhYRNpzhQuGU/h4X/4f59F/5Q4H8ilR27E4LVMOCy4sZ6veMWFj0+bL1DgqBZnaVPQxbX//8DHof3/Bnf3/AN38C8o38Ab7N3EOoZygYhaSzA9QHhh4X8Ab9JASxrQzYiICjjYIktdABpgf4qBIVG6C5wgMhAJhcb7+AwATHgLcvf8oCSaP+XBk6kSwrFY/hUCE+QzRgucIDf8KDJ1AG70x/iwYCzDKX/LwrvB/l4KfP/gMCn/30MnUBeB/FYEf/5eHdURKYq1D+HKko5VMD/PAnfmVAwR/roV3g/+8OBu/8Ig2f8Pg4f8h8XRjyX3OpbLegSxlJ2XLVAkI0vnWDzltAX2ayYqgSV9kLyqh0ELQtFtjcjF6U/rgyTVnV9co+gaYHAyh71zdv3yKo5/pQcN3/18MnUCNVRX/8GB4EDWfB/DoL/FXf8KpLZv/swZOo8qT/Sw4bv/SXF0gfw+LIcB/CILx/JYY//Hn4T/AYD//RfF1/D99j/mYShz/h0Ef30I/n0F/fQjQxXah/wcJ/P/zkC/kH8F8Sn+PiUef+BMfD/ovja/k8iD/58MqUfyCSR/86B79P4iFH/+EDKlBJmJiXHBqYmpn/iQzKv/gQAbMjc5OTT/Dw2s4YmJycGwcG5p/BnFGYm5mbnDKbsKyiNiYaGiUqHTCasbIbGDKaDiYqSa2m5g9fwpxRmxkcf5mB7+G3+BDZPuAwxRdZM06JMuQNLa3Nb+d/i9ord5P2WSMue1IWvkiYKV+OyVTu9gv1Rj97XFvreeinSqa//f8HE8S5HR/4IEp6/4EABP8EAv4E38cWlDaoHwwOW6bW++wXg2noKUUBcgbIZX+Mm/6OhX4EAIP4fXY/9eBj+vB/nwFtf/h7CShv8fCXkDY2BIwfKd1Scs1UuaywRLwah2zLS0kjpW/fwy5R8x1PS0TATXJeHrFkMVpFRE6Dq1S3bkoc/+Gi9r/+HwB/+JzjFyqPuRbWeNEr4puUgJBxHRK/ybtQR9MSwD4rwmtraQ7rD0A+6/2DBJmR6j0+NYtnEueqLi5j3C1LQ4mHzsKKyNDjSl8fzPDGpsPCuuFQih9DSHoxM+CFj0MRMepiqEaoREZJzFgSVD3c7t3Q/IKAdkakiZREJA2ywNLyDDzOtfUfrkHM/KB8Yuj6od13WLS/NhAEcjJOPFupXkP0poDv424kR5ih/ZyPu4GMmM/xkNqnzBC25ayXFrSiZJ98LrzvInJJtfFmDr+AIdhEzxXqjPVls0xsMf8BGPMdpuuiKSe8IUcoWhDuG2CcvcaSENML7pbnLjnTv87XfbK5CSDYu/wBBNWyygnDynLZWKDdlqjKyzZ1JOKH7a5J8jsbOJe2SMdHcIjs1UNMOm2JJYL2E5Z2a6nH7X+AijVO2Yo0xGI/7Y80UTmqw8PB/S5HD8O+nL0ZME2EzKPQReBrjbOs1owwQ/JzUy//wEUXKDAG2WJ/gSVMI0bdCQfNckyGb29cGHzAVaQO/4QFw3Cr//wvel/o4XvSJCz8/gS087hO2dVpYoKKa92Lm+otU3XSkseUKmAgcBFvDmYieum80NHKlqflSPklCx+M8gXqSuZyTnZ/FP/CBFqqK/7QItVWwBHPTS+INkVBZYv31sx3TXRZuA5PlzNQY2djmjio4fOke/2MPYxgD/f9R7/RFR7DoIl/BX68KOhx/D5LMDOWb7RB30IlCFaxE3NL6EFHi9IEABRYVKJ2XxcX/gI0tp8OD2UqZZvIPzR1nsjaMb4ltGYq2FxxA/cheA8ac3UAtDagCqS9ak9I08uR1CTLK/VRjjsz7RRHdI9f/fMW9nt5FbUL18lODb9jPCcm/mrIAbGP3lNeA4dwm+0HbT8YJYWBWtODHNmEuJ99ZlBPfGr6pgSgsYkOfyLi1oaD/H52ycr+cMdBJw6WCHKUL7njPDiHmn5N5rjCKsPgUD/PL6Xo1hv4fbHQSAGTP8EPNU7nXUES6I7CtMzRjbamadt8llCBzDRZF42DQ0MINlOuVBUFzZGXFGNMo5C+sHejGnRN+7MkJN79xgyQKfM1DbTOIaS0gfuvDNUyg4gPbuWwpNX16Q7XvbtuDlUhFoB11QUKyp43OXgUxGHGCLkxCbMtZhWHSA8RZa6C0RIYcbN03Tg/sDoD8cPw77A17jYPw5HMOvkgpm6GbYX/AQL2Q/V1J4oGo6OwhPeu1lRp6A90kt0P7ZYyANnPSFLLtrf8CiR4tGH/GybcBZxzUf48k3yzuflH2lQhWLKLpTleKobPXwoM6qIZXb9JHG2lytoIx4gbxI0+thX8EAw+Y/3sMPpGzgkD3JRE2EI/HMapDtLwE5Zn+eAVuYEkR5ov4FGHx71/wEWSO9R6tiuZ107NqVIhn9szE5qrI/wAATlruTxHkRGGENHXH4n1U7pOm5ADlMUkadBjBgYsacZ/A2a4H6/+jgU2ICE/8dEnc4Uef4+BwcnGMj/swU0KSuR+ZAN0c4X0ofQfLEcycq84i73POzTr8wcVNfzGyMCP4nqcgvGF8wFOibelYydutWgzOBjKqOuwv8hApsX8Bg6aRdoV+YNLpzNBFeMuhxjA5zResYvLVFJ4/VBPOGyKSxcQ/46JM+1r/qBJn3ugP+/A4X5U/14CKtgP+iB+HvuCiRNUC7y39qJ0+DXdllFL14HfnfRCo3S08ULhrx6khADquz+4a0TX9ryjeWnaPFvHxPoTgDf8IEOYUv/+wfDLz9PBhEAEVLIZ/ApXcWWdicSa6TdjqG4kO3384ZGng9uJVEHmcf4CGiJgNy5a98zL6c8N19cwZjgErG8HnPRdDsmey1CD+CAY/3//8Phl/6OHwy0oVsv4FMfi8KCFiPZlAc53UL2M5Yisbv3K9G1TT00x76C+gsnl/uXeX/gAcH/CfrXZPiR0aw3gcxVA9GKsQA/jtggKWhAI6Kv8+Dd3YA4X8PDKqL/j4Fxl4i7V5quFiBJaisBub5dYM0oNF8+ZRgtZV8mCTuTXgXNUB9hgBuxNP4AHy3iusbLyiqKkjKsKL0TFacg/gAecKjoz38Fdlv8oEDqPcZrx001KgKjG2wR9GLn6GBVK75hloXLJvlJl7oo2NGE59fw6Msrf9UjLCBfz8MsrX8vDLIF/Ioyyf+mBL3ddSzJYVqTw1rkVGHiNj637iE93fwAUb185Va0Ho3CZ/C4ZzL/6GBi2cP+HiYPWP4fDOYcijlAmGFy/7wJHIW1ahBma5MM5fr90QDuZDjOT8/cWh8Iho5Tpesdon/sOQvpVOojsssoCMtGQ4ijPD80e4wZP8oDfevch0jYKQYewuLCtmzIjzAXO4B6g1Qz+GTsMsPwctUv5IgC/w6TJrn9UkyYH/PxMmv/y8TJgT/IBMn/jgVr7cP//ETGH+jCJjAVnJ+s/wQN7SdyO+w2CRa0ILQ3bvdYRk6kVAneDbSNrqquptb5T8uf5lvL/i4JrihqfA4QDBiaa/x0TXFIYIl/z4zHf5WKZ+P4yQRGnjkKk/0sV+tfwu9f4ZOa8abeUcBkaQYo252xSSFtHWzxPwbVMFlbXx74rtq/5EEz3/8iFbuQEA1X+WCboT/OBPYofxZzFu8j0zkaHZyUw54EBywBjstsxPkKh3z1gSazZQpPtUV1CVlETn+AhxSztWJXvodnM8z/2ubiqBwEI1JKMXEVvEtrBi3/JAO+D/i4sZ1Gp8E/gYLl/x0WM6/3+Fwv9/hcv8fr/MFuRER8El15HF/m1sgdEEmMH/oG0TcE2hjucwtUkvIEPsmi1DADBE2o+antffQeAR/gItiaoARgKrOuCiTnOu51xmcThzZInJ1W6Xioix1F5fhYtuUinRAbIT0uHrCT1FYcruZ91zjmj7uXEXvEsLiCvzWuzWPbBhxqcrQrB8b93fxQixlcc/iLyx/xYKTEXn+EBSYiVVHhEv4AVFUiZbP1uu4rMG9baL9Gjs4ZN+R8zc1P/BQpMR844CMD1n/CwRgflDn+OAjA81suGj21AwP4vdJJOimIRTw1LCufQEkULxquwaO18sjP/fQRgfXo/8PlflYO2A+BGwZHpBkC4IHDbCkSswwPS3syJ5GviTO3gpIvLGcZq+IgaqPM+AAfx6xF5UYOx+5o0IcTVWTktTbKGPyn+2BAMDWEx7M0/tG27ythnKCqFi/CeyNv48HtQZWGP4FiFEurHCKtb/Bex8Bu97PBetozsuLLzwNVY5GkK/yLufdDXz8xYTgdGqRff4KGBjAi2YP8gE4HX8NKEP8fDq/8wDm/+AozyE/xsSC5qv97MpzKPs5ss94LKTxJBBXR+L5SScY1P54D5fQhWthD+BSxFO9RzgD0qtVRwIos2lADe5zPAbQWH7WxE9BH1AlOo1dj+AGbUp+fxO66+tvqVbYQXMVQcThOlcyv+B325avf5pfbgID1KuyM3cYG33uoG28IEYcdBzF/HkTvgNtf4B705zpcNSE/mJ9u//8GLY/0W+3d3L84UI6Yh7Sju/+H3o5aLu4pCflb8NpjWN8eGvqqiLW0pCas7a+QLR3cet4hNYqOb5ySqDwo7JLY6rccdMVfQldMDPvwLR5Z1GB+CMSAnLl0116gPx+dK0AciEupIkJTzosVa9vATrPZxjTi5uV4DX8Rt2q7hAy+Ugf1q5naLsWWbT+ZvdP/ARbCjpGKKqk/stiE/88sEsEu+oCCmDs8K6zWOmr/G9qemrL+3qRew8NP/gIaTB4zWEWZ0PT3mAJdn1M/wvb6WfcshjzUuLxDQtp8fyf4fb8AYDGfgU/MZgqL8CIW+68LeKavuFXJAvk0v1xGrBz/vtZUqtZNotcl2uQzGfxeVgnAgco2Z5BtrIfYH8wq5Ld7nzNk6nS25qo7KuMqfGjGlp6Q6DuKEoA2eFos4VD0YzODfcpSLmXUK0V05AOFETAppf01gwioLsjxIEvRmDtCeylzFM9VbghivbIRGjx8VawQfGzDztPfZNdzPZ6pBMj84TaA2jpU05Ujtx0GrDGVGbYXU/DB6VDr1HfkypiBSJyOY8/A/xcTkoGqRT/8CjfwNzmH+TCclD+IT3v/MhRIv/ERxV/DdgsA1T0aIfpz4UyrhAGkJCmu4CWHw3eBNkyaZcswo/DaWemmv8cxOL/jwpuNnXT/LhTcboTPaM93Nwe/BXfREtW8mJ+rPz4OFSnwXa/jUv2/gNg8/hRk/bIICmIQCfxblgtkX/xcDmF/+Hh/4r+EwEj/HhTgTpf4KLuRv8VEc0ll/AjnC688Sw2EJ/IUNh/Hu7X/B1zR/D98P/mQOIa/gxj3/hPpDz8MFT8SrjaCwv0ZK1uRXcvhziqfhzg9f/nYT6DHFkt2P+HDNP6jOr+BaDBS3e35cPVj2xs1Ni6hcjLxMeVmqaQ3dViep8KcO0N1hu7/xYTcwSH+RibmD+BLcxa+AFGljzoggH+bH9acD8i8lGE4yv9oJWHk66OiqXLUXw0Fi5wNGJdqyjdZbuaGp3BJtMzi37AKEApmh/wQbq/ToW13dfsbBnYkR7MqC/I/KlN5Uiw4CUJ/YG6Wh/AoDN/CZyT/uiFVj+5iYYfOuv4Ajq7tZuAXsqZuzGfNSfNev9vFlZgnBaf3nM2DFSP+EzR//BxSHpIQgEwucN/AYAJHwW5Y/ys+J/8CmqP8B1FjRj5x2QPm1/gIzvZdR5nnZ1IzBaJRPhgk1f8crb1OsKfFv9yf4UArPkEU/4EArZGEAlPLc/4DAAR8gwaqXf4W8dwQl/l/x3/gMDP/34M0sIH8fQloB/EyBvzopCTSWu/xyKFgLUa1pbef66G9KwII/y8M0sfw+Aof+PhSvf/TAzSx/EATHPO73iATb/rwZpY/xcQve0IAAEWAACeBUH8PgT/cx4PKA/4mBnYv4TCsEH/FBUReAAk/jowd//gb/lYf8UBc1f+NhmnhS/gW69bMGacNkSrw2B2HxuZM7985t9sHrr4a2+STOG/UOdK5rW9XQYpY62HDFR/L5sAbSEmHIQGll/4FF6QyEAv4BGAvn+AyukBgwEgjD/JwuHUrEDyhNGBVnyWqF1C6Dr93vY66fwv4wFjJQiwOhL7IfwNMFN5oKIB+CUm6socbBmuD1THGaOqjRxkPNpYxbIHZKnPF0yX++eyR46dcLpE1fgAEw2fI+rm5AXwgv+CC+FUb0f8lEoYvJ/oQWwac/49FHAMg/yAH38/54CHNYCQP9eEoYuB/JfIh/xYjBWFzrJBZ25WVH+G8EnX9ED/xgK+KlO3eoTHPiX8OciDTrB38MJDUDGqybVHib/g4FD5x4kZHz/fwti1y2Mh34UmH1nFjbDukvr2bbDamaBm4cUgSiG3E6oRHwDDTnuvEx4czFxIxkHTkoVHbNwH7JQFOohJ8IkPgXEghcIms3BxvTRTIHL9pZuyTffweA3A8pVauhMYJuBI6jxTRHK8JkLUyPrIgIGXpP5bz+9EpqAxf/D3ecisokR5Pt0JIqRsT/wBb/Oc33Tq3jcGJhex6EltAx24+Ei8gzMkMTrexkwVj+fK+B4wNg0i1bGkrb5M1EjWyiu39EzaTZM5DfhFbP8QYFO0Fz0hLmqcXdPvo0jRXx+Rqm1cf4CNPF28VkKEK/Xef4GwCOIf0bArg9APDKwUBCtynjBWn/heeHY4dqEV82qVT/IDrzs9474p+KxYNI6dwepE8klamcUHb9Ba5c7sxLyl8BR4zAnbxjQLQQzQW5+2BkHP8BFqAkCdMJ6NvkVCHKXBXl/hi2DE8DmdRPwkFaJzMhQOV/QmqlaHgNmOw6KbgpityHniLs8dlD0kMGWSwI4tBxxg+K/mBEeQOjxnmj/wCU8mztqeP8C6XLj3v+AgjA+RjV8c0n/JDKbLjS2Um+aNUv1mILbRaDDN3F0F+r6yTwq3Lubm1R6IZgboHudXunmd/8EBMHqh/hAcmkYf/GRHhE9Pi8byATAfwLqSv5Cn7YC7+ET6o5bd6zA+86KH4tFtSFFU4+dS63Opt1W3v+JhdwH/Bg1MWBOMIAdkFwH8CAAkGArk1/E4zSUFwXpaxilYHYJzVniBdm0Y+0ykj/kIXdU/gPsNi0YhMS8wVdYjQT3JXO6bc0OExFiEuIodumSj52fG1sHOf8YEY6Ht/wYZqF/xkRjoPhNr2Gxw3V/ogjHQnzBEzTX6pF9hdQD+AMSoou9BWC9JVKf8BwlNwqfwHORb/8Bzk/NBge3k/50EpuHv+HsLei/h+chx0JpTFicRaPA5SnRazUuAASqFW8YFIcaKJ4PUa096uAwuf8zEwHPbijjyiMIjwEZ2vRx2mR05JCIsLYICVqZARP8kBCDXdH3KEgS18f8BcxacCRT49O+/lN9yiYIIT2zqc0h5sNK3l/4qDYJw9YgucII/8LDKvJTFf4qI9cgHgES3pNOj/18Qgm/wMAzf/+IQTf84BfEwDt9zojhKx/HI9fgishruUl/6GGVekmKgY/QT80+/ysGZrf6YIQTQHYRki+crx/ykQgm9tO09wj/ggNOC/gcC75Rvmzd9fwtMqIas779TC5qbDZddxNEDS//LBm5X+mCCCkB2FkfHvLH8//EIJv+mATNX+DwVP+JQWP/QAQsgZIxSGiAF9L1yAwycm/8AcZmae050GG3fnCVPWOzT6q/ONRYj+BfTXgw2cDbgutpoervGsGoL8Dlc3O63+wF+EnSqpf/UCEE0BwLsEMPNCf4Jj/u/3LXoYhP4XBbxe8Kp0y/hMijAY41KDxS/9mEIJvmX//+IQTf4L/Tx70r8b+Tgx/+ObFn+BgH//OkQgmg2DT0UV/hAhA+ZxhwJ/TJ1P94EIJv8DgP///iEBN//8Qgm/xwDf/w8Ifi6FSUiXHxVb/3AMjK/iUUf/lzTw6z/IAZGUYlxkYmJkcmX+JgyNEyMTQ5MP8PDw+5ibJiYGRwcHBiZmJobmhwamxz/hohBPNTRlMDkzNGNwS3QTzRwUVk6ZDk0MmRhNDl5NVpOSUpVSP8UEBNxubf5iIQE+o/xkPD/U38Cvc3/xBIqn+BPz+TWV3za1lLpI4MX2EhDLR2yhLnUiUjCvb4dpjlRVVky35/h3r2Xf6p69gT8n7r2eP5e69gT+Revaqr7JnS84jwhecl+4noxZ5hMpoyoRVgI6FlQTkTk8t0Af/hWCC/wUYhbAMlCATkO2/8BgAJgQYDbNv+Khrb2jtYQPQAC5AiT8XxD+ICqZ7ESGxEgFpdXte1yTQY5yzGbhFHb+KmL9ucPkKVN3XZJPbN+cJQruU4oJZnFncAFBIIW4obXG4N98/4sFWH/xaB1/wKCjH8PHxVr/LzPWGM7C5rTS4IX+aA6HL/JBU9V/An87aAkkPWvUBjrGAqt2bl2AQ/McnozBWoEPsi1JVmXz8TvkfyJivXTugwGaw4QDdUpeglvglGtnHPomGXkMhh/TtLPLDrlp47j/IZzDF6S3smA/gkUPuHkwKxAIQQLlHbkrdBcGz88Fb1rmTZPUrgf3yRbh6e8J/ET6OhG+FqaQ4ELPoQCUqoLjUxxOoa/gjND/xQNy0i+w+gwB/BQDJgTXpHYdL0SfTucaMeEAhEx+8n/EMOmHptcMmpaKehs9pwhyWRQAQLNIHns1KrqaO4BwirbNAXyE/wGusHMIBLexZ/wGAAl5BidjUSnLUSgjj94qBCR7Cdg/O1Owrp6FJwFyEcmvEEI5/5sFotLzKINPS7MLmt/4dTjvX+H1eYbiczQJJ4k61MAn+xTJIOZ6KH8hbnNQWl+19iB7ayPLC5YXiJ34nScFm6UW9Cb4yCcrIrD2tdGD222OO9fMndNyO/j1KmB8E/2EHaKgyB2e3ru1Tp7IBmjVzb+47fJLdBpJ31FCRII9qmhHn3/hv/NYi//j4WfZ/hMJW/xkEXEXtQWQzEof7zAr2//Pwdor/n4K9v/z8PCoAH+lhst3/IAlyn/JwfH/KEDPtKQzLXRr/z4LaeIH+rhsv1400g1pbY/S/8oFHJM/jwU4H/h8Af/0oJlmXiimxc6jhlO2f68Gy/fH/BhhuSfwmDa/wpNIhP8EATIOJauIEXUIggKPXAAD9s3SiAQ3PIrn2m1hWQMpyYX76a0AAP+KCFWIB/hduxJ2iFVf8FEaUqIfwHpeAX8K7OX8JgKf+jCiSn+HxT/+HxV/+Hwd/zixs+ErzRbYCg/hhxKDeAXrhdn/z0QZhfwG25fw6+viDyf66Dptf8iFNf3dkR2RV/rLCbr6+ef8AcQm/QOvs7YqHhIjmAl/o/sHenY8YtrV/CZPz/g4KSkXP9IBSUl3/A9BE8f8PwRP64PiR8bP8Aam7o5u4q/VVm0zTOWkLbWPPFSrfwix4C4gdQGpEOcLDBc506I2B20VGIqUs7o0QeIQ4dc1lscIOvOtl/5AFKQv8cDTJCZCATPKcn8BgAnXBgENan+dhpkj6/zwNJUQVIr/76GmSOngWaWiGV7gl006BmSiy097hSee5ze1jwxwq/D+rI3VAdX/34NMkff+PA41pRP4eBX1gdOg8jiMVdRzqL3re6qa8QFgMIzXAQz3ywtN9DgiCfrj+PwX8PJArdDI97Uy1wIcsBKPQo6RmXMoAcq7Y9zE5qnGySYeRUGUEdyaQae8JqH2owigdYO5LpgGwiqlnQEzZroQMUn+Hz5MXOOLkYT+IeFn+APHRbx9YhVBDSO7lF0Z6nLM18JJubvhLK/9XDTJH8RBa4L/oIaZCP4NBv2nt7pAAAQAF/1YNMkAv+rBpkgF/1YNMkfxmAo4D/EARf/GoFeBSbdeYrfZ1yoDCgSlW2bcWiJrUDZpNiYasuC72yiwXUD/w6m05g6lDKKrehTSMh1hEAQBK50IDCpKBisdZrsSHI8NtNUFn8OqvS9/VKr0C/z8q9Rv8vIr0C/yKq9aed7md2mqr8K1o5ltQLh5PulLhYnu3DzKIDg/cihvSqsnHOHSm/gIfOzXldhEKv0uemLppOf4QHSlOA//YSPLy7mYPMmcrmuKCzXNaEnqYOy4Q1d/ngkknJwJav4F5idlr/wEJpjAWRiaAl4/I6zOUxIRk9etrhupGyRKEAcsMXnlR5LW8SlT+pXOjqYDgYbOppz4EfqQ5/A1FxAd1/BNoHBgUlJ/x1VxYH8PVcXr/j4MwNtfiPqu3QsE9K9fBQIvJTDYnE4KGaWukVoIbhvaQjzgNNY1R0tF5gIvtekU6rnw3367Ubf4oCz9kz6AkKdDBpNs/yUC3Nc+JSMoNMrqaS3gmuSx0geXRBMBI8BSpSORjfn68YsCCNU/iwZIIvf4IVOzaHWS+o9H/yTF9FHsufz8Ozawfhhk3ypOlt3L+BlaEEae3u6xt8K1p4axBfRgoVoW/nLj8YXdFxv8BGoSK2jVbuwfwBV8MT+9bmCqgaLKtf7DMCjMFZ8IWuWeWNXU3x2wQUgQgFVM9P/noLM3A57+HbG8LBP8fBCUaWsMkl2KQQx4KeXnH6b+IRvGT7y73ALmW8oXKsgYDj3SlmC2LzIctwoDUrUPllZ412tYO1/dTUMKusn5HR0DamR/kgl4j/h1u0X/6pbtAb+fm7SS/l5u0Bv9dCjg0h//+J85v9JE+cztgSW4r1r6hA4CW0exxKA0dUKWdBpRPwQO+2f97Dvo64Qjdc0DhP0Zt5Mw7QbRqd5iIOfx4wIR+dSv8C/3Hms1qswvo9V93UG0j8H3Ht4lOuP7UyPw7BkhsXaZAWLsnRVBvObv+Ah7xeE+I3f8KQRGF4GzjeN/IE0flP+ItH4GApmFLLGi/gbR+92v5iZcZSx4/1trTVg8SAXLKzq7Upfbc7NLe6zrBX9+Uf4dhBWD+qYQUH/n6BBZj+XoQUH/kSEF7wKRBYxqsO1eeo3RNu7NC5v3zDTwNNPbfiQYJSLmGz197T/HANdqLf7QKZC20CaJBa+wwAY99n/Xz2iAWvHxgEmzDGztBZwMzEiK77NRt/2EUyF//zgTEb/ixTIXRig+d/4WGzc46Q/h9HLtqz3Ff8BDYjuXCX+AjqQEYJ1xsd7t3IBkOOdyONuYlEpd//wEUxfYR9410KiYmePbf9Qyx0zbJ37+W/NSi4z7y6tW6sf4CHUcUhRnxKqPASbxgH4vDi+XVH0GPgGAVkdWvXhy77QC+IAoAU9sG5pCrxafWuNeo71re73IwqpgWUY3XZUUNbMi5kkZWs6UhBYB5QIrqg2cqP3yBDZaReuKQELtSeeonxf4FRRkeEjkbDad4+U/Lhkor8itbG6dxi8D293Kx57gg3AfDhQ/4fSq0RR1s2FIFi4ITWrscFldqfCukZYeEioiHr8XUqYllgzzto9e9x3eVQLNNEz5X7S5h9O12ydaGk6W9AUPwOcbNS36UTrvtFFD32aCYK0ivZ3pyQujYLNIEBfoKBJDNisLICvd/gAE9fqStR2mvbxP+AAUYmkdG+QeZ64NbtzAZi+wEICt6NutnwEZpWr+7+UhZsW3iU5hCaQIQjjac5x94E9WLzCSBQF4NGFEwtePp5jkh9MJNsLFDMezNPdbC9SAdjMACDuuO/+BOnHvBj7D3CfIMWacF5vsZwwnY0fdnqO5OrQu0JPY7kkZScy9v4Yoz6f+BTV7/2wziijd5/AlGft0lY7LJYIbQ5GDCuh6K4oiXZSXf+DC40SlIQCWvc6/gMAEQ4MBE1uU/xMSW9f4uOeHgAf82hPKMZ0TjU0HLCJzT/Ihb6C09veHC4adk1jq+0Y54k8+v9MHMCV76swGvAU6uHOF1tNcCkgfwudSIP+hCITIDn/4eOpCK/h86kznSA18uIuN0G2egTnGkq1WpFSTSOacNwjWhtoXMzIdmUrdP5wfxZjWkL1hwuYoqauIqogLHwhRTUe3rhTSdL0E/ygLbid4kLe0BKBNpoort9l4tlPMGB23KppnJS5Ql4/TZIwQImS5/DtT+w/1TU/hH8/VP9J/L1T+Ef4iCuMzvJAB8NJX6UVLla/CpybeWjqyB+mLM5vq4Iv0zCTDANZk7P8YBLqTf/4wYSdqv8BFw90xQV7P3P5ysnIKG8/g9W1rdlBJopb5g4r+fhpbUbT295H/sorjM7yL/bwVxmbY9/8AJdSGxLx0c7ffVPdzuJ0ALM3t7BwVqTccsh59xQgKCb1mjJdhKCK3jCyYP05HuEOEOiIkG7EnvfSyOpvCCBmiJPDHn/z8K3BpCyl81NCjZVfHzUhm3jDR3PgtNxyc0RXbIRiWiNwbj7sMQFUlZogFpT2bINiibdaUiwcd66faTS09nwbF6lxef/5+K4zJIN0dtfVtmI8D9BHuNuhsKP4xbmE3E3aKVXSM7y96gtyssKZw57VF/olOncs2KyTItwrUfWiPyEs0bN9bhCgFh/4/A/8aE/DvabIi5iVPdMCq4q4Ga05z/wEDnAoWJBplp2uGxsusCMhzrD70E5rg3xoKKN7RRcSjhuyGAT0BsfZKMbg1zHYz/noYSdY3/xgwk6UgrgtQII7w2N5SU/CzDrzqNaXGTqREoo9FPxNP6MPSJ/AwlJ/sQrjM7yL/bxXGZ7f5AFXCP+ADCTrSIFX4SdobBeHMIY9rhMsCm6YfEvBg0xc9eN5cefZQOzUV/x+jhXMOoME7Csu3Ew69a1G7S+QsLzQ+v0ZPAX1ZHzH4jJqNXOtP8SLQEdpFYLqV7/9uDWLcQT6ZlQpu31A0InXxuxxuZW+hGXXmqhcB3L4cvzSpkXQHt/AAKsOVDY41hpJlKnpH+UCnrvvKsfyYdBjE7RUhqqHmeu/jxJENPyAFirMLoDVWjZtE7se87dt/hUwW/wQRHsU3+EiuMqC7F7/icwV/xMJd0fwshCyc6brHa+Vn8DpKpbF6WC5JkXUweT9H+AhNGyaTa0vgpAy3b/WUREf3GUsZ8P58P8TGZBb3QYECt8IBfwC5Ff4IMWCQS1n99AwxRQLH4LOlYN2INRSHme4YBCpglHa/+CcMXvMcJamhwCBAWKdfOh6l6PgQFumdyzVFGatdQEWeu2nBQuP4TARuuAhAJ/V+CCaqSRInMOAG10T6KXvLWltI2WXFJ03wh/wUZfzkn8FAMpq+My7oDpmRpLYvG61XGomBGVUkKy8fWTRmdtd3t1Ml5L8woC9fIT/gg387p4QCPkCL/gMAEkoMKrpT/FwmB9SfPCXVp/tYcVY/hMIu/isAf/zIJgPAP+fCWhUB/h8J9/l4JMoD/PgzuAgf68JbFf97CX/38Tgwf/Nha3aE/0ILW7EWA54Bgf4mGZRP4nB7/8ICX/1wAcQWc8+GCm7SD3B/tQnA8LjvARY+G1KxBSMWMnMVkmT366CNpjGzPjA6zIajDIRImb0FBAiyB0VF5OkxzyKLA01ff+ChL/7wT9B+/gAcVZjgkuP4ALj+CAab/ExckkBAHWBgHD8sE+Ef4mNGBQBAQUCAFwCAQlxMrt/+IDwAQD4YPgCAQEmAwEqBgIAcADASwDAS8CAFwGAwExAwCK/hO/kAgFNBQAg/yEJeRQAj/4CErixrWAwFnBQBAAwGI/yILWSTH+CBa06I/xcLWQgDhAoBAf5CFrJAHf/gUBGBQBA/wcLWSUH8BmdP+Aj0DW/gWXaAR0BJgDrAABAATMBTQEJBQAAAWEBZ/8FC1lodAJ7/goWstjwGY/wQLWWgNyA4X+CjWSbWAd//BQtZaAAEGf4KFrPwwCFf/xDN1BhiOgZBf58Fqdv8gDN1CDD2KQ4Czs/EYaPaFbkMvN4B6qicGdH+n7EM3fcXFRVgD/8+GdKP48H4/5BBf/9bC1638fgPeUVFxd13Ik3nvnRnBgqumGXLyJPZJboeRHzsqGCS28Cn8CsUZy8PLb1xERR8KhaNwoP4jhr4sGjuE+2NaGPVC6j13rMB5/jQi9ij4QAlUQ9P4DABP2C7Pb/JxUNP/lgw8X/wYdln/4sIcbf8EBrQPAyH+FCL2IDoJv/FglMkB8v+XhwF//Ox2bx/GxwEB/8C8nH+biYbMDAP8rBbHn+ZAtlz+LI++guGmRxiGLMbB0Z1KLp6wzjYG6/4dOIpdAtP48/2AFVzP89DgL4DWRaqEv+GDOET+Ewv3+KwB/+cASOWaP8sESmJyhiNNVUGMcaCOxbFvIrxqCZFo3v8O5tHdL/w9qZW/+VgyCv+M/Q8HcDeAK9zwP8UFd/FP/ArryeuKhATDKB7vvPM5p6WRzTZgSr2zM8hvzPb6zrrB2b+cSfwtVrX38EVax6n+itjI/+BcbdLpqNdmYECn2VQ0Mm+2517Ul/Ap9kGPky7txYU6YyxOyxZ9BkJ2Z8ahv/g4tbJs4QCNrGG/gMAEjf8+A21v/D7gopfw/tP0vVIGicuVyydF+IxTA3yAlA0mmkUIky3g43RX2UZc5kOsBZTiCo99/mBTIOLQBqTrkfw5XQl7o3yxlNGrNsRVs/5IDbRO9BYamKoMOFmIc4SIJrlVDroqcfwLZ4q2czlH3vmqz5EVdFmKpGjuImyazfy1Q6iIrmXoJO60/wYQHKsdPCfwHWxQoCDIVYAQXj+f5OHRXknhJ6UcR5eZR21Wtn4kQYii0w7/hfVjAvtHQJj2fP8gAuTUxf1TjOhP8/Yztp/L2M6E/6IGhNu9Ey6JF0GIkBjHIKANe37Lxgdu/gBYGBAM/N6Q4VeEyLfMBsTX/Fhvt4f4IBrh4CuhATFAFr+AwARhgwF2/xgUKiQg2eqv+PBDSHSZf/Pgp6N/nwcVD5wj+PiJki/z0NXlgP+ciwYpEyaHhaPE1f5bG1f8EDEWo2tGsBibhn7ob6blCHyKv2D7ksKOOMyzHHMD+LtHgigk3YlOZW9KzGV/Tq2RatyMHf+QC1genHSnTqWwEhiUs9rsWEfzAbEy6hODYl/CSylo3kVNWXHUb258SgQOBnj48xMMezDJJT2xuHAuRR85MOhxCAryzNIbqBABxguqBwoYN7dpcTohSfmwxvWShMf+vuIMyk638AWn4nqpjGqH+Ju4mP0xKDiy99kwbhmrX1pdgomB+dt5h2J/eITH8A4oo6g9u3QYH8NRpNRCATCgFv8BgAIbQYCmiv+ThzDSFl0QpZ3wqcBr0cLgvMRjnQweCbv4gI0n/HBWAqm//+DppP9GB00gBUsw9/AtXmkbO4WjpnHC41dA5Dqgd3qjm/j3cv+kOS31mI/BRml1XimGOk8fGA5wLFIUShmdWv2kw5B+If4MPRhU+EAowi6P4DABPGDARO4JTlMNKJYNSHCMzz5iqntCqAvEJ674dbBFhCgDAAuEB5Foscc3+K6fj/Exgqt/HFPx/AwEN/rwRSt/j+qXuE0PTB4MzBDNzQ5ZTNFMzA3MzgxNjI5MEEyNTBDY2VhRA3RWQyNjFDODNGMTAmYz04NDUzJnR0Pf8pD89pM2npkGpkbmJmYmhn/BAAm3/xYAgt7semRqbGJz/iYfpE/gUcNUxUnlXJ6TaFRBgerKhQzpUaM6HVOUm7jXFFhZ2/giDrJPEnto4ez/VX+xcsU8jmEVwFzy4X/hrgIQCfncpgwBSxz+JxGH+L1zcD18Eq/wUeRHfwkBAfIH2AP8CgQIbghbKfwmvJQFyB/6EGCjQNzAwUBQP4TBAv4nxl+AA68gAPD4APjgAPpsL1FXpg01qPPIbEGUxmSut6c0prT0JpGGGuQJU3NkJ4AtlZYLs8AvWJvNLcU9lOILbv77PAtvtwOUKk8fTdntEbAAH/EwbPz0AH+AiREpp7fBwyGDi9uwBEuP+ADgA9VLkOgCdZoGAwD5/w8bCuf6mJ26/4fypQEAQYG/wQKM7/IIAhQBAf4oEbFv8hDBVNaCgAB8AP/ACchg/gMDLAdT/BBPDBrf4uJ4YK0FAID+FGEgCf/8FCjYx/EDCXkMO6/8LxZ4QGBTn+CBRv69/yIKO35wKAcAGBXL/jAYNx+AUBAP8iE75wr/AoCN/gonh6/wQKO3kAF/ggO6cf4E37UABigGZAZn+A2CjOAdT/BQo7MrATKBMoOABABPIE//wJEqAUmBTP+ChR2+ZApz+BWEPFAs7/BQwcZN4F8f4KJ3xQwDFf8FE78BsBoX8CgCKEBpf+YAf6NW/s212dsdeuUWH2VdDaR5A1QREEAgEa5x/zwHkGkbxr/wC5rihW/Fe9JFMghkc42gcaMDEV4Td0HboVlHX0QzYlPPoBJr/CsBuBfwJAb1MIBJeuj/wGACKEGDz43KfwmUs8+CDS8OTVkz+W+NJp70C8/iCPlDP57kJQGA/0sDzIf4gg6NL+G87ux/wEPvwCRII/x8KVfw+SxIX8K8XgD/CvROgv+nhuPcAQP45KIv4FBH/4F2a14Kz32GYluybdiyNQUPVN4w7cCnbkN7j+YMYVFyv0OCBXCfbK8v6VvvRhCBD0WlUpVqW8qgv/CAvFiaf7SGaDpX/YQzQcAgH//iw33P5cWG+nczkXf+/h3pIcwEcll/kEuFK6nAN195HexY9MxRrbJtXtsxBAx4c6SYqw4p4NkAlEoVnnFDr2CLTM1aQNLttqFKt4kzHPOmnHlfvdamgzVIRwk+KraVWJ4b0pDJDkYBdG0UjqOxLE9E6u+0INIp9ypJXXw67gVQgXVq0YYUZI0Tf7uVyDSduoUh9ugsBgWdxcvY8V7vd1x5z9G5zOBML1ESSMKo3tk+2lUCUCx0/bx1KtgUA3bR8EHURlHEYvgSO39Z0VrZWsqHEcj+IAN+oZl/sjD09i+jcthM6E+En4P7ex6hNaLT+fOSQBroWxs773qechkpdsC961DNzagrdaAHBR1z7gT7QeBB6/TtS7NOgHSgd/h6jkQSbKZJZZW/pVhBJomWEQ/gCqwvY3rRAsVePlNGPn2Ydf7j8APnIRS/v92uskAFFNmql5Adqnflq4blEuwOyad5ugIvzAQ91hEbeV1zI71vQH9hm3p3V4lkjqYrz6Ykfvf4sFUEZS7iV3jpXoHKHKj8lYiYG3uRURQvJZxf8C7VbEVUN0PSKWxfNS3I0BIgWIWOL9+4LsgF5d2qFMloRQs5bjgVxtDW4zJ0oP++v8FedZ0yEfCn/goda8Gv9dCjzAFyM3iHgufBGyn0OB/CgEJ8iKj/go0R6/wgKPMB8e2fws8CiIS/6wFHmP4KAp/99E/24ggf9oFHmALd7HELPrgNQF7/DYIR3Az1AGGxC/cx4gn+ehR5gO5P9hAtQQIgbh632lEFWh8lu3g6dNZrPUCJfhwH/QxHfJL6bmXsqRL/8WBags+Ew/v+Gc8tsFCdVL2T//wo8x/Ff5F/n4Fii/0gKGqfwUBwf8gKtO//goFii/hQVd/xQD+aBf8FATYOCQZdX/HA/wBjogo9cf4sBYpAD/IwLEh/jY7O8/xMKLD/5kFGW8/JRQef/D4a//pgFiCBF/WQ1I6Pf4YGfLpL2+hoV2//D0KEcfw/ESiT7/roFiCCSD+PRV8AYP44E//4y8GLwdlNgvWykaMqftf4AaViz76sNdhslz/isAf/0wT5LgiS4UsJZHH/LrLSDsmzCHLV/nwF+m4P9rE+3f+WCfvv+HgUDNaaRFVOu3f4YAvza+3S/ic73V/x4KA8fwhU/NM06tKDPiHRQ15K5pWR7CL/rwcl//l5tNAeAORxssrwZOzcdsiRn9jjyEIZXRFBrcpIGBjzKNPy8Yv8/A0niDGyHt3FDjjHXtQTZwPeflAoTfilBrnjVANjKEkYhHTQKJ3gi3kIbjxDNLrSjqEaQv20Qvdqsmwn2DxeXGrg6z0IN/76FwVrg/z0RHUE4h/HwE//BtyEhgqAaXpT+KD2fo5B3HONAL2xf5YC5liDLj4cq6JT3yeMuYhwYblr86C/Q/isbjlABgalKNaRWnkUpo837dOOP2YYx/j5VKQP8rGgBP8YosMTiH+fB//sC/isF//j8Gf/h8Ff/zAKVeAACQf0WCf/wPi7/8fhb/+pBPLgg/ldPa6v4F6+WmOjm7s15B9lJCOCmC3OS+tQuPRCs5Rzd5cWLPp9k6GIj+E6yT/CA8uCqgXefwGACYsGAseZ/n4+C6/yAPhTd7q0kO86QTP+0ZpGh0ZAWqxJXlr9Mhwio5v5LyuYgkSuBR/Cvii7fwHhj1UCAd+B4v8BgAmdBP+j/ioPWuAyBa92cAAP8RB6138J5MtPOgbyWJuaX7HrC1anrb3IrvoGN4j6RZaApIh5E6q3RJ/Aviw+quNgGCqi+YP48iOyDyPzviPpw0GDbEdN6tcCWt0gQCO/4wLPfKj/jwJQ9/qIHLu/woWe+d8CWRi7Vskkk6DlwX5fYbo4DfPpoTBNt277cxaxWS/fJvr63ZfwAZdg4Wbkkx22FmcLFZnfGeBfIFL/4INzUwJ/V9n+AwAE4YMBejn/FRnGaBu28Wt3sd3IEyAAAAyVHcf40LZcv4BgCr+Pm+T+DA7n/MRo0GAAP8co/oot3doJpv8e30f8D4A+E/P+fC7Pj+DCpn+Ewd3+KwB/+Lyvn+IMOPt/P8gAR95/jY5/1wP4VFNm7/BRUUSvoVf/AYAJcQXZHfzCKYsSrgQflV4xZFuOljCRAJTbp3G0miMLmmcHouchLqSmDIh/jgTj2TYQCZTQp/gMAEMIMI5ob/OwnHt//wPOU/2ACcezjSI2RxjPqFFPLTcV1t6t+pQa9eiIM1hXF0lHsMiew1aX/DwK/y/z6Cv0Rzn5lSulJzew4LL1XugJoFVTjNZoixB9+BYKXAxO4F7hR/v4PT0C/f1Pg8I3K5w1nvjL+QGOFqTVZZAS/Akhz33CWTn4nMLnb4151ikmK5rNXYJdbqgI5XHGPupzulZta8/cNWj+ELt6qNYkVCfakW4xw9XfhDtE3AVgUAPKmqpnmDEVMY6nH7I8JKl7VQyAKuWEqsh+9cfEMk6w2bTA0VvbPyZhZzYId7h7DzeR4wiSTg1zuUk1kNJezKYm3w4Gixy8eN2taEJAiw/PdMfQTVYLRcmfd2m/g3Wu3PdLarVlf2X/g+DnKIili+K4QBkmFk8J/gBxG4cM09hJHYrgZn4FUhQnMAptvRBQWr63cRqxEEDqeQU8uXtA5EKOicdpIxm9w4N3WfV4NTEwKvzd9Pm/yJaDHQwJGn68s6Ym895ERATfPQYYwvX85kRs/QU9xQLLpseVKTUyxCyAiCk/dkaXfOYHBiIIzEjO0Ck6+ciPcUr+28vOw4PvnilSfrTXLYF+oenCGkRTubP0svSUL2AsCkTkL4w2SGOKp3ME5jyFQJ6n51y6IkRl8yYSdmH0KspOVCBEC74EO3GCEYDh7YOV4HgBbWQmwcNLnYjEIVWifwKCUiZvgdIuthbs27EG7CFcSZ/cSdlZkUJ/YqWRZOSBxqREWoxGZxDXx+xnYAOgW6f4AdUSSeQ8IrSzX4Oxig4Gri3L8mycP2BU4XruAELVG+yk3+8F5NwRgdzqA3BDkE9GFk7PtulnQ7e6G5IjZU7PV8V3K0GmNnQiR6Of5gbVRDC3X0cZkZE8i2asoRjU6o4lZH9e8lo6uDPhq6tRR8yOvkvRWe1zVxOQDZDMrAGSfwVQTt3vbYhPzxb1QtqxnIk4oHvprj6keFAxP5sYNrjznLUvQ17h31ExX1U7n8wIhxrmGFypLLXj0IVo9wqB357KrhiDhqFENPgNuNJGbSyJSSAWjSdrZaMPNLDwrj+ARO4TdXiaLVcha7o4GG1IhsfG3Btq9FUHqeZ7v8BAqjJKyNATIK8YoH6ebnLCjiNwRr8mVJGq5KbhPTQFgqcV1+PvyMc/hHa5XN1zaFLrLLbO84ZzA374gdQ0TwomfT963g/t9KvrPaE29ae8UEWGaQv/YL8pBwiA+B/sIRYT4IYcFixzv5i+dEiLfLMrR6xITiZTmgYDkhrOnAAWLbmdp/ofygPhsnYuHTYYXk99mt1JpihfBJA+SGCQqEzCD5aaY41XiVzHfUP4nyN5n/Q99UDmsUbL4OA+0YpWuaXOOEBjHV20SKjEJ/l6XB53HlOYzwYjM8OYIy9Nd/AF9l2O6w5SKc2k9UkSbnbgop8ltjPNgomR7d14dSh+hK8Z9YSPl0jD+v2+ZJZJp2TmgFLlsQDvUbfiyl+zQDIIstV14PDCdqC8E+znMEg3cMgCH1aZyLPCjnLWtRDSECVMhidQTT1LZ1EUsl5A+1kZ6EujD33F5C1CC5vQp6TizcJGamAltDGujVOppMJEeJnbR5McYQz+rdbkGDE1jyDyaV+ovNEEsV3ihuBfKu3QrldJUfHcyqi+EJPvNDSivgDHylr8U1f1ITQMrE1jjTjZ8dABjumAahJrFXDOQNW4ElISDTcO3U7hQYaqCLogIoaqRbRwFYBH8NKwp7seT+XAWedGN8pBT51Rm+wls4Lgw+nOEKGwTmzJX1Z0JckUkrD85IJmu5DMU6BfbxQxQZbnfQVYjrmOSqNVvQ5OuRU3qE6qMF8cKxa/TGWERLrKh4226g1WnAFBj8KYQvaFRWLQABBYIrH/gIgAl3DrQRgeyDH0v3bhBsoNzwuyMMPZSMhxdzstQ6dXZ2suxXglhm9ez1N7JtrAbqqOHPRYAVyONphoikNi/WwCnPoVqzaDOJe8jTIMmkwXsRCJI3z0ZwyVKN9O4ADyrbbSqmHffj7vWj9UPV3EjaYuZaLg8v/dQBNjULkP+BcQ2NcFOSgHqbJcCRQi7H5Kz5rb1pDYP9bjIrcP4u00RRcKi/hWBA/wQWaSDn+Riw0kBQW5a61N1v9fD/Vv8DAM3//h/q3+OWMP6a6lySurzzoWaSJuEdgXZ/p4f6t/hNpHrWdMEt4ku/bd5HO2X37F2wNTBSjLcTkXH4Z2hhk5qf/HANYLtfwLoBymULlIUsPyGms5jU3kRL2BUHFNRsAFzY+WJuS9hwDexOf8MmOABfzcY4TE/FQIQ9mWFChBd8Tu5GadFeAyNRo2p7QmX3WLvNUi6KX+Kgm+7/BhbmBpfzTSbT3+yC7LQB/v+k2/4cFuYGspKNv/BMqAoHjaofz/SbdqBlg8dYxqT2g5i3tzyBFEHBna5Qxz+R60iyheWcqjQt0JTw5uTZw+f4oYQ9aC0pnf6RNMCc9vN6xq59mOnlG3Y401uAIIXkKXmU1f/sybLx3VYFP4aY2E082wYxoSWylIk2l3JPLiBQ4WgdZIBWZ7SZfjALDOKKpDg5gE8rtGv7JCzNmomsvIgVUuPSZIyqFSnpsUdLP+Ai1Fi1GOoBDerOdTTLLC/p9KzT3cIwV+BxOYIoBJjh/TKrzLoKazranFyn/4faOwZ+velkj2j2AoCiD+2PICgOARN/gDlRx5TyorPk/TPZgLUZPTEauMV9YzQB1hapfK+g4UJtBCmn3CfKdNSPZ6ti53xzl5yUwVIDdyeUhJ5KrZ13Na+optFiLoajBTAefeTT4IVkZfSBB44B3z9dL5D4fGtEM67PPTDaBjb9w/wEQkSVM3JwuKvn68jxrdxpkhiHAMv/d5zic5vhGFWLQGsAETDJ7/AEzuYzSPfH4qgB0QYvm919fSrgGhpDgSS8bRBEi4cRVRRzbEUB/AtJtOUlqJVu+Rnvhixgi2ow34XNyU2jcUGvSNcqkEOpj5SI/fwpy/fwIj1qEIBLaQJfwGACDcFasX+og1ZVCjcf62bAAMD+FegMEEYLnCFH/CwI54xAP+kARzw/iy+3/gYCn/90COeEIL/rAesOCiA+Ck02/zoQxFC3DMIzFAH+ngRzz+Ewvv+KwB/+TJPb+HwSMKHimt9MSs/0dD9LH+gARzwF/1sCOeV38C/KVIxP51Y0UMfQPnejL4SvmgLwCqTvA5UHERjXrzOVBmFJJ/jokjaooQDDqUkhADvxtAgwJnlpSCwEwkiTTnYvLPN3Ppy+9fEx80M/8fEkG0G8Al/AF2YRqmB/j4FpEPf4fxe7Ph+2IEtHzbws69iZt9uPajy2Sss2Z85vSDShbdvWu/wwOaVYYnRCGPXIKFTDB4K7dk0s5dE+Bff6CJM1nGNlaWsuv8cEkbXfWeXI6+lp+vfek3rq0LPuajYc6h7Q7uKNLOVGDGZw6RKcf4iAcxxsggpC/y8KqpfwmAkfw9IR3n+EgY/a4FrlMQkGHLrZl3WLCMURtXBLE5NhDq1cfpP+ADwknzuZFaW7/HBcAHFP8HcyRoSH5wt6JJHiaUp3ZA7QtLo/IS6/+mBw1iSPMd+hsFkDK01NYg1HmvcLGDI3P88FvCX2Ss/8C42d2G8WGN/EVZd+ttCMnLERvohK/XBwdZJS1dnMW9VWK0tU/ww8gVP/xAaF//BR6E7zTYtzd6+9M51avTygqlKDnzQbZv/AsFEEQgEuwgZ/AYAIKwYC8FP+LgQLL+Dw0LFBVNsgfevZh9mNRJEAOxxp6KlZH+HJyTlGqz+Be0G/vNWksf7luiOUvz4Icfri0ML3AEb1YHHGYovw0Pj0wG1/xQCTNLlBBD/CAWFWmEFJvf+Ki7jz/BR2dd/CMEz/igSOcFCQUM1Z6nXMdG8HxqsJUImvhoT0fYGzdI/dMtR6q8T8gkPsw9SRpDPqOQL7xIAMT6pIGeLiHESAattCWqlohBiteSkuP4dGBaOEAx9BV/4DABGCDAl3DlF8eGRfADCGrPJfSaHY3kYJnpnEV/4dGBRCEoXuTmva//yADAlfxBvAMudx7yRsClv5DNbV8hoYO+l/Bn0tXHYJYyck/pli9iwP8GM5ERIsKU1WPWJtbP5BHpRqCGLeI5/oYkPld5GV/IEGBv4VGBQVp7e/HIdFj4T1ZU8VFBEiqUmvS7QtSKJYCv4AEao/duLjD90es30IuaUzWRVxz6gACTWDextEroRUdFz4bYIJ4f8FEgkn+VCHpl67yJqejfZB7oH/HC99EMStHVQf8WBSuQqkVD4slxcb8AB/jokGo9/+CEg1AB/Azq7gKVW7bMPLf8OVhhubWX+ehR4H4/54SFWP8I7Fxpz48L/jC88/j0ED+f+dBUlT6Y28/6AC1/ogUmJRQBuF+MISpshFP/ARzF4npCruxmFBmf8DkdhS+f/sB5w2hrhwjfSBQCgouZgXK2fNK+j4uqv48uRS9eMP4F/cNGTfQOTppLjLKOopQ890Mubo9Z9XmQCEvAWH25Pc6KAXz/FwMHDOH+EAY4ZjkjBmYeWm5oCpj0PvJcVEGbzTaIYBFYm8U6P/woDHDbNbXgY8RZ3PDHYEeGwCij/cjp2uBzZkHax95biVkEvFan8JgI3YAwgFMy1B/i4GOGhwHIXT+EEKPPgoBm3CmJrN+Wm2PXLeMkQs9B8jgQqk7tn6Teqmcd9MraVO6BlhqxiBeSQTX7bUuVJbKvX2Uxl2R/wQKLzKX+QYHkpKuPcnTm+gP4FpyODzm2k9bql7xXr+ACrnFCiVuNZgM8GKNoY88A31ur7Bs2Pu+GmlP5YAax3jdzo4njFIJRT8cTVQAL5BDr/BA6tgWQgEwsjd/AYAI7wYG6Iv8KjYx/gw93WuQQF/lxsY/gYBm/n5Ip/gyt4/xMQXG/xWAD/82YrBUhOEDNOH8dJFK+/m9nIfx+kU/5gIMov4TCn8+KwB/+LE23+EgSP+LKaIqLAI9S+FF/ivM8/jFJJ7/4TZpHuC6f/PgLMNQfx2kky3+CgVX3kKzIVAL8/uZPAtXDfhGxg7IaoW+p/TCDjo80bqaCmMlPI/Yni0YY+xuoHc9/B8OOEaMTIgF/wYDr+zH+1AUrOsFxbl9dg3kdNXzGjbAOjV15BYlL/CBfZj0/3sPlLPX/AMOdw8ptI9AzcqfPAHbSQrrCJruh/notbqWIaCwP8cByU5l/+4cFX/04Kw/gEht9f/AuOD0XMRG7nvssygwb+KdodPfsX8AemVj+ifBpf+AgbWu679mwgnzoZf4cC+er/9hguHVFvMNMKSUAtlSjJDHH0A+ulzXX+eiRdckJs/+Bc1HyRodYs27HhwlgMuHMbfEJqVvxken/wECW6g2e7gTLuiYAO3f8KCiHzgwE1EmYQCS9c//gMAESIMHnyH/egJqIgr3/+ATUQElP/GhJBO6V2yAmv7//4E1E/h8Ef/gTZg8AJHCBCwZUgsAupGcm6oi8zNHBCzasmj8rK+EHTiwwHK0f/FAXEd8hj/go6RwVYQCTYGA/gMAEiYMU/wEKvtSn8JgFkBchv+xAtMCDMD/fhLNYQf60IK6f9fFpcVF6UKjDxH+VC4uT/cEWlxBWIv7NcXbv2fxPpItq8rh1oYS9k5wLI0/ZGvAQAW2xsOOCoCq06f6WI4Sf4TC+P4dU+8zM//8SpZf9sChNT+Jwv//3AUOo0JVJXh4f/FNph/n4ZDK/0oOj0UUYrUZsPJRAf68JU0v9RDIZX84AgdDmZf8O8u/+AiOfD//wwhlf/+KHSf//FDpP//hkMr//wyGV/s4ZDK/xQaBJ+AOGy08V3h6pdZtinnQJRv5ngwmuF4TpQQBxyd+VSFDvhQNZzZkPPLdeWgrDSwNe14prhLvB3/BGM6l/zTjO138wSM6AAP//h9OP/mQ+nGwYm7/P+M7OhczeOfS5WKf2XbzCFxkuDWPIS3FCp2b+0EGOYlSFtaegFZpe8AbfQNtrPsGeXP9xdnnG3NgMfUxk4FxN5MmE0H2UKD0FXpdYxGf4AiqsRsbZ8dMvTCQsNoLyRJ9phYAjy8rNiOKciKE4En+MuIrI5p2Xe9ccNL8WwBeRwM5nzfrXLxNxJ8mhM6uScgvxTMlHkkvYQ/Eiin+2EwtAmh9SQGV6JdZBBGzP/dV0CbxK2aw86h16LIqXTZwMOzpyVYcpWPr/yAR+rmzfLdHEgGEemJ7yd2ijm06ZuKFg3723JD9Dv3OHrAcYHbK3kQIWZBunSI3WhSggaDaqYwUz3TK0m5kx+lc2ERJAArDPCQaCSmTCPfyBjnU21+CxRhssiuLwraBj8BH4b3qhKkmLSwydhyHacNTOxckuUf4CHUNfnPOIjflKc/QafcdURQLqRdKeAnyV1hY+kwNrSLWHCARj81UHvY2GAxooT4TgSVwCi+yGLx48J7V76B0jXDRZXxizxMS/PXuXHPxNEk7AP8XG2JGNN/5AKY76T/QhTHewfw/g1/6UPtiv8SGnvAPgwADgYMChACCBIF/KiPh/jgLJBjCFVh/49AH17+HORajCJhvf4cAfyb8cv4cAnyzK/h0A/LfK8/lwAfMYv/HMDaEAnbDBwvA/wAPnHvl0aw3ZIfzRwqUCk6Mv68mC18eFBCH1KqBk3innFkZvPnXj2UZ+G6kITLfw2I4sC7nwSIDQP4BV1GUvzcnmzXvI1d9bJg39aYOMOygW4NTffz9EktplYdEx5JEWm74jyGStN0FF/ABrr8RlHm0mvzHfPA2H4zOKzmaeeRTD/D4E+ue0K7XajBkubRAn6w09DAUrAM/SfJqxG5e2jhQimH+mJJBuWL37/gIq+tfOpcM15/CBXzVIBQbrPNt86hjOAuILW6ibvc0Ua3rpgIDPjCdPyQ9frtsFuqG6CT5Zxyb5w7hTmghEzdVc/M0vIvKPyT2Rhop/wEDhyhaOdYUqziGmqfwAcSvWsoc/8EXSvgF2DO4ONYgPUed7ZGJ3g/OlphY2veY7jwGqTsl4n8Wgu6/4uAolpD/0MEna/48Dwqwwm/x8FEtGWLrwEF3YcSWmVvZ+TVzH+AiRLkxgKGuKfFJlMdC8O8w+8dQC9OUSzorhAqnKVyDLFVQ09Cm8vzRMYktLq+8rzP6UL/KAqsl4B5Tk+0O2YAB2X8AHs1ppxGjosYrwWE0RhmqWA8HCWN6sjPWtT+JhPgP+BX41L/wQb72QoCVAvl/go6x8/iapo/xMb5AH+KgdF4ofi4hV8xGvgAAwP8TDEIIF8f4IJVOQNS/l8AiP8LDESv8CFiYnP6++ANYi9PcpbK0isJ2thArP1dE0Tdwe3p3mBApF+5//jgvWXsv9oCZJDcB4jMWJ80WpDGtNdl5u6K7TNSQxOdEQYILci6BHKAQUcX/7IFigf//B0Q3/xAzVgjSA4/goHxGnan/h/kPBsSW2nVn5UtYls4mYR8hyS6GBLUQ+nRxGyuMOhVfcOVPigrt41CbKUplg351DRadNp3w9n4lIHotulrwf6V6XuUjSKXiVzJCILvOzJ0IlDqUKnMUUbKmyodROs+pECH04GhkIGQT7/gGJTJMHwVPnZ+rwyf4AOrKTv30ErJR+a+FI1xrwYXH//ARjJh2L4SkCBa9FlrNSkZhxmgrcEhIziocXsNqRSUD2/N2DsU+u8zbKeKW2CNnbj1WQnfDMOtkQj3rySM2fNur3/D6qAJkJPIIgUuHI45nXtE2EkWrCEb97CICrBMOC67WK9qFOKB6OzuVGCCAQ3JbBM6ynFtgelCXrQXaXfPgkB8ncakZJ/rT7YCPrOiWiuMVB056P8VQpMPUvf0YmWglAinhdy5Dm1IJ1XHATwN76vU7MFzPcLoZvR6GY4dJuMPCKH+7KMui1MJnVTdyOlIERGwl31lKKVA+flhmfGp4LdcRBxw58GiRQ2qYDKBIVc5qLtkLNBj7jkqodkUefvCKP0YNkiMlf48NSImP/qQgpMFf78Hmpfj/XgpUEFf6IL9ovAZLKeBP4AR0RIxxtRpM4Xl7nQ2cd44Dbk7rh2mxpOzJaWs4ia8EvbmnjxafHz2g0fEOoolW/t+gv+CN288/vaVNGx5tF+Qfh2WtLoYIb/x5nvHQ3v/PBHmiA5KP3/wLBXFcFc3O7cs1HS4+ej8ND3XxiTBTEfm94e88vfm1V0AbGLM/jgeg1wP72lTdexdMcfA0btLPTlwesPp+QaM+T/PRiRZatynfwKpXF1H/gJBHI+whQXcjTBYKCIO/x5WEGrZnN/a1elW6yOgMtQVC/hynX6/72p155kg6vCMHakFUyEYcJJKYLg0wJVP48DH2XJusB7K8nroIiqMC9PotHW7OJigJnmGD/Bwhnf4f4IFmNP4DABD3/PgsCkV/w8a2H3+PIWL/BhI9rpt2rdOCF7GkqSdIN4aEfINoDxOACXpbg9wSqWCMLWfqjM8+EdfOM1QQvyfTncHEKv7HXo2dkR/z4zFftJ/yEIZ3gbT2+BkjgOE0JEX/gI1fVLr4fMMIEQEBIdl8teyRHp4d5uA+AKsY6bVYPJ8T5ZNO4y7o+dBX7LQCfmmxQL5B/8EBkxomCwgGHndFCAcPdgEGHkRtKIL3WDl6/4CEHGqDjwh8VDM9iYe18uYL/BxTD0Dkm0d7I5/nMHg00D/XRXBn/H7ooDfxZOWfx+6afwQfNPKi/3MZKFgfgAAAf7gKZem5P/LPu58aYpIeYAall2FhwosMn6wESAmdcgHAy92SSjU5eEva6b3IfNsWlmr1isEFzDUi6KFIBwzXBkSrcSilQEDg4OtMUm+y6kjG1XckO11WDdgtFFI5JujmFzFnLwJfs+AhbsVk0qpr7/AQrvhRidB5zW6IKAxW0zCBTcmjlaH2EjqLQCCDdNsUb9MnF3C2OGLVUTjXCpaqs3y+fqodX0pzFjOh9lFKnDaf7D1jCo8e6p1tWDQtTY7F5uhpD6onaifvwERrNgAK/BU5IWHXlbs0D27kvxs3P6+2DvCcYdEDaQfMQX8AP9KxZsiCTOzCa8rENQRWK3ZLq488T1Bg2B3iH8/Ue5n6zxgiGwCDOH3IJK4zs6kcByg6hu6zt6lwnOhgM7sZfioaj1cA/8AQFo86PPXEfAOaYenHoElYZ2tfBDpbnLtTZ9ouze4A/WG6hAAA9J+uD+Sb/vY7EId68don+AkI7o9calPWvtgYn5/oWdYL7X0j/Rp+DVF96kUQSAitXpHYCNj6CloCFZGbWQZFHardwhQEJE4A7MapABwqdTwIZLSMGoDS/RoR+mQAF2uVsQLoNKHdSu/1d8KRX7UQiWsXF1b6VgdFs0LGwxrE4GU0xXo4/fbAQoCKevSMPZptsRFwZwZIbR2N9qQ6Mhqlwbp/wBGxPyOcrzPCLrasW5Jvr69aeB7h+eULGpmBXh62V60HPhkSNyEJ80ugAYRwHUpXieriCf3o49fTNdLR2bHSEMkyxbIty03ahy9z+nTY+8JP0S0Pn+F59+dlo3uDiZ2Kewngl+US3QXDsS8EAhvzXuMhkj+04VKfVfl7kRRz13KFL6cPnKj/ADhgPhnOz31ldLza6SE0uchP/AEMCnSL5XkFGkJ9zczGXBFnDXBXrw4N9NwMAh0kXwYpdBUtVfvYJA3sOMliGCeQjrpP1NBl3nBa+PTpnbekJOKsTH2k5VscKrDxefJc6jn/hrVLw0WnkS+rZN6EAAH8td/gIuETz2EL/qVpLfhZGfSgmiyyt3vxSg7hdkU8d2xJn49t3WXmHgOWzBTplDIMZxPsf1cdzn9Wr25valsVw0MRoBEHyMW8KNRyE8JOlNbrnsp3elLPRJoOJBwxqNAlKhtRbhneHesm/wEJ/YmF28Ch0tJO0R4Acqp46sfYr7B5hZ0RPu+dIAH8BkGn+VimXrqbP8LFMvUqDAAATiHCbbrs+WrYFU/GISpf4ZFsWlDzQX+UimXprIv4G14qH/CRTL14F38DAA4W/wKLXuQ7/A2wN1f54KZeiYHX+NYtR0t9rC8JqkyCajrwg65HFc9eiX6CBZszNDf+kq69T/ARy49NrKKeGHfcMIzaLEkx8MKn6AbNMnzgqf04EXvfLcX/jkMj7DnpQdMvyLwDofqhht8Wl3xqKrT0q9adlH+3IIWAOLLZfAVx5TQ3Pt7EUEQk0ivGB1C5dCuSvTNyu4d6is7SKdsyAgosKKMh9pxCWAZ0+oIodpih0TLsVR5h0HbalWUiqKEezo0VRnd9mBSMdho2EPfj2CllFoVGRHcSfS1ap9Iqf8eCvz/AYPjgfwteq/wH9pwM1CATYU7/8BgAnRBgKYyf4qJDFwEZ8X/AR9dJ/mgkNc/hPTh/isAf/iy26/gYCm/zkDcDTL0ma+fHn+ViUmT/MQSCB/IG1z/pIkNc/h1FQZP6pQVAL/n1FQRQT+XkVAL/kVFQNku+AJkUNSIziZyvFJVgFj6wdcncUT3odmoQaZROtv34mWX5msQ1W5r/wEjN+pMXTt14ABeJ/4CMdlP4HXy0Wc/2gaaWN4CyAXxxZ4NbDzRkh+Wj6YPzMlrltWuS1353X+AhR8Q5ZUyF4/9hClCgAf/+F6Tf6KXy5JTgP8KCxBQamd3j/D9hZ3mux7L/g/El3AmuV8OIlWcaXfq6iPjCAEgyMBehBvQGtW/LdHyVBxlZKC27XZGL4K1b2zcevQ/yOyTr08532y9We1T3i7EQNSvmfJ73fM+VB+qdgtZBIlfuaPdwTrIg2Yz9rVDsU64Q7weqAw00OwA4BV5eW3RbZLweY+dRp+qY8zwxS+mATw23fuO43eyCO9p+o484IqL54kA69c9QmQZBztpDP9A/EiOEbWZQ9SN7gw2TqJq8J0+FQ3NmjLsh/n+QBAUd0S8g/ZJPuo+6tWSJk/3iNRO0vhMldUkpDFTSzSslOyUslQJm5CSiHsa8kHlTOFisSf401ZK0qA/0ya3exZ3c54eZP0IcHT1gqyVRgnF9kGNFzspge5KtzMEeHy9kHrEuItwasQYQ2svBdEdWw9PLig+aVorZLOkkNcU44AXGVlGtKTSu3WXpX6la6UNKMMc6cyvjg6tlNb0EE+m4T7UcnyV4A9Q23KRy4DEyaceI8lPMLJg8jz/AR1lRuRWXqXca/a85fwMrwIVv8o1KXBhIhm58pP5UMuQ93fCBpDxzKyGwZSHQSbJxFX+Ljv3XXL/6Fixf4+EJv4eixSACCQADBQYCAcIAQT+cosXCZdz+HIuXFspv+PAB/bnHf+HI0XsL+j8hyNl7ZNZ/jwAf8YZS/hyPF+sZe/jsAf/gN6V3JLjQjRYdL9Fx4zbx8PtWnYwxKARFS96YhvSUXveIIYks6CzXAek3gH12bQww02pQY+HWnF6nNuaJl1hSjbkXGsa2sd4JD8gNw8AxR4duAy+iaktVGjAe+wjoDTb0MbS44ITz/wBBb35UOgk5M1OmsHKJ49Jdoak6KBodv95w76jqtY/8D8WKqVUJTa5XoyKyB/4CRdq0D5L5uOY0p725pDmPm8kxKbkl91FCacof11QJni3vZskenIgxaezjPIgC1cRd2xZ+l7QuWU2uBPG9llpwX8kOs2RgB6xWPlB2jCrA/gCuBjfn+h/WswTic+DohVxQM37LJePMoph+9VYWLn6LA+kPwzp4avkexx7/wgZzs+CNWB/u0IQkwCnFDSYsA1jisSmAPcL0U+jNHLx3+LeaF4P/jo3KEtoQHZFIh/gQAEgwCrDH/uhuUJ4f4sQzCQmOBnBxovv/Gxndp57eEyPcng88QNrn+Lgq7BxhALCov8EIYJMiTcaB2nAZzpAn8eTP7RNgv1dDOH+GAq7CI2w0dGIsKsu4wp+vipuZLoBUgWf4CPzeE/vZt5lQlX5wnevT/CXwYOfQqYM4xA4v89BKcpsfn8AaCP8C3TCVlFUyd5Wji7c52AYD1nZkopy5x0i6cgXZKYIhvYHUd2UCbLGm+QVTNZH4imm+Iv02NVu2qwF8IT/ggQVOLiP/BAgbJ/AgAJBg0QSSmCDFb4u5HJY78+QAfxsGbgdrc7oMBchMJuZ246fx8OY4H+Pguon/PR3WaAQF/A2oL/AZ+GFu1EhF60YxZPOGAfRV5a/Bgxz6kdutGy1rmwLw4in38Aapn+GCvrYotaDpxiGlutqlNiGbOiIjIFoz13+KwB/+EwTP/BiHEh0fwfrM/4+B2Cv4WAS/8Sg1eul6X8eAH4AApf8PbLBZ/DmztoLl0B/x53O/wRtgDX/BqG4Auv8DaiLTkf4MN2mv9eFpMkEaNF547kjaSGk1kt1oMn0436hQRMbU94DiNqcjX6C0hgxfwBvBgPRxiyZWG352f3vc2DIYnjqnkcWkJRK+rszzdVobY4f7T0w4BQ142dHmNToRRws4y18EvVtdtwK1Cy4/B/AGT/EA5lcDjS219SWHFgg+DniUEXKjk+SeYvfwBlMaaZgCxCWJxh6H+eAezMFxP8PHBPYwLdRCoc6dZg0UnHATyMOY0u9PK/8eZkhDpM/8VgL/8Y7QgJyf4eOVvP4fCz/8sEnTwDU+kk0C7/D/rFQcTEsyIYe8ZWxUuAPf7iHuFf8fDaOYHz5APl9zA21qb+awgf/MQ4p34un+uBDUr+IuMz+E9iACP8RA+W3+CCGpb/UAIhF/kg/fp/jviMJxD+PuMwE/j7oTCT/IwguL/K2lRCGndIZX+VjwJz+LBEMDYA7qo5HE/w+BAH38a8RH8JlvXYA/44C25P8GAprtBAfwTgKJpil2MZdvR4EwJdo3X1AIDJv72aJqkEJiOjyk9d+tECqsSiEt9E2UDmDfSePnRX6YKh5i54Pw2wQDkQgE9u89BgLV0T8T3LUxc9UwTbO5x1trJSwlVGq9l4I3Hf4+Pc8v4JvlBRaqnZX18361WaKU1D/dFgpGbUmjq3ZClItadJxMKpEH+AOdEymrUSEg/9+o03IXECsABjs0duMv8EEi67J/4wJF1wQMGuFcwZBh4ci5BRUKnWI3GEOF13bLg+195nZhCSMfWnt8E//ZRvQN4J3+3jegbdKP/gBgnL3LHTEC11cJJRQyrjtWJg/Xc2DL/LmbcVswk5/gJGz2u6zgFDnZFylK5f4c6KPOGf0BtXW49scaONbgzfahXkTN5gW/z8bzA2Qt35NyGzEWW870o09QRwhxBI9cjyoC8hHpVPeIEj4XSLlXVCAA/iYDjCBztSOovN1niaGpOa4ocmVKJk0kVjtn8fgP5wL6U6fNj4/m1mBGmtkFgbKKza0pZUeaaTRLIcdhH4AaIqoLQqdyfs8qaWXGdzvAAGxNAoT2b35PVCgNES3x6Yf+fiHb0jOF5uLwKmrd1QPGd7OZOKTQyYJSBylSz88E1GV/Bamknki7lFTawczFZGAXHqGKMVkSirnIFufaPX5MOP8BEavIOd/5INckD8F/wEHHiDkMad6c60Q/vVUkyoRO95z/ggvBOSv/GBgnPWmg3PtTm00AuP7Fsx8wNEMgx0r4aO+nhJH8SqElHgy/8DCUn+xDegbwTv9vBvQN8kH/QCRdfPTCBEXtG/JSl9BLJL3mXWKxDNFxr0EFiwkTgl8JJsO0gy1G+bIsiB+RQS/Oyu7APpePScVc9RBEUDzOKCB198X+Pw+Tm/fyT72SWcdvclKrdF7cxvpvCjDCdn1O3cJUBGNdA3MJNzHci/F4gI9rRZaL7XkSZlIcE5QqMG2kT1OwQ0Jl1H/KBvQN4LO2tb1hcjMmEsl6qCgC3KNZKlPrxFJo3CCreI4PEh7RyP+OATzJWhAI0V3XPgMAEfIMRCqn//wJ5l/3gyIVAKKc8f+fAnmVyyxZvGIbriTHfbTuGjwlVm20I+yMtNa6okDaFoDxGzf+7ATzKgSpo9q+UxHTi//8VHy/7YBPMv4eDY/4wDg/4hC//9RAJ5l/GIhOgf60KkBQAAAGgS/2gCeZfw+Df/xfe80Rpv8/YATzL+aQQP+BxKPh/x4CeZEX//gTzL+S9Rv/fxaz5z++BM0v8uHLsf//i1nz/UBaz5/j4YWa/+ACeZf6UBbGL+BeoNd4XFnkg53y9pjYd7nlU6NvW/M8cBd3Ly9+wRYkiibHGa/xMDq5fIPpwQUiQgFXvyHBhDscf4oKvLoUuhDt0AC5B6RzbqwLgKke0o38SCKI4Blo4UpqMm+42z4tL9TwhyXjYSohMOq/kstH/zEDsRgOb/f5aP/3Iq8tAP4BAmb/3hV5b/Anf3/lAQlD/x4en2f5SPUDP9DFXloAXDmQNf+vCry0DJP8HIz1H+bCbxD+EAKP+H0G/+MZmkBzD+DJAEO6QMfKguEdmqGtVqNkPNXkQ+0/j7ESBGvpK4k/ksED/jIDjz/49RswH+X1afHAZ3JctiaAAP/BR45p/gmQRNf4TgRwT+FIFz+EwD3+FQdgJxLnNFgHQAP8YI1tHVR+mL6L7QXTvCeRn+9E/7qYMz5f+Kgpf8B0AABz/5Qclz/BR6z9LdnauSnSueHC3QGmx9jTsL0pBdf+EhK4BBzZsWj5eQ0XpYGWTobKZ8k1zPLSGZZv8Qx7E/gp64YQIyGLc3C9hBcgWo9ei74mR/wEEfUpYzTiJVrEj/shvlMXezHGZPZK8pSQ1rPdxvTUddiiXB6t3/goJY8bv8IC/RpnwXLr/5OHTZf8uGOqP8DO6n8B0yjPoJdYY4HSOpcPCqeOzTP3iEk47WOcPLxXD0X8PCNHzjXjNrS4ii+uqV0vPw7vfRF64jwL8X8dsEBmH+hCfwMBBv8eAnvD9/j4bzbi7KWaoklUYVXWhI9A5pIxJFQWNZVvVog1yCTJH7JyP1lykbnfJTaQX6r1jRAI+fub0IbthGAYeJ5Nye9uqmoa2v8kEoef8OWE8L/e1hPKaxqRiLA+iagIF7ON/u6i4T2ev+eg7+i8kg/+Bd+BqqJlyy+UIvWCBSPY2d1tvcxRNoPmuCWEnD/s+mDiPhLq/hQM98ILnCG/+Cgzz/GAkX4kNGQTCdJwH+CgQ+VuG3X/AQ/Y3QyMIAlKt5qqn3Fk9AaF1swFfjIRJZcUeBZrJpn8O32NvCAdbZ4/8CAAkGBVO3f90I3W/CK/xcRut8rV5B4v/jYJzLclLH1renl/85EGlFVf/CER+ByE/yIaXPsv9U+VYZ/P3lWL/y95Vhn+iA6Af8HKdRtThXl9eYOqngZkNfNVoFqbuGoW0HB1ieQCFG7ujUshb9ZbkgWdKKvjtvkgcNniPYRLwvhrP/8wGRg1/lQegRd5ZlCbFgTHb/BSRUv/CYEB8gUggfwKBAgr6R/+LD1QUCaP8vCYY/+fjG9T+HwF/+EwUr+HwQL/GRMMDZHJYc3Ea/66LLIAH/FBNXf3gABsgABO/4QPnBf4TS47/KwzxMmRJs2TT2JG4FQfw+B2/4IExFf4TUXf4TTcfzUgmIr/CSQm3ovl/BNQe7Xg/K7y/hUbNVYCFvlyLFgWAhx0D64VUOX1YRBja1Tr5f44N8XCz//wO0x/o4HaYxwD2P4F8+XwoLnv+AJIuz2KwKhiwAY0GucgWl4QcYF+0hvk3jM5KK3Vu3v4Xrn8mEAofF34MBBYb+Ua58EPCy8ZH43kjs/gqNvjF4CRyK20Y2HjyMDfv8BC8fb0zaMXoD8EVQaNstakw+O+Z8H/CosMgwgE76+DBgIjb/5OSF1f8iw1w4BA7qTzN69Q/8E2I0eEqu3bGFCbvRx14hj0+QW1HD1p62qUbGQhYKZX2exA6z9GQ9tZyuaTCZD8rQIOFQwueLUnn/lgmF0VGdb8iD/YRMLp/AgDN//4pml/zoW9Bg5TTEO+/52JhdMDDaqP/TxTNL/CYT5x6W6kNInF6jBaJMdsTV7uBcCjdhzq9NA7CAQQ7EzGX+OCYji5wP4UIewKrEL+BQ3YF/a3/uw1zsAf58F1u/4IKYQIGKLOpWxW+z5Omuv8gDXO1DS9XrF3I7G4FyUPbZ6mX1rhk3kiOntRK1gvQGLipsKpdoLCY7Ni+HVORuXrzYv+Re5CBoGbqQ0uzKxQNA3UD2H+fBrnZA/jYnb/zAI4fAPYgIiOk3Yh/ATtrCJGFtjhAfwyC/nBV0LbWt/wnuoVysFf7MMrcEL+B4JgBSD/nhlbhgfxWAvfwoO/fwULGeqH2DP8XBl/+sAXQ7/gcOvxpk1Vv96C63ZAAAAAPfwPryLmvg9/ZE+FNTnY59jlbTzm7drGk/5WGuVv4GC2P4fC//9mC6zf8IAPP//4ytQ/iDi//x8eI3/8+GuVv4rF//4HLV/4Z51DebE5agVH/aw1yt/D4b//FbgP/ExAf/z4a5Wpv8hDXKxcYhobGBj/igzkzMTQ0MDX/Dg1y6bGJuamxyZnBgcG5l/hMGuXGM4NTgxYTcwbjBoS3lLQnQ6NTRhN2JmMTlvMEJIkxxZmf/ExlacYmRwbvv8R450n9FNErzFRbKIVGM3QoTQ186R8H6MZJXLTFpIqGfdd3sT/i4PJccv8EHjaP8BuCvUEiaKAmy5YG8cu8v/gIFP9r30aw49ChgEP8F2Uz/4CuxCXuDU57T+dekJBs8yYLfRbDiZXRUZBV+zbA1Gb6hI3r1t/hMBG7ICEAowpIP4Z7d/8EIPyQR/BXbvNwvn+ILvGlv5HNXmAPgAKnmc5xQf4SHi/4CKzjxxg417Jk7E/hxFnLf72bEKQuHevrpCeGpNK4ImknKRRiJx7/PRPEybrJwfwK2IUMEXILuFgmE7CqOjoaBHDIm79+y8OAwfmCl+CFnvjXJV/K/gts0UzA5hjcp85DO1XdAtjUhf4OLAKcbZf4DRbNj+AwATigwJSUv8WINm4KiOgGWf4YMm5BA7EXeeL3/AQJcznfcqfsCcMX7hIJ91/Hy4r/r4L8r/hNHMwE2Yv87JbboCLUiP/DkNwPk38NHSYDuWBxzz/+fgMGqUKnPZ/476R9fA3hJHWwpAj/D/AQw+0k8rRdJd6ZDDGBSLqR8C2Hr7GnDMp4JGhkema184cr9fxbKnW0GOsCRjnDe6t04a9Mta2k0Ofx4BfG3+PC5KEm/x8GpbG/+co0/1f0z+fwX80tK9vcq+1jfc1b1/BylU+xyoss0odfgYYk5cnxZauglY+vyMSbp5IqZ6C/Fw7q5CX2PFtOyDgGfBtR1ci02i6Yb/j0F/ZP4dkeTjeAP6cAAP5BBf+eJABvGOo0fZKBLNW4OlfZw/YGToyXpoFmI/GvcpVlLjp1yVgeBZp0xX0S8d57R3TeDogwCJzPgNzPyoZFnxK6xv/CQDT5VMRK8IgOyrYurzQVuv4sJj3U0uTXVm/b0/xBnMGGPmEsv+KjnXmvgwSrq/4EkaYMCEfP/FxzrzCfyE+GTB/lYQHJ/x8YJ16vAl0ayEa9+t6XVr8fCmkO+D7n8GTGHQ2+Y1Aq2j8e2lnmgPB8YfR6X9uf+fAoDX+Hre4E/geD5wUU/h7Ry/x4ILYDXn/BK9nWFibtjwRv/KX3O1H3Bsm3fwrEr9eHw+ejqTVw8io50zi/4sFa9f8EHN4C1ffO0Cu+VvAyhihXx052BFMPTieBPK6q+ss/PAr5QH18MUAHbIFoGnP9fFd8v+fA2GL//xXfL/pwrvlhcEjr/4KYw02mSv/LRvbjSK1FVfwWt2lv8P6cQMWQtR3V8r+CQs/Dyzpao+nprOC1qWsUdDRh3K+E18XMQzb37QDvtM9mbo54UYDIp3M2bEpWKHZdZyz4LqmigiTmu9tyNC+RP+DOzFiEXuycBeGTub+cOuzVbsqDXykBlj+je5YjPSU/kHsBaBrSIvUfmAZM5qN+3H7BJbijKkRShfnqUJJzyFGfoUm7hoVnDCNm50OomCn6uUM3vJQaiPwvYSUBdQ48KXHxyLquiU8R4l7ecYK7JSNJX9lXeqmFjCLH/D/FOcTXr0aKg5Ef1Z6GZkds7fhLT+KNQfMle07NR67kBjIe/ai+5Q9zfkV1rZNVzEWNyzqY8LtF8FgE3YHacgGsBL6fJgDVwrx7QgyXIms80EivLMfDrdelhB/gIVVAL5dbzrCdP0FP60Oy0zJA/XAAeS82xfyRG3GhRCtW+XyTcQJfE8kSE8/wEhC/4/WjosFUjUWuiZvZYqkV4aEPPI+SjchO+8smC6Fqv+Aglf+xd5ImpIHSi0Wos6+FaCEwdx30vVlzcRgNfS1QynA/xYeKfSP+SBVt3+A6va2SLGWqV1aV+7R8vkUsMXrazDUX6UBtX04q631O4vSCBn/xMJkVd8GAzWJCAHEOGvBLfBKLUlYqmpqui07o6+VBut7p3k2ihNDlXwoJO1vLFAfwS6U3Tpzai5z5GS2U9A9jU/eNLbO+9xHTbMGvkdIb/HqICxfwB/7/42Q+T9Y//woNUMf8CBbfAH/WwC2+AwQBA4CCAwKABIH/uIH5CjaVdxP4cAH6yhQ/48AB+ukk7+PAB+6IY3+HAL/0P/4cztI4AigD+PQB+Xp/4cyOkc45gJFCap3Uq+sGtXYkKdHvO3x4LC3bfyGVrCG5S9TYbW28GJkkcAzmfc1033i4aiceZ68YFvYwhnG7iulBSMrYQMQI/wEDUPgGrsmukASii/v91ywK7hUfXVZPOUtWmNkDEMFpq0Bd0GAfh3FofE7OaTJWHfqgWt7N86Z2ztyxrpjif5AOoycM9Jdepv7clKGRDTR7Whu+0AT29H7JliRr3sINISmFWBrdGJB+LIMXNaQigI/EhU7gT/lMF4fl0U3vM6y2kSGdV8+IE/0Xlc00KCpFA4bYhwPQZKbYjA1NisZwM5HZfIXgHlV/v2ALNu2JDrxDASG0zlqkghNHpKctVJ0gTSPUj/x0Lh7uP//hHYX/SBxl6mkP8C2zl8AwdLbB5UDImqE3MnZi7++0L28E+paKyzARXaGO4ng4T/4sK0dL//CBWjoGFdElxy/KCJh/gI/j/BzEfLOWBepB8YZN6vpZPxj+CsULCZsjOQgocV5UPcHbkNcKRloHY06ohtTQ050KdBdZxNKo/hSk5cUP/BQUeD/iwKtlhxvBhCT7Coe/xEODX/wm8wZ8d0EAaybZ0Ppn8ABKE81V4nfLfZHo4iTcgLw0qkHzY2O59/BOPfoZ+tjNJnJ53nwWXIsyx5G0k1TKD6X131NGtAJGfKw0Mjf44C7ENP/aRPythv+xDp8UA//8T8rf8wJ+Vg3tcf9/ByI6Nhqcc2K0CaWLqhWyJZiXbZvHF7i6Gp7wVIyp/gJKlW71JoPkhx2/4ABb6tIrOzQttVnL67ht31XtHsqRg/Z2hIUsvnLqBI2SD/U9L+vZzrKlTUKDMzl0Gv6jZSUGaYE1q4CiA+ew1pWCmgrwPKkN1b8koJi7P64R8jbGlX7DgRbD+ZFXMCMZcIcA2Ce6c4Y0RAM3ly8KxyyUJlTvshml/8BGGI6cBwhZBlneUWLM6A7Zg6NCZ+t/gIYvlAl1wO8BEdAlVMWb8vUOnr06/yAG9YKhM4A6jzEazEzk5ZryZRX9/p+tQBvjFoIc63SAVw6IvP1n5KJL+s8nEsMVZP9+XydDRckqDMEXzLcOScZjfcc4Tp6IRojeDH7mMl6lj3UJatQnVIGylh8FCH9bbOD5og3Qxo7MjV7bGnuv1uDDUPKLDLrwmbiqj4QoWOPv8Agm/vGg+O7j3PMhzHBhlfoJYHCVB5JBmsddSQshWVewYv1AmSAAnCpgb3jh8Z7XmhjT7CvTpYZzYWxHXUPWpZtF2EBhA/wLnA6u9xT7sAujhG9XogFhbDrnuUU2fj2k2JXKsiQL5Qy+VOEoYQA9deoTWpJAVVeL63jfjFHtfAF/gpCWSi3P/CQqwnBUghKf4mFcVYYK6fe8wACAP4Fz79lPRpdQa/JOJJQ4upZEZCxacXQ6ual719dGacqZw9H3Vb/GAM6AYf/GBnQCEBAAQFAwkIBgIBB/9yBFAMhl/8OctUYs/kh/DgB+vO/8OTtMY5JH8OefEY7dDx/A4BPvN+ufw4BfviuMfw4BvvvS6fw8AP+H8OgH/V/kAwgfaFeQBH+ACpMSaF/iHgsafhbp2lzRACWrrCuHEvVpzoBhl15sQmNAR41pYzL8gmY9Qk4ZE9X17ogzC0/OikRLRB9KxES1TCtEwuaUYMB7ktPmNj3UC7GjJiXWy9X31cG45ltzlVIPcm7GCqAFCw0wVLZlJ7qH/AR2fcMSyMsozT97MzzUtmf5ABn4DNggCgEJNbdPX1bqYCJN02bJ8LzFEadKocwL2Uc9vk9Sx3yW3Vbv8BBB+kMMDTM3aI6QKIFylZacWa5sObb2bbOOgKXABfnnc9i9bClVsGhMATlgw17VpVNpTZPoJT21fpjFQsAnGseWBleDhpxI8dwQbI9Hj1Z6FqWlnbpYtgRa6qm3/BPZxu9XUouC3C5dmtqDjgeneK5HsC9pHL049R6dmdzvkX3lu/wmUef7qK8bvDjrzUAR6Sa+jFilB5aAPGlirCkHImkOtSCdouJns/xrk1BAvaWXfnVdum+JvZGmtVkvGSEm8Au6C5wgmhAMGuqH+AQAE/gMsQbroB0QioIjWwvjEfeOXu/XzK92GFYeEY1ig/gbik50JNn+F4i5VsqoRqkWM01jloeGG3BVbdJzTxuLcOLrKXP8ZC8woLCAY9hEf//BeYVFRv9jC+NaT1ioQAJSXUWc49e7J6dzfHlabdll2ATUDV9UQ7/hiMf+fBQcy/wYCovAYd7/4aQNWf8bH1B3+0BeYVvqKGei70xVEU9IxORpnuiMQiQYWnKsBSR7nauHsHBvUDG5R7YEHoedq0HAYSOy1BrXeDWBFoCL1WFfo1sS9qywcsIhBhV0YMuPTz5FVQjH5VXKIBp/mHitmnV2lCOykRCaYnGlN4l/7OxBJrPyU+/dGp+eWIpje2ZndHrVFY/ND4IHP83C+Vf+CAX+RBv4GDJf/8C/yP9/hkv+whf5HPxThgXDu90HBdt8D7Coq+I+LSSZHcA0SSPGHXM/5mPQTCJBgxPReQTGXG4CSW+0n6Q/wy4tF4Ad99orqzAfqML/JAv8iCr6JH/gQUI8Xnxu8CoIzGAxpYfN/An44o/xH+OY6jXaFTNwH8C9R71bK3bi69ZCuZrDZagZ2odeaOxnsrlCA+TgzbK3Al+Mbe/44Hxt1X/aB4ks4AMq8naU6YrqzQRZtDXBBjHnjAJnaovD4/kUjFUEraJFN3+xBwX4B//8ZC3f8sHxt20siVYC/4fcm9wOXfGNnxPhhQhC7sKxosF7/gBZC5OYO70K3bvo4RsaGhl/46n20KNyVWX1tOBFNBWPYJkFIKodxsxHOJG2q1lPRKChXgWKCsLmyNkhFEJRTtUM0I3urShAjFfa4HkOlK5O/6ALIzlzF14AeV7EmaoRcDQdj4zhrrCNwyC9TXOQeTzegqNiiGtAUCDyjWgqtvxo8Q0VWAEQZETEp9bFXXXvlFxxt6lukY8pESUSfPdLZdeTT5Tm48Z+Rw6CNbJr85bXSGV/h917Ub/mgnHLM6yKYmoNiczT/dENLQfkXBTz46ficWUoE95EKxjd5ppOZoAtWDnhY8VIysqx7l0L7L8UzVm4ckCxMrZTv+AF1PpfrRpBV8ZGt/XIjrF9vsZv8BJVNND5bW9cfFgZsdUeng/XwG9kxWGlbofFDBPsNLZ8gMKhMCc8iIuo/2ODaQRvaM5bMr5KJse/4AkOn/OMTWX85Gp7ARdq2GwGm9sK1rPQ1tE9CLNZZyXeS4zLXbAmtb+0NsNLxEfOlMWV8LHDEjR0vA/xgL9n4H8Ao91/nwX7P/j4aV/mAZ1/gn0D7j1El0cmUa1q8v4AYEBc2SQ0YiYiEkf50lQ453bSX5kASoBK/xgcxo/UIBnHy9X+/DAYb/GQJyAHuwrPXrR/Ds/CHwQj/ko5jR/gM01PJRQ/hHTUbwHz3KxBrbHTyDGQtY/wEiClnwae6rZcGUjtK/qf4mED/P4EHKQL/BBaPv/AgAJ/ghG0LPSeqDsiT0emajeEaCHVsJS5SaKHgVfMXrSZzYD/GgvTXg/wKUqP+/Bemv+IiQT+G9eUO96hstIQ42ntQPYgSy/4AbWYJY5wRNY5CxCLgRws8nNqltNQP/44L01jw8dYAQkSNlCCRPz6Q0BGwQ0Qjt/5MKLxf4av5/8cGJS4H/f7jL/pA6cJ3Lu/gW7x+C0xTY6va2PcPqFeiauYQ0WTLwouX2Gf78ikzD2r3487NKtRil7LrgzYQhKTbi1Ylug9er4/wgQhx9n/NLahof7GKZpwAB/v9tQ/5cUzTpxh/wAt4rbmjytFo5L2xSob+AX3TENb3W8edwjiegLEspLH099e5ipUH4PtKiVvZ9EeX1ruTiC0kHSt9KBsYXI2JfGYCEeacPIehSt+IyEskZ6QJqlC+/OMEg+PV9g+XphIJ76+Jc5kLvvDJhbALCxAjF4E2ZXz3UQvyj14dugV3XK8djXX6RTNHMsgU6J0TJfB0NnxmjIKSIFJyeS/bu5MCh8QmAoSwdp8gUHr92cKJnvVRJT62X/Y5fMAHun18dSJgWukmtLqB/yAGWhF8omjB0muQzfYShH7uCn7M9XQj5DSJ9ZqdlbIsMVd8M4MYqXMMq3D9X0wTK4WSFs0bateGv6Dna14NJjnCeTgsvXRLOdEGCbOOtgTDOPjm8sLPJRwBF/p7cD/gIfCmHeGzkoj2bLv7mFybl24azc0GlU6ReudlqbaSyLrG81efMpZxc7dLFh6w0CmeY34/Uuj6IX8TFkuCNN6fTGaCrnCrO1G1K97uw+V/hPUnSqvrX5OOeAaOQAxVwTvTfjFgzLpUEUH+BD0nxABcNn2tWrsCRgA9AAJ3FzyO3avX8wENEakRH85qc0kouhuwN/i4VlExf4cnGYZpHWVeQlHzgXYhKP8BBMMoP9TwOTmCzRQlVTHyqm32nSf+AOpav+ERx+U7kf8CayniCA3uKNp6PQuN+pyWLss1/gJEKd3Cwmme3dAvMv0V9Z2HjMVL/HB34L+f/+FAtP9HCgWj41ND/AoMeD/ro5Va2nSsMe43JA0w3TdDog5zDOv6mhGfmpFaFnmVw/wmGgf4MIKbAeEA3yjoP4EABIMKrQz/FRBTYF5nF9z4wf6OIKbDR6c+0exMoHgt0QrT7BcUtquvD3cxN4LfVFroPhIm+/gx3p/hMC5QIUdBymm2pnQuv2f+U6K2h9HpaF9KQvsOnZprXZ5iUn/4sK+ufwmC8/4yDMvBUVnY1/yAiScnH/PCCmwKaG90LXf/OBbwZ/3Qgps/gYMhv/dAvRg8avCNbfxDsE+IXPWNO93OrzOQC5Gi3zZZiIBrjMyeWAArFh0sOCEXF/qV/jgiiRToQCZTPC/gMAEyYDCOaU/zsRRI//8DuOv9gEUSLkA0nzGqxUc607JkTYvzj/gIcwctPJHLM9OkIByn9guGvkmvP4eBX+b+fQV+koIZHMUBTJJOFhYOid4rXqj3c7r3CVZlmJhUQZ8f/wEhgnECi/7+Hkjwr6NIP1vGEAFBLmNiIH8PlkAD1EGvRFlz7thekLyOqvyV8U2lRD2EwYiSVWfgn8wJE/gC0vItcXPMy4S7ygd8izIkcKcpkcz0DCTbSy6OfB3oOflTsk6NIN1s4DUpDO313HyzGh37kIQBPaOK7Oq8b9bv51W/zaPx1nlZ0XoOuCxqYj1CCdQlBjPUZoM2uiV9NHVi1wp4PJB0h6q7lStCUjBb7VHdU7gZbpBXnX1pfIPvyVxGLacNcu6cItuLLqvzz+lDlHX5tzncZAm1reM9wWe5APA16YQh7w6b/I3vWEsj9ybfH8pEW+MnuUzhTu97+LxNWtcuR6Bh/bAiziFfPkgd2ruBJrVEE5VtTWkjT5o8X74MknWlE9Sosn4nYsQyAS6+JtDKs9f1kTqxqNUDmf9itLXmvqUxsZ9f12jI7xur/AR3CSs4hcoykO9sj3IYG6k7nO3KcLGgNvZmIjx8GLYbDPcR8ZfZairnA2Us5KaiEofTuqEFcp8SPKGKlimY2xjxZ+GZ9rQ33e3mqjwu6uWI8yOJkxmCbuA7yNmkxmXwrG8l7QTELYp4Z6myscu6Tk5+eSuwosPeENw7UhAaHgLN9HAGiYWTo6EUjUDkMFBOY/1CPOYHQ5FgyIS8ZK1Z2eaKucNp8MA5Rux3M8b2xX5YrpnBRmYy+iXfASEDyOWA/f4CQstSZvbtJtB+lxKZtJjg41EDadIp2zruTxeIr1Bdh8ZhRC0HUuiccBRI4vu36D5Y4hTIMGeWLrXgZFehwZjF07H2j4P56J+8AX70ryihxxjG8vbDodeRxyKEf8BBXQOWhvTVdmzzPp2irQALKx9muq2MntNi7HVvbUxmdyNPgUt8bqbuNmUh5jY7pJXOmY5HNeGScTmlVdMID44Y6c+pWzxhyj9uCvKY6Y4zxIL2k2/+Jf1g77d0EiJKZRqFLCAeXi+ZkB7H1qoR3SgIvoMTC9jLQOCELoMzp/BAMqyNDFWMBAml3YwO5g2K1n4m0VtnmgCP4XGeBr/WzkAra/nRbtX6HECN6DlmvvImvZE5hEQbqIGjQP2m0CmQF4b7GsKA9HfnGm5IPud6guGZZPLkHG6rv8AGHyGOp3P8WGdmtP+ftp0qph8/Dt4W2gnJ8I3F9eCAovc6phEErE0cS9Rs3WLHBz/lT9q6mQ9ZEyn3hECQMA4x7Oz/mHs1dnaItgNZB4+AeUd1LeSu3Z+kv4Ig2iuNQgC/0qr08cZEA3LwAasUFVCDcDMppk3iXecgo5N1CqtXUyfGHr/gJEYR37JOXXKuVffiZBn9/NwvPrBdB92pmKkRBP89BgjfWF0/wEGSUGt535dOaE7IJkpPeln2wYQqusHnUxDKJ/vkIRNqXoTsp2Su3XeyGvTToeLPiH2O/GWw4FJBwBRx7POw/0UB9ZyKFFRU7Nqzayn1F8ToyHJBhMhCQeawnlgnrefGWgCHVdWliobEhVwNadHviO4BkAbd+cAVu1g5hdSAdU7Qgl2nYz7lu2JEJWpR3H4+iXlgySANwlKoqBW4o/qXHbxAz2ugEBZXPjUCp7ZNaqnXpV7gkRJT0nJhw4PKLGboab2X4+ONyuYyi1tz71Kan+Ahk5EczFJvjH9UdIyB8SdFXIeselb9H4O9i1iM4yPjAWp9L+RRAcN6CVRJ3uBxIMkqDWClIAoJWsFw5GxKHHddEHzlO3mBAt4MhYkVMAeOhmsU1cs5m5SprwGqZKw3IXk/pkKoTzKEh0oEGf0aAi7MqcljsvS6NCDyJKKMi4SGjpaIXzOzwCcK3lT87WaAr3AhjU+1XatlIsbO3lshICgLLO8fOkRyEc1sq6vUcFKZIinN4hy6ePFMmcSD9TjiYksETeuuY4Ejp3801onwlJfDJ8hJ5tc7CC7YOTeiLub9Os+EumZhIxjw8Jx6FBMc/gU70D5rK9+armDfAAq90EMN1unqK7HiiGq//wEMFW7/FiusT5YL1MZl2x5k1c6GnT6DLtidb8LeP6eR/hQqoz/0YJSoEH8OxZIaRf4+BoRiCuFPqYWMkB53pHm03LYgREF3o7i+Hvh/TmGzA8laFgVdQjbGSQBRpWiVOERB2qzh4J8AIhMf4blhHER3L8+yOP8kDYmXiKP8A3aR4WNLrmdCVc1BCgVat6wtEu7OIUQzFQILFqbB8FNt/ioqozkf8FFu1/tCAlcRlX8BgAhnBZEx/k4qoyVjMU+drDNofTtEfB9h2OzPBgAz/HAjwGTL2NfhOvufStBQZhihk0kAGrLmfobF2OnLoOTE+DujeCGIh4/wKh4DIuHVim1ggFwJ7IKr/ZAYXwUHx4kCcVjValqkEW1UBC0D/jo/78Zv+pDZAAb/vwJuIT/9eD7rgb/og6p88SG7+GNoLjbQqTMz5MABHVNGdVNZURjQ7mG5nWVlL45ajoMYQNwDIdd/sloVXsO/sLpbAo73j47YIKGP9DCktMv/DwgBnn8PiAfB01zwXEeVByQ0KJPsg64UCHsjZV4nkVGIdPJgyJz89oZTmknkVCcvr1sQODHsQ70bIoDD9vNm56zhUgTCYuMc3+UDapzxJIiF3TKDxHMLkXT6r/JLc+U8u4Rj6iPCLh4xobzs/8BIdAe9/DoZyz/1SGch38/BnKr/LwGch38gBnP+OAr+09/v/qu/0kKwaQX8CsvBl3jLHlVL4aabRttVSHL2Due6nV769xVSENqaXEVICPkn/jAL5inD/iAXzF/Lei2CAAKDAgCEg4GEf7kC+Yoo9t3b+HAB9VOL+HBwqKZkT/HRq2kztf8OJi8U3TpR/HgA+36mT/DgEvuAh/Dg6VFOZp/Dmp3FQUd/kAEkJYmaHlcDUPwphgvyAA3FjckLLOYol/BJchaJBdsK3QhtMYEP4HZ0vbo9IjNCaiQNK1HhUXIkMpo/SfA2jPcMdSxRINj3liuQt9y3wrHyahXzIfpw2C+X0QSF6hWsKqrCGevatAjPZanq+2aB4Sf5CUeQsac8wZh8FtUuSZuUeP8xd4eOn+H9Fs0iwmGeazPBXdGIev4w4xFaotx/LBCs82jKkJ46w4e1J7xwA9Vk2TIxDl19d/gAQzqAsme2MIK4aUaZY8MKvtHpLFNhVknT0RdNer1QUUM8/MDmV6ZPvgIjfDhmbrdCZQFmAxIGghds1dVaRf2pP1Q/O1ngheJN9DjS4xu5+V11zpxRsn+IY70E/v8V0/0kCARkH8CaV/ibMZ5wCYSxurNZtceQtvHie94S2txjuItjd+iGGbz3juZfwn0J/wPri/68GQFD/Cgy11/iYI/z/gXZLAf4MAhbL+UQIUEPDqdHKyFrMsAMD/HB9Qyr/3sQIMG8YwlytrWfhlPsy1SweqJVv+kS/x434pA1W/wK/bHWwtKcYgmN5Wc8kusa3Ha/SKHeIZEcZlI7jEbJ3g3cf+f+LCwAH/BBgmKNqv+0jBMXa/18YCYv+eh8jL//xgmL//4wTF/iGaZY+3tq9Qpc/0LHWI+wOw3TqZ2FndP1rQ8h4TrYltA7Ebc4mvXWpeH6LSIa1RGM7ySZ5XNV39hKtLSP413lV1sqgQbSDwHQwj5nXqU2EpxI++kopL071/xU0T4EGjpPfgQn9rDnwKyXiWR6yVEw3S3hcr/AQ5rLhVaLiZ26eycohu28Eyt2cELc5xAhZ5Pr5EBVwAmFIhJsadAexcKJ9b8h2HddsryYjHnA5t4osnw6zDsh0DXBjQYcKWZZg4DROdP9OJKJP8PzvJfOcs9SDUo+IOsAsQESWQ8aKcmqSFGGvngN2mdfOUedovF6quGdPDu9Eu3cavXGTy2sOYKUHz26CpgiB/AC9M66CCvPQS9eWGflrU3Nn9kv6Qf4CHkQ7qHqu6h7z7xikyQ95U2xgNTRL/AQ/bBedt0Fpo9odrhEl8vr/d4bp2R1WVjmNoO1D2UNaMktyPG93peIhF3A3dsC/U7fO7HdrKGW6KNfcs+zyOnJ0sEPO2gasgd6NY0mE5utCf7OdUmpKkgCqfSRZqp5tyX8CyYx8GW20sHUk+MYEAVKL4f4CQXS4ZftuLFOCauwUgO/Wzz97Oov8XCnjDnCAYed6H+CDOOKBGbPkzhmnd9JHDrgGeCTk/Vlg/8QB1yPihLfa75qNqcl0ZnCiitAG46tHO9IKVrlMhEpFndR1mN8C/wmAjf4+DrkQJPkNgeJCjgP4JRa0W3ocTI10OAR/fnyzPgpucXbLLvOIDX3l90xTBZQmLLRb+F0pQZ/kNKUmP4eWmF/+H1ph2BxeWEkCE4Z2wOVXZbvHlqOo+onwkbmOFugS1bENgLwRYJtvqKNy19h09UOs54vceiRBiHh7VmNVECG2qVMwuD/IQeRj/44PnCZ3+/0IP+T7szxTbkxVsNoQyYAQms/bXaQMwjl51zYL9L+4gzyKiv18yY23O4OBCniIqjSpRU0lrEHPcQJLSWP8FCJJSf/AfkwOfwGACFX+xjmV/xUTp9P4VJNePy1IC88A6P9ZZPItrCRFu1bDMe9NM0Cg+cv4dXyNo/qlfjD/5+X42H+Xl+MP/kVfjVdobXC7oEXl7LzuREH1UeBsYt3RLyizT1n9YPWJ3oE0av8cHckA5/e1XL/Ee3Cfw/RYEOSHYf8C2Oqz4ozKeics0Tli9EoGShUacUTAKVqF6TSvXySbSkahzZD/jgW0qtP7/WX/5MuBf4DQ6JcyGU26As9nbOvMwlzT+sexrCR2E6/yj9OswV4rxWuoe4/hQXb/wUNDU/wSLtfwx/Hek0BaOhm/40NBuaz+BOBj/90D6df4KP7ZzK6ehENUpxM5RhVLcgbnx+qB9uL/gHgWRH/CwYk47eH8dQLIj/D0Cy0fw/AsxkGZyyIDAQ38Avo5XpvDMBeCvaMeECBv7KD3fgWdUoZqFTx0Ksyn5rgB1ZIcOHto/uXHNz8x6woZrIn+lME+ztfxLAssBtXdsKWqn7JUmY/4CQzMhLyKzc9w7/jmWxdjfOPlHLU6txvsooauYYVEycdKz+AJBK+zJZm+59XQxf/BwjU6Bt1EtfwBgAjFBdJB/i5QWu/g9NgBcYZW+2uI9OH7Y3woPw50wJ/wEOaYfwxfbiaZhlw+BVOZ/AuX0zCCp+tyZyTdopyY7MWA9sjHCo9kul1nkzCpTn26BSY2s/44Pq7Rv+ad8rgf9jBuR39/75X9Mb5QAf+H8/75QQGVzl6rY6QgYr/AEPjfm9KbzaxOxjfH7x7HvkXoJNDUblSTweAmys2Fckb8UXGiUoKwr+wX61JZ41HsaILyTZtqLlYFCa/wBP/2TpBR+7WEur10TXpyfguyu4XyxIQU3Boen6HXcpoDBQ1MlFjUV3ejj8SSe1VQ7OlXb2AYW2UB981GbkUOdFiQNtSgaucbpejWrI39CJKNylQIANnXueP830knWfSqSGOw3RAQv6HWywbhbmbK3Qx7aKq/wA7zoPxIuyOGFZyCbf8PgG+hmtOo46eoya5CCk/NXXvdwKToEPmdZwTgisJvFdcoGoUzbJBFnIQTSV+Pd0915VZOg9eW/W7GQIkSDuICgyAFwXIU6hxRt8qvCgUibBL8KKhr29Spo138hgXKv9h+NBDan2aRm7PzHd5qwuct6Iy2bV1+6yXCmmDhiCHmdvckkAcGQHE/Nf7/gJL7BbLckJzZTlGPCbCM8gkNGA4eEGxnf/LilDvMpPSGaKKJ6Lf5e927MYEyh0sQbhiiMdMCTFILgBLScD+FBVTv/wUkr5/wQKpqQRKfwmKWQ1FBAxwkOlAfwLseNTbggnbCwwLesk3S1Zb1y/V2LzDLbBlPHnyRgRo5B0+I/8OypbT/VMqWIfz9KltP8vSpYh/IsqW3UIunNil8qTOXbLs+jSkX8mBtmscD7n7qToN76nBYgZbc+ODXg6j//8KXI/6OFLkXVozf4F7sHqHoy17hJHu8HPxmuJx9Lbrqdzm93xVmaRpIT9X6nlYCf/HA4WZf/7SBKPoDxqQApckZVoe8f7JnzdWPZKH5foQZBL39kPfRiZEos0n/2IGEyf/+QPkf50I1f8VAzy1qdJv8FahQILyC1/A+Q6ilUKuQr0MNSSHP8AS2SH8ixzy/QYQRGhRnWF8DDKYwb2fNr+fVlBwfaGQwSYFXO66N+t9GPAK3nlfC+gRsLI1tHnJhnEIyRdhZSVTB6Xwz5GPGOiqg26Y0W8xt+yBR3JcUExheoCkvKcrLgYOHgOlUb7SkXtXH8Pq/9FuIRJAjyPFi/uux982AHDmSyrHSWS8BgODjjK25YVRgWKSkZfOFdPex8gE3UM9Xz3HCz66hfGbkhZUZ6MjhXLRWXABbEm/0S3Hh/EDNNpLBHhuFiwzs/7gKMHmJ+rVNAcTpmKJDqcsHsvE8P0SZHI0annQ88Cw2/43wtkW/Fq68JRT8hnv31Fqn9CeFqpMhtBMIXed4vuztabQww7GpN6KadBgZu9sxE5jf4CMs5++JWcwewc+Yh11pPnmTtROXXraupA4wcXel7Du++7TkG/4XEVkSq8yDUCPgGfwfarbvWVwYBukmfYVBRSrW7BToeBOQAkc29b7kb3OhXWXVn2qidRyrRsl5UuhtZOnUodK7GwCQ81b9FCRAnA6RfsjIiSar3GbPdgLv/g43J/hch1RVtzWVQSgMAABmnt8X0Xav2OEoowuH59ar4UgGnBhOcDgZE9+2KDrj+AMjhnxegCYguaB7AddoCHT38Wya4IAWh9aj0Ff8HDeyDvCATI9qv8BgAIvQWT6/5OLhv/8pCvE4DQtXup02XoAABgfwoBDfIQJ88CH9bx/hYfnip0X/FgrxQD/5iFeKP8oB7uH8EgVP++g7/aBwD/XR3+0AAP+qg+JABncPzPh6d3oD/Nxl7sB04CCsQW+MAAf56FAIgWk/2IKAREGPIfwDxEiT8eGvyvZgEyRiueTN0iWI960/sVlaCf4CTqF1bC9/+LDNUv8Jh/z/jIq3sfssWEKJh/14KABAH/lhQCL/PxpeH/pIsuS/iHDg/9BBUvp+F7xKa5f+MjS8P+EgIn+BQllgBf8nGl4c/CYk1/CYIV/iY3wVBwQCAf8oIPnX8Phj/8Phn/8Pgzz/lgTJEKGg1ThbsNf4YGXNj5RxyI/Ij/Phx+W7/w7u1ApM/72GYhaPA/xgFysMv/jiEeVnwOQfxAeM6VqQ5incDPNj6fph5AugNw6te4A1QkFomnt8Xv/ZSGa/4vX+3kDNf3zn/4AvUlXzy5v1pph7anWknuD1VceLRrr1QtmYICLQtFa8RKrDYILbDIpNZNvuuMWOe0jc3U5i9Fl6dnHEPRDGu0Osbwn+gAuNhW/SApqtHRg74tK7oWnukKTx+7TWxSSfuV1Pg76+7ugdrN+ZyD6u4G9YrzXulApSpuJdtJy6nxCXhpRhZsY9n/8/FjpKDWoZZJTktS3MCCfYaRAUumV2agxKU3uESK9fTKjNAlHOxkWqPGrBk9S0n/uvTgyH614hrP8AC1rrmLFCjN/LFmbXg/v8fgf7cFYoSXGMIMOobxHBPlrVMNLzDsLc8Q8J7dUYrCY2NGiWSG7NQhe4AsK9ddYIa7mRfADC4QmfABoxWqtZBi9owX+eguVhL/zGC9SWHpP3OW+G+WXqar0ERCcc92oxl7UTriteEcfYjMrZbR/8DCUn+xEM1/xev9vIZr/2g/9CC5WKxJ67g7HcwXs3QdyNy1TGaeSBUFLbVVSWWpnwsNC2nL8Xgd92uLRUwHc5M1ANdlEhOwhGrouaA8ArcxCt1vp3/j8Pk8ZRkENQQzdXfTMIbp5w8yy2tOdEelQ5sM7Fe9fnRJS+XtdUtyTUd+wQXZoIYNCMzMfuWkHrAaBdvfnihEAr259/yQgZr/8L1U7pCAY9mGH8BokFvVjpGuc0sKS7+ZqFCcydBMH8AE+/d+QwCfENmon/+RAzg+G/vbmbEo27LN7e7xNQ/2xk+olFM3TIqCS/zwO0YBBk3fwL8fwLQU+SLacAqzTFiZO914BLTA3kRSdsOkuMj5c562kC9gf8Jg2HYA/4KAiTP/GARGbzpL64GFF/BVjxEtoUa4T8pPPyGAlwuVO8Nk3cNxhQsgJCCOusqTzWovPe/GDcbHLtHGIaqgNlCeQ9JpdHjF5AXh/EqbBDgIGGBRXcLUB/AvTdL+nwSvlrAI0da1Br6EQyjJFywF8uTaszaDlHNGYJ9t2Q74DxHJGwKY9ELN7yqukjMcMe7WUfxSA1gXZ+OjfFQGB/C/6wNfwh/rEz/D36xY/w/ZhljjkCY2kYqbbaWjSrtW4fDZRLFG9kW14/NPZgRCc4blCAgv2F5U4uMwXhC47SegAoqI8ErcMk/29QIdLoK6jJWf5QO1xvGbNt4DG9f5EyFI8868BLW3Tg9HF8L3wABnnH8k4kFZqev/xMHoYgXx/ghTUtCKse/82B6QwN+pdPLnQac1IuaXeGt4V5VTPHYf4uD0hg5qxJw747wAMD/EyPpT8NsFq+cIDDNlR/5WM+kWgytGjc19EyXmQH21SHbFmKcKNZv4W8PUjFqTHkfkS8AP8eIEPfW/4KBo///9IQ9/+PifV8akP4JCeJ/Z7EMqDxYt7QY1PNAlZ4Kb9QRfEE8QsMN5GwqdTeDvlD+FXB7/BRghTLWB/Ejg9/oYwQucQExghGLTErsupPC0NYWzvctD1fnD/wklm11Lq1sBHFrVj/Xxghd/jBLFH//8YIXf7oMELoVDxT+CsoIRJEnX/i5xeugT+Ciu4C1C6v/ywaANzwwlx5/BS7hH6yPsf50MELgWJ0y/h/y0YIXdokO/wSF4iCjOJFFf50BoWaK//xoHKC/xJvy3Q8If54MELg91QP8sGAhcmcC4dzP+djBC52cL7m/gm0FzYAxYv8tGCFyxKcX+BQoUDpwwr2P8gB798aSHNAfmyf0InGNxyGKzg0nCUtzIGW+xLexsZPdifXAKvuxHEnnk5EEtI4rdq5Qe+mJ4/4MF17iIN4c3LWX7OwLaiOFAfWoKpheCH0rly5r5I/f1xrrMphgZG/up6G3ii0GmWA5D4oSDm5flXQur/wAcAoABADrwmvrRHMUiXWNe/Zseq8f/gIc/j8RNc2t8Er8HgA6JMEnJxqKUcienJBnnZTtgWE9tha7j+XZ1oJNTD0GaAOOSLIsczMhEMk3+8p2BladLv8gE3599+67b/gI7HAJOEULQVlG+0uOfWvGqMbhego7lmbkZvm3EBDHZy7RAyIQCGO6HVzOLPftL+6kuNmibFvxxtAZIUYs3FtF3YsTJRXF9f8BJ1XpfU0XlrCiYlGKwE9HbvIxPB6NTfpnHorJEjnpFgagf2KeScsYf07ldlJKFkXL7dEUNuXSvlPETNs1MFQya02RVTYH7NmbqrU6nzK1KBwnQM0g+9xzONkSqB/gI0fVMF8Yf0Ec+YTAsLQ59dj2pnAwn2r+ADyx4rccLVv/HiJXk1f1ToIiP8/aCL5/L2giI/yBoI6L0ng7y6sl9ZmhvBX0c6Vh8RNoIf4KKuZHb/WxVzJ/CYDt/oIq5hP4D6/KZBtcrm51p4fs8eb/HUeI3viIjXGcJC8HkSIDE+B63Hr//HA/2FMf/+IJsf9JEE2P+BA3iH+A/Iyq+3gyX4fbeb1YN04rl+jO7cxHwsOAdnrGWC3N/yVaDKj/xwF9snv/hREUeAR04/xYKhXf4gL7ZOBeYoDmH2fXCvsuD7VMK7uvg4i/P4cesanpngOzTTZe4e0elAGvb81VCRY734sbxzH4RA9tiTdv/oynf4F9W7JW5jGW+fWz3JF9ORz16I00sMlXj6GvJx2i2Mikoal4oIwt+MD0yfrOa/HuE8saa7amikP3H4soGU/x4TM1jxtYL/hT/xiGkfwmA3c8RBpFb/khEbb8bpk5kcmrs6fhYKMK3Y3HEtYE887pSQwjQUd3M1mJ14O+/fYj/sWglq1YmP6N9XEfBO0d0h8/OBEPsE/iJD7AywCatytSv4GtP8Tgkm7Pjjyb6TDO/gAVs0+UWC9zniAQB7s+2IxfRKz1MOu43UMNnnw9KB74uiOtum/hlIKwmtki/8HCgln8Eyms24fypl/2QFwvJ5/gMOBMvHIBEjBL/NzDsorh6jwkkjAkRHR5OHfUoPbR14Q5zZ+6RK/xkPwUA8IB30FIf/8H4KEYl/2MDxLAKq6PmjcxyVixHEnqdDoraxjtNZCyvgKlfJgquFznY0/6OB4lL4/5IDxLdiMewFBc8ZmnGUNBPB7LbG5bdZxYMNwuKu4gYaNOOBFNm95NJpjKuhh1qNsGDGpXlKcxsI0+liPv2B4PauUDblf58H4KAj+BguX//A/BR/f4XL/sIH6innPsHB2uOdB4Ttgtzpo8D7AXJCz6ePkrn6LC0cz89pnLqlI4mfuwuq1Fwnfbi7Kzd7JH4pcagFh/8BJRC0stxFBaiQOAhvCOhfkrbJvkUWuOVK/EQxxfwBIb2DPccKi1AdxrJOVUI424R1TppVTZOKTFJfonkBl7nG0H1expYrUhSH4dN7XKp/kYH6i/hQa2/gNgeA/4jYHiBAH0gXgP4G0YsjeuuE1sCaMpBtvyTe8LMFlvmOUtiBjV1oNsFJF44AjXA/xgPsRYX+iB9iL+PhVH+YBTH/CQK89n7FOtwSHsNdclz4agsfRkgpqk4k6M5npS+yAxKqPyvTK4aAERYj5e+zxA1lDZG3iJ7P4Gx/gIW2l8P4F198Z/a/4DABHP+GWbEIlMs/wEah+VMBgSsKXwTKjn31hdHhiz5hIor0+oT/84MJio4j+IzT7/NxODD/CYDv/n4nBh/gXyF02TeKgJ0z8BIUqV+yEv7gB98f+0wPg0SbhAJ6V4mu2Nsipq1/jIHxh8OEBB5cdP+/A7jfxof4CSBYIr5vWzvg50Q5gv0Jkz0bABpbpNqQcoIcZlxOhrwf+OA7jawrLmCJECJjM54hM6/31G65Waw1H+ggdxv+BG4rXqz9jE4A6/TO4f1OfyNJY7EsEdiZvsEVAlili6XNZtNL270/A3/Y6bep2Pqvg/0t7Ka4sf5/gb+/Bf8BDUgIHDTGf/AYAJsQYPb7v8dT9/U/w9/f93+PhtCRV3QIPgvlYo7EhEWOYTHZlNPTeQAJuZQ0YOkvsEcItAZOKhBCu6ZrU6GZldoVCmAhzOm10aBfSKO61q5NLFmHj3+IP7//GQT/3XfwHjnsP//gn/vzz4E/9gfR/D1AjYf5WGZEv4ScBSz/BQxhx/CT8D8hApAQUNwgFXkaHBhJrff4qN9P/8FIpR1yD8jm3VgWpPzx4DdHzuXMAbM3uL+n/rgfVJ/yK+M3ax/iH94l3c5/pY2A1v4TgM/8eCvH4H//4M4R/7kGcI/5GRU9f/ABnCP8CQNz/2oM4RAdB/RC7/XwZwiyf56DOEf4RAo/9RBnCIj/gYGj0P9SBnCKJdH5D/BmepvOSzEnXvm3k/sA2NdMOkWiwOqzP/pZFdd/h8Hn/jEEjEhr/PgZyigf7EDOUXiZFPPcQZyj0IAyJDX+XAzlH+FAmMSGgASGv9WBnKP8CUde0/YNFJ1XCBLxLwu5gCdM9PFL5gBTBlGXvHSPaRD1iE4Se/+AlR3WN4TcKe/pc58pV5AcBYYOPwPkEUgWEApBhHODAtRu/hUhOsVBW2BdV/0YDxJhmBBdffb5sZGU9p2AcinMKVmn+AjF1AtsGlwzzcj3aWW8br+DGgT+EgK+YEDiUN+CEH+WRTECKjs4zCJO0TdwOLj6FRWAfb1EPdL/iwDxJ/wmBf/40P/hN7bvzse/14LyG/6+B4kwtnYXMtc3cZWVwZXL/7QPEmAfwKGTf28SHOL2Mbhv/Ep9bvSaNm5Do8pA787YI4cAcSX1CTy6Nj8QCUGw3WioyAyJgafu0MFKPeuVO6IF3fSqZw6La64G/4ME/063/agyUP/DsS4189UxLgl/P0S5/r4cNNEv5AiXP8cCkxL7//4cHj/0cODwY9ewn8C4VPNykl8NRkiKf4CEOakhxYKdDyZrZVYUjYG2LrefTFgaU56XTv8cDEfRr/tIyMJ9f//DEfX/2jAawmE5+Iv4Kx63Mzj/9/Aoy2lQ5yhkyqV2dHN2Vk8+Y/qBB3X4BmIakw0TluBlTKhXr2R8l9Rt1sEosotqJnXsCXP4CO5CP4A1X7Yr9uy4/CosuPoDNehIjaAYuEqaNZ1R8plwfl2e3eQi9zF/wEQpD3HRWJBCNhW8sP51qQGdqxgT3fv1tyP3elfL1VA02d8bdBibmZAmhE6jQtWrhGyMprMnWIiOrnskwzFNxWlLRvrvHTu2t8YUqBEPFXK+9e6rwYi8J3UWOJMuDDEePIC1NleNYnjZt11rf/D/IgDGzjTQJPoq7ZvfuQfYLYYLKKo0ukLq7MC4U1mIDfN6K9koO9GAR20QA5QKvO2Ft8qt13hi186A6IA/OFtcWCBBYG1KAL1+u1k6brBIqz2QNmldWw3SF/RLf4CBSGBI3lpUJY/jGR4qGfgyCXYSZTN1y3FbUYq3mNXVNjgqJdyt+XKxwONVcYGTj8iLevLE8tWhl9xOac3skAc/ffDszeI2H9drwqJTBqlrtuF6lIDP8BGV7cwZ36XeeKxIzKmFuqp2tCPi1aeEuAv6bR0EOyBzZrLaxbzQfOaefRj2X8C0Sz9/LbGr/wmA7fyC2r/wHsi9UYJtrTc68u3TPYYqnMMe64XjM8v8eFvRcSEnKTsNOZPAX+F9rMJO3PWaJJRWjy3lGqKyn0oF3/xK5IQ3wnpCetpUB/CpPF/gwUSlroQCS9br/gDABPSDB59a/3oMj1Iar//gUSlSIX/GRxCQDmNWPe+Z8f//hRKX+HwR/+BemzuX9gIYnxHV1Xa4a5b7RRmDcyvjmoUvIGxLKeD2aq8kp3/xYj0eSX+TiC9vx8qpmiOKCRFuZgLQXznrLDE0ldVQIGLNVU+8V3e/7dSoRSFjdOyb4UsQiWp2048BnHnY+nX4P4E5+wb+IoEYdiMwU1q0jgMA/xwRTi/H97Fk4RTecEWib0AOautxkMKEaFhqtt/noIKIbLjv/4F8pPaE81hi0j/3ScbMwEyo18GOs4s9p3+uoIBEt3k+zHlemj/FhsrJgf4QCw7zVFA2Se8lOM6CknwAnWKYIsgiqkIZN6/gfYNr+CvKX5Eu1OmDEMz7ZDaBM9HwQ8imequyAtyQINDHrWm28PusSAhRYSxd7WvthN9dUfZmwgQP+6hO/wcCgDfxAL5jJInyRti/wN5Sfo0NNQiCy+AAMeUSVOhGRRq2j3J1oGWB+vW7fGVXn4epf+Ojr/FpIQDBzRWhARKSvmDBJ0L//8df6/50Ov9c4HTpJwtP8Hgg0BLfw+XP7uwRvks7o0rfH1LeZv+Ahu8itybKfuvTCBDBXmr6LISK9/8MCcEr1619aXsAWqFMB13c3nOqKJMR8tX+hjr/WNkZW6z9f/zx1/ro+sVjYFZ/H4M/cRfjAc3D9RS3/VtiTRPl8Q6l4FGC6OrmoiDWuq+duH8GNPkA/wA2HKJ0/TqMKOPRSQUeoILVs4MCN/74Ov9f4TxHPkBS6CByuEAqN74YMG7cz/FiXnSCgn+GEvOn+LO54BnNcENxemCsojf4djziDT/14MR+f5YLsXv8wGTu1T0z3+1gPn7/y4nSl/6EGEnAH+LCnz/Pwecv/KSBR/FgO//j4/Otf8/Gqnn8YgL/8wA34B/rwU6y/kwG//h8ya/i/YI/gjBv/4IpIPIBqMC+/IsIxfegQEM2/NliAMVzLHA3miMSjW9kgIBDOWnqYSKZRLSzAcaGEeGva3Z/bN8ot7+BbBPgpyvhAZlB2z+AwARzgwiIiJTniX6MocarK4F2UrAU0d1ne5hjkyFH6G7MM64hAHPtFQDAf8yB7EihqnjcXmOhdulvSYP/EYWYBHEooXVLJOn5v/4QJEbjlKTySf/fqrMBM+J9R4JDW3N27vlpFigHpBhgjxfkCboIBroQC/HcgIMG69qUNO5xL702QhrzCFpJJORpahOdYrGAuQJEAtvhIP+a/AvUU0BqK8kJy9JuV+RwT7gwFC2bw/5dGNwFE5DEfGv/awIqd/hhGa4e5I99xm9/hUucAjk3a9UzwNb0cP/Hxk4d/AZRwAp8jNl7QNbpYAQGfdj57o9eLXin+Chj5GNj/77bhg/hQXT/hMKa/gReE/hQVAFAAB/jeF5oblCSDZ+jhSuUBWcyvOtZ/pw0IN+LKBjv9WFP5X8JgN3+ggp/K/gPyr8gVe0qCuG3UpDRjOHyPCzqyjilkcb7EnxRZe0qvfmeK1KUSokdEN6PceHxZab310Y3f+BPibgLw/IlOIIcBpegCMCDY/gU4h8gljBcsemTcSbD9f92vLX4Igt4IcVvXrrJyt2hA/Qnz63xoyhd4Vs80li2sxOkR5IDOqcWXOT/Bh2Ef/A+VO0f4yDtgWBo0V6mkoD/HQ2DKnH/7EYHusOxgDMa3PtiK68RCb1k3TLi8I/z0Rp+Gxluc8CdRPkG87iBseKgXoBNA1cCzjpzc1ed/SWWdf+AhLczivExs3AE45X/HR3XbS/wGf5eH8Cn+4EuRf+bjuu2h8FNYC3+HjVNZ/h9qTQjcXr6FFUvZEKOkzqtmP89tpSR4AJpcRx+iwSGW0Pv/DBrLdpN3cIV9WyOeeF3D1IvTsvjuK4C3/oQ7rtiv4ENUtf+FTVPAWXLmPTTATuUnt6TvuL10e3ZmQnz/ikOMYaP/ARSlm8sUBgGYe55FBayiEtQoP2OKrVT8aSwlAfxSAihIZEaNNVfwMHpCAI8gAAGK2NxoTsJRky4z+OVXYtb2nCB2Ze87VCa93YWH+MEXiSv/gDxnNb/4wd87/CzIj/ggS5hr/8jCXMITCuQ//gQ+ic/wyCXMP8Jxt/8VgD/8WLu38DAU3+cEIIwJg2OymAyu/1xG433iz/w9nlQ/+0Dcb4AB2nt8hTDwqk4VCP0njJtryED8YvlCnEGPD9U7xrFRVpeHa0SjLF8kgmt7cy93TqrQf18BZwTronIH/BwljsrRX+0EeaNxA7q1+/d+fULx6BzSpM3W3DH/wEsKswMIzj7MV8QXNeLbMXCH/sgGoR//8EWb39EQqEoY6WZ/BUKg/NP8P2c4MEUK+3u47/gIJa78X7u4VAfk8VEA0S48g7eWItTZ6t326c24IESbX13DNnDceOX9gui66VI0XjEx+WaxO+dzo9LPH4PixIhHtpJBcHtrMrsWwBcuLXleb6TdsTqkdIKVkQT6tDMJLNMa1XWI0J8BIVsnvnpPfcKKVOFL86SecKjvtS/AbdKRyp2LJJwXGstW2n/PmEjwuT6il7N8ofJpk8Afy5GisTkCoGP+VBBs/hXH/AQxe3vSW9RCmvnI5T1RCiIqLBQcwx/D4G+sHtHjm1hjwKxOog6qAqFL9u9Q2Rm6Myyv4C/fBW98rSzWRvpG/9VyrHCYxeca3319gjmJ9jp3wLvE9hiG/OHEcCeo0jJlNSTy4O1myGVmFzE7Q2P67El9rtvoYqH4e5r0O38Aw7gkSzt3UTNwbdNAx6ClqcFn/ACgOlFKnIDbM2wUXta66OICfiVzAenP/ACg46F0daEQuqJh+bndBYBCnFuguRqtYGNahB4z3JpUUSqnzsiqPvD4nNcJaDEoBTHMYIdFq5eIsB9gd49gm0mNWrlFg8H+w+iE5OiumPj/g5BGdgEnPYQI8NGD/gMAEioMDDT//WiIzsAf4YNGpf4TAJX+DLUN0qatmHyv+AFRIwgBeUw5IWbtDmyXT+KwB/+E8BfPEmcVCzvXjgR4Rks5w2MwD1cM1n/PBgG7Smf+OkSkKpOGH/isEf/hG4wWlP+OxuJp7fIW/0AiM7f4wRO//LhAQ7Hc3/mR29pAVuz+rIh/jwleSz/89Fexv5/Av1J88wC3MXlIuLfS/x0HcLpiWj/x6CB/v9BhA/+MiHzZ7RiUK9e/3/9EIog7z3YkSpa8Ur7jBFDxEXz15/gIj6lLDG/gQ1tBYQD4NtS/gMAEWv4YNbQIHQr18EkWAwKcBs10pl8R787qtdd0RH6RWJF8vf4pNs2ly+mMPKv4GRhhrcB4CeJBApPbCGFFIiUQbpI/yXviCeKHhaQAGLdDT4Df+B1+opoQEriMr/gMAENIMEnYj//x4FP/nY8CnmUP30xE/j94Ijx5meYhSs4PJ29N2V2LXk5bNIsU22noVgGZsGBFg7T+DOQmpx6x2it2zxi4SVcpxaAT6f8O17P//HgU/Pi5BZhigSdMWPv/x8eBT+NH/D5cufl5VRyhc4/nTsucF0buAzFwY9ltsL+tmpZGAn8o2DP+GCs/x1AnVFRO1RgGVLV18ajuxE5Trumv+QH6j/BR4FNn/5oPAp5zmv3BMJMPTR7/ARlrZrWpZWCDU+ztb/AyRUCDy42fwINk/wykFQNOxyz/gIYsu/gZX9HtJZwTwoVtejdcJXf2goUXNDAhilqTMaHygr9d4zRlA0j5LW7ggedn2zPxm6vXXRRLlB3O5/4Gqfv4gAa4YMBTRhGfmAwPTMWiN94HOu5UYTesmUy0gbwKkcPkELEGEAtcBkoMC4lf/yB0Qh/CYHT/vgyiEA1vgv9fHRCB+Ia0ujyEbinKJIwqIDlVcBssmK9QvAwlUhMnYm/8ACdMPv0QiOhYn9ZhbBpFomcA5G00mmcE0R6MSLW1vI/B8CXBCeupqhhHqG9sU+eJ6X73+YO27hBivq80hAOukMGTYwWfEo0Yg0EXZU8mPxwPuzlZIfFKjPGvExr1sdPetUDvo4BP/3zFm6zbMG0ncZhBQbkyDMxQiuyoAVQCv6qEZILUbGI9wlLy30CiPXCMm3SHl6ki4wB8UY62rkdBRdgwYDYLGtoUyKo4nhsrFwtPyvRi9IU3mBXCyWWNvxwX53fKx6cpETZJ4hORv0Qy+BkZbVIK9RQIRutqAtnDKHqWFyj9ZLhpNtamppROY5308Ws9xvjFARyvxeQsab6TcLgNricMrHmqMWTSFNfwAL5eyVFuRSu57NR29qJeSyTvajWd4sSEI7ZAPUDO0kDQKsF6G57fEPqsJhYRRpUIDgYqlPSFVLaroUJfASVtkFVSoj22NZ8duRwFRL8CtSBOK0u8Bd8DB866AfPuXNHHVk4hWI66zTuQNI4o0lu6jEzv6bVLMcRPDyzdHIm2rxZDoW0mbDpI+MeIsxrg4+k0juM751BwegI9xDCOn/gIUdk0I/pKfPJJdGDBAqutgXOhTia5kKaMwRUC7Ui0qeK/mhI7B4BQ/1qlvB+gEsK1JTWLx0krl+W7oYcqzlNQl8CJH9arIaIwFW886QH1XOM+7E5fTwEihUpCX7aYcdJPuf8/HRCH8Eti5FjYPByiko/M8UOIDvJLTzFq0MIZnNQGA83dxWuNeS9ObIf2Jo7nezvER3TkDZFIfa1C339X/gwz3S/iEStrvIKQf9bID+BWUAl/wEfA3CVdCpgE3cw6ogN7PWZVjBQgfiz5iR3/gIjdwGKMk4IAIP/NUR3caM8DD1JA2AvuI5+/IK0h/ikWEF1DMzx5N/xYDLutkEFDn+Xi33v/MQIvkH0E/gpSoE6Eh3XFK0AfmYHAM4BayA0Dw2iwL5BHf/BRXhyMnb2/wGACe0GBCsFKRzBb2yWx5qtYx152VuW82CSMlA3Q5J+/RDfNjVcgiIrjp7/GDA2AzuS8h5O4/snxlCU4c2p+f8vSkf8Ga4H+JjxZ3+Gedb+BQOb+Pinv/CCiF/zxqpphZ24CjZ/lhJ8C/hMGy/ysR9YfwmEO/wz19eV9YP8N9vhJejurkd3+fDwkkBAP93DVsn+NlFin+M9rj+MQGb/0AhHMwmkGRF7/Ffd9/FgP//IU9P/Ar16U7nWadDAq+pPLKYlOW8DJyq/OLG5xDHOAuQtTAptg7eCLYikuvBKrTBsQWZeJjkQoScayXKf4QR3NRz+aerD/HyS8R/oQM8Xf//BiCX/Fi1QKAqdPe/woDKJL+dYfz/1YLlqTG6Qqkr8ddU5SGtPThfONOugiqFPP6CpKD9AW7+pVBwXjwAYKsl9mKzhlPOoqqB/wCT2CTB6Ci9vWkkVey7nZ2IS8C3kFpptumA7LM5ioJ6A1bRxhJYlJgJFWf7PY+b0wn8hbANY4TCKanH2oMoMU7iGvvK4GcMFc8WnnRt5WT2atRApG2cDl3hcMAW8U8yzp/UisVMFuQdAdQbGKyoDhp6FgPB0cz/kb7EPznAE/4CLdy0CvIVXYmetK6m7XtRZ97UD/yAfgdt5DyRu7m+wR5uEuATBQ2nA5qmyDTPQXO/uOdLfj/cVqojplnVVLY2JiXlbY017kOlSwDHw1fm5vBgaszhSBLw9F9oohjTT7NWaXwrV1nej6Q/jOT1bFWFDHShAh7OkHEVCG9V0ZRLNNS07+FnfIEw1UyvT0/4djjLjK6E9uxXY8CMg64h6Aud7wMzHmufS2WDAJkaNvTnpwVycYWiJEv2s5loUiWrWdCarVDQb7RTJ+zqN0+xBVFTu69SyhgdiAN2A4GILMgHbyMPvEgMqO0iR5sWRIQ56fxSfVgOZ55pPc0BgOcwmLx4Kanxe0hn9HdDJlYntvjn/FJ/iEKw2HgMPf4KRyJvIt/wEMc9d7FGoi/9XdnkNs0j+ksYTFz+RFtu/p4TYuVhFCbshEndgE+qi4XQKF/gJJVWkhjDjsWoBH4/wctwPE5Ll/wGACaf8MH+NDIAn5pBf8DTLQYr6w8hWY7xA4xPwLFsObS2C4marjvidBRTHpZWbk50TuRPyu2xyoy6mElXZBe/uYbsVDwVgBeH8SgNUOAhKgFLsJwUB/johNwT/9JEJuH3/nghHQgqRY/30dzc9iI/cIlPB/gJVzjtM3ANE/ENIplExMOXY1KcqoOWgxATBN/vwO5ufz/Hg+UDk/w8CvrOwy+IEZHieprEpr0uabste7sJ0yiPKztJzVF8Mjwz6hP8fgv+sDxBVBZEYrA5zCWJSE6b+UBI7tDcVBbQigps0wR0uANUa7PJAYYqDMm4TllHQIQG733n+7zu+FNPgiM6Klzmwf8PpNA2JtAn++FL/gJCYBUX5g23hOddAySQnXs8pSML4v+yTDirq/1cQm4fxEBa4P/oI7m578n+DnNEcAAEAB/9WHc3IP/qw7m5B/9WB3Nz/GYCuA/xAEX/xqBXgXft+oDEshaegdtbgXq03D7NUywyn2qPvBHauYnd/zhBn+OEYKfyNbvQq8bvJAe+wIEN923VrrrpcNAeRUD2+bhm5AIGARoKndPz7v2zl/tJyPtxkaSq2Z7dFgiP4pF9UXXSR03e/4GsvDnfrwOkmIzFLztekhUgcwsfW2pkQBDiFZ12d/3l9FdjpKnxTL234AT6lr92PADdRGCCqAIfz+B1r7+IBsw10lBcdWyAwP4Wy/bB/gjv2cbklILvDBjfSJ9jC8XSw8AxO57KGTelAffKqoD+CchU8hdRlnZFTL45usSUebCRgLcGY/PGjiGPSsOuG4uUV8DwjpVBpiJpKRVv+s1iFgqNW2y0bOb+KRw5BpoyJ8IkgMBTcmSw8Qciikmd428j/QUIKWGW3P4pHVAFGqivEhqA/x0Tpf/9uSPuz/0fFv0CchlM/4Abfx9JB0STTBAYkL/LAVjUXTAaZlHo2zFvlYs55nSoiThniL38S+1q0xAdKCu/AfwMGaIHltVLOWPGVu3PJgrVABu825EDLyTuCnXFH/PWAKT4ZvkDo9i+AFWm27cPSX2gcmCRuk0Q1P8Ug6pbAfwAdUxtXw/wMptImRT2LIt1yLm7smROzGt4RU2UB7dofV/wAdlqRm/8yb9j1qTX1DpC6RJ7TnMCuxr1oD8vDID9NM/xUEgjzQn9KdAYAhAJXfgcIJHJa0XQgxEsjmzbpz/4KBhSf4kCh22LJJggkYDAh8lqe+7Ugu8dWVfhK6oNafWiEPI/wUNu9O38FB4PQ9/5OQ+j/8rATm4AAADBP9rsfwMhKJA/ooXgWz9p2AABrum20aQ8Ew6XTfJQRuTrBUl++2r0w/8KAVX8B24LvCAkPmDP8BgAgQ/wzApwVlbDhOMwGAnrVYLbKisu5n4i8Vya00ITLdpEh/FRYmL6BpzrUBgV3qT5rqhIA8mFeln4GI+Y5gymk4fxSICGHwXd9v+fwMYYJjXdXIrEQTU6B8kfDA1tBewvy5TxjLVDOgCZsjSymoOiDQy4fzbARaofBJUBFBwnN0GNQj/f8DuR38QAe4CHR6nOTOx/Ay7qnmIE+eiWEcLXiUgrf4CWZoK0aV/udenUkYB5T/Vejb+ddX7cxsMVfSbTCc2KPlmvs78d9UTOwZEC7/4lAa4ZWST2rCNaAwMaJWkA1hpXBggSmmcsue8eLAxyQv4FCU3T+CgEWa5/5WCUzMViGQAMCt8lWkyUDVkzsiFn2X01aWp/3mdgL4/gO/wgT8/0cGCeAH1/jxGqpXP8fCUwZ4ZSaQ4fwBBWjW3ERJVN3btJNu72iFS+wd8hDx6SLGslUHpBCpfj5EkOsl2ecA/jgHNaB4V5oYTkotP0/c7FiyywSY/5IFlf/JTMbBzkwo7uo2wAhRNnjr6AVnk3UsMMpfYOz7AJJnNHK//xwS0J2D/NFre5AO0q7aP25WabpPPakbZyJFjBZbDIbrQXN87uA9lUCM2C3+xjIAABAP7/tb/6Htb5Vfsf4KYzhJC4Cv/h8xi+/VOPYARPcQDFVv6t45/YH+ZU4zl+6O0SI7cCk0BHu3+XsrXl8zwG95dYGf6a/iChIPzwnJghTeiVMUxN6CmO+6AlepULM5f8BITqlIi2DggA1qJZRrKQZgE8xwaKqfpeY6GKubdnrfgwsXiT8ppII8s9PAQExD9RDKrIlq3zfwQc9cavYSAJJPbC3/wAtcKDzTHeWt6cPTqbLsO7cZ9q3yoDGlHB0p0lox/k7+8Erdc+TOozaiPwKSMCxk50BTYVJ0Qd6AoJ/xAahcLHhbpWiwaAGV+wyXMSMRLDa54txuzTRWaNkf6zknqZfKrwCibH+v1OJtmZihWEyIxnw4lbAjoLMsd0msI2ymj2V4k+0o06G6fDGVRyYdPgy0Jg44KJa67o7LAtXQpPceeaHml4bGAo+96jK7WyiZ8FvqWpnGRYUmyYN1e3abaiJsetUKnumldgBWMXrNUibMisOSa2Bun6d9UDBt7vB96QIlKQ+qIZplOkTa3/gJZtjczN/wEP1MhLhBBHr6x7gXLjavCvdgGes2BPUGoG64BO6PAAEX+4o+9Tb/wSMwDX8BjMpF/DEXfhbl0sGD3/CwMwfwHrluv8QgIkOB6+jDdRu7fwNzPK4G7Gc83uCQ1gThRX34W/JwFUiukyo/nOBCyK9ruLUr63K8aOGOaFG/AGF3TpLI4zVNadsef8Dp10i7M/8BgAjf/DEdARtkQNo4x/4+TEy2z/qRO+eJ/78FhV7r/XhXpWJ/6IOQnPJbIuysWh6CdksG1Ko5RaPOlkUayeWhQizxgKEX8B5zH63OII1jdSDLMgJtBsKVCSKyhLGJAdAvD+JQlWHAQzLUodogaAwGenZcPMNWNXavjuVf8BJzcW+FMnawdf8DzHn8QBQ3dX/BsyLf/DB77hfwQe++I73XfUEVSwt9+uJ1hlSBoPzPWtQyb0TSJp8X+Ej7XfyX+CqObeUNyDWMJ8KhCu2IS2uaKbXTWvllL99ducQH6zSHudQI1FR5k7i0bV66IRTSnnuauhP4oDSKmxdtLCYIDAkJuv5aNM50fIZsPPDVuAl4qZ6Cu/ggpa/ggOO/lgpaTPKkXAD+Bfddjldhaq0EBusVQnV1MgD4L4lhazH1K16QAuFNrCEz+8PKhX1O64CjAOCYKf+AZJppRnMwqbThU/4oEOEAIrkUFV/4WA7/4BhOn+IAicDy60CKiR9/gb6IZiDb05T1nSjgz+Ps8PfoJa9ve0F9VJto6PbmuBE9IPaloPBd2Rw/R3FkAUw75lDOOD61snu/4pCkyoPYl5d44DAaQGD/gB21umx4Rk+9eAHcNJrWvB14rz/BhgSKBwgFfvjz/AYAJBQYE49v8LUQoB4n9X0Sn8JgeX8+USgB0uH8vUSgj1zfzxB6ThbgHqA8YPVAOZ8rfe2mp4bVLE4TpFEcXUxxkDrj+qTFQ1uDZEaDXGQSZq3mvslTx/+AkH5SbqNfN6BhCnv4AtofVoykzfDg15i+lRcCyE09NBWwL7LQXEu+BtoQphr3EZKOzjhI+kAQV1dCfYvo7ZLHsDQ+bIMZCrMAEqTDXTmIKBna/IAgHLIrjYisC4KLHjFyACebyE0hfNVEmzRJ1pLquqI4m4UtjM/BWL7wYITe5NMkjWt4CLJSO1l9ahilAlANgFRFQUBtqlJ/Mcl+HkVUqrnaHZxHSOE4X6/SRWmjqtvLZI8YaTMofHWPDUHlpNi6hVBQvH79D9M1cyKGpYBm9XTUygDrWHSpoGmgFMGhS5psiNHz/Z+evoysoUoWzJQSdiwxd1Ej+YI/PGF/gCDCt9JsTe8ZHEwh/wEKYeWkcEQZYKAhYt6OEZpE24OKVc9kciDEAtJHy5roDgUrCidKfXScoDm39/0HLJ/u/hCIrr/WyTLHQiQe/oP4w8+vUmKVWVisUg+4GWe/Wk+tj4s78qEi/73nK/cYvrHrku9QSQ+vpC00ILs9zjn1k5dsm+GeKDFor3Fz0ciAkfQRafI9k4yAzO9/T1azUW+fKfXTwKrq6ILihBUKT6b6vQP0uRgYAlz7R83yR/2YsHk5+MystCbSEfbwpCU4Asvx1meZQon58N+3vW939KuaMq4Ko5R0BR2WddBZGp1g35fmc0oAceGUsF7WTSl5r9/wL+UtSDoYRjLEoe0A5Zm0kQw8qRytC3j4X8Om543jfsmBxCCn+AlGw4rMAFs7pwR2pcKpLK7u/OAIfDng9X+B6FD+ICV1qoVpWxs+AwMSN1/t+4fJZ4Y4e59oDatyEymqFP4ISw/5hGFIArqfT+Bo1huw9DdworgKZs4VQBFiIaN2Q2uw1LZmU5NhWt+4YrJoytJezchVHDpvUcf4Ai2eHyu2HQwKifa4H+KB7H2S/iL88WAwATUYqEa0L3ozdpwWBoY1esDYz5Tf4oBNCacSB+9a/4WBNPyBk03+ICIEC6AmJsU+m/ganLcizQn/mWArx7KMomKdIFTqj5DwJO/RyENG8JsRJSt/xjv8bFbrMbCAg579f8BgAJ8/68K3WQf+HwB9nqg9zsQEnbE7i/Tpjw14LzYvULRHRywj9hvTgzlRJf8gGnVxadWrXqTWItGqiN/CQ1iNDMUrtXF6hyOKvEkPgUUxHWOBAcqHdiG1vv+sICLJUUtsP/8AZU25bPP+CgxSV0hAT7xvD+AwAT4gwEKWP8nEytj/FqYz/Acx5q0OB/CZOf/ng7hmFXKwhc0+fOih/Uq5YA9AqIRgMU//yEeGlgjT2+TsQ1rC80uqCi/lsbRyK7vkI/O1tCTCOZEjJqFZMrw6ww7/iwrhlMv8FFcMp5CATJht5/AYAJiwWQT/5QWTnp9GJSiPs4H3QoW6hCMEA85LiJsHf4rAH/4rklFGjIosvk1cf/14VwyA/8QT/CqxTLg0SJHIuOQW+J18GZIQ33Cp38D5an8DhPX8MtW1j0k930ivAYBNgiA6PH6+y/KycbJb4f4CSmVEwin9kC7/4lARYGaFVJMxHpgMD/HCAEwC//sIn7DYJhLf/ARjJXjsZdNYCKX5sNvKryK3+eAUbEBGa3W/gXj6eTZ7txxuZsEEGfzASp641CKJMXOkJq7mSHnoOdL74UPWIMNNS+fC0dUSBHBl7mc6aSTT5YIwKAv/BC9f/BAOr/LC9fF5/K0AH8LAo23fwKjqfzCBCHugriX8DVjr7yE6yBXDuLLwULd9ECwquPFGdG8Qczh+VdqJCOhS9n7SVmMbWOl/12FEfcq8aBVLIq/9dbl/BIaV5fwGGE/X8MvANmyNv8ABDAR1QGB8woC6+H8Dgd/8PgIupTm9goUcB/CoJt4fwK3ffxCBF1TSH2rsbP8DdkUCq6jVW+0FLeiLG6kNCDuL+SZ/DoMVSGNKBXUTwDYgTemUoNdngyZlU8odwbcHNwBxER127P8UIHveSRr8IP+AlxVXA0DZpmnqxK2Q18FAgwqbRjxdwnjs//8MEJgE03TXQQ1joB0WC78ZTj5+S85KT0fx5cDjZHBP8CjsUKtjiR6Ml7+2Jx17HlzaSu3KmvMFsn5eIp2g1gczEkloXbsHGgnXNcgSAZe9E0EL2v/PrylX8DesgKAzRW/wGACWv8NBZo2JQ9KQQGAkeDfTu6xzM68jYOujBEZie7iinJ/Arfm5/wUAjfy8jMIUVAjM7rxWv85YDGosP8Oyya/h8/wQ9mfwQGG3Zf/LBK8BCGNgVCjFFmb/gJCR2jhE8vtVO5DY6KdAs0f/BCpZFxwgFPPhRBgtjXSkoEIv2AixUVQQzT0pakADeofxE4MBccnJ/gIPiH/wwmLAP8JgJX8vDeYD/lYs/Y/zERdAISB/BOzbEsdxVxJ7WuR9w9kcYQn84RFgol8zXcI5xOvDKdrgBJ0rf4yS7VLD/AYNzw//4CZZBzo/yUI6tfwGW0Rt7uEuMXZcZ0R1Zn/ADaWggZWEJPStT7omO4miqb9U2gNk0jccQwxMrWRIWEiWaGMmM97Shn8UiAtjPHZkVF/8LCAvh/Aj/x/EQCL3BA5neQAQH8Kh438Cw9H8QgRYFLwsD+vodAYH+ODEqBQgQCZmkp/gMAEMIMI5nf/OxiVB//wOiu/2AYlQOZ1bigwFc59XqQtdv+Ahow9GlrTqbsQzK9olPnAOF934P47k/IeBX+f+fQV+oCl+lLPdKU2zak960LX3lbwHOwUwQRV88H9nEyPODKliTf9/FnTxH+unEOKMX+8xMGmzaHTP5thftFPUBRaTl7xSYRK491QD3Y4T3yQjkl5s6L0XCdTuMAHGc1iXSJT0Z2khwWuDnxO2xH1/gIYJkcoZl7Y8HGp8u1zdWSxxwjdDRje26foTP7A5FlpfhMwRTosvCvtXXui0tsbJe8y8ltJiWUBiFqeV4etJvfpuX/ynEJTGhGhDhNge8tsdWgcyOjpHJ61U4lEDqZJ+chxo7R7sNMAFtsqAMBLPm2byLqE9Yy22RtupVMmO0ED3TNwAzXKptpasjyzdfgjbHqwneNw9lMvPxb2Z1N6JDivebvIMsqzxYcfkNJd2rEr/wEsQEYpSKnd5RcijBNyKQ9IlFhYGIkEmnROAcVFDB6MmZmXzcWOj7XdlFCAqeeJ9lgieeD8cXTQrbADM4lwR2HMIMxiNjDOhSAYz4koWaxQtg2++ON72Pww9lA2sZuFNb3bunxAKXKfoMwXSmNXJUMqQ1tKHOV1in+rwed0g2oklMhebNhkAIxfrN6t8E89bQ8pINTh2Hn+WJ9ScDjOP4xj9uD9ra93zQqQqtc2GEElYWYKwJ9PlLrwApErwohg0kCOrcYWnn+AiAstiOV4fN/QRImwULlwH9JBElkOKlTPaLMRuhPoiD8zXfw880JpOHyTiH2sAkeI2xJiIpb/wElO2/tjuEhpyX48idlzGkjvV7BRqX8eiVJJacCZbXqaQPoT0fhTpXqnWrz93/66IjUne82kv6QyPIOVkYlIiWtM7NYL+amZaKEomJWMpR6I3egzrfCtONBBoA0hQkZFGKXIRgiCe4ORZsW1xx46WvUICHZJVQb8vOxYgaPAe1ggt2lepKQSiMhWXcqjt/wCEXZ8k74/Kgd2PofTwjkUfZEFVdDWk/JjtFEou7/yVmf+Agb/UJgWjsUrqvWRXt20E3KpPsO2Q6y4Ah03PpZH18qwlC3vUftwMrmyPxMp8rUF+FkuNYcPp/xRVre0YR3IhjKMW87SVB+Zlvz+AF86ESQ3AbTx7KJXChTI5Q5+UFSS6RbRvCL4GBGFrOZiVUVokPDBaqPCoYa9GrBAkhP7AJxQD6tGgLciJ/G905Gcxref3XF4sgbunQM6m/gzJU2TcZ5VlwiVVejC4O2Gkxoi3/wEiz6YI0lzdt+dZgBWyUthroJHB0o478m2TIajVJ2H7nFjg9zpZqGoF19gvU4KH9Z7PP0KYwMvFqxOL2JbQ+3xJaVy9DAyn+jBf6pbnRRXfQ2QfAH7S2Pj3nSCwBeY+yb3sinvcIBA1Ru/pZSlso14JqaV+2D12JO5wnrRCDgu2Fu+jpSzYlVBVnRoeh+/K9NIJpPJELRw0IiWEVAOBdV2Alew1oRxNWgxrmf4CEEYgyTJ1fQdJCp9alY8Y42UjSal3qA2cqu9FcAMurK5CFVpwLWYyqjs+hQPzy6A1pBugeb7Q0hNAGCCOFnODmJV9/gHZcQUxT2vdL1bfQLbEvgefA6Af/gCpu3WjxzsmRIa9t6HvKtd3+AKn3B9obxyj8AvTQmXZHAY0QsLlZVIPxj4oBojUkErgh/wEX/g5ac4uALmIaEerK+8YMlxgLksdPn4dDyCoqEoo1JFJ6McTUiZHL8f2LtVVP8BwdmfWjGfN4ptechRxtGFsS5JMaEiGLo/TkuMvnzMc4BtM7wAFA2tIlxGNBc3cAJXYDdAEU9Ck2tBSYTm3vsfvLLETsBgbud8h2oPEiqnuoDloHPAFY4rjUq9LsMru7Lxs9NM8p6XlcCLBrTKS9GBsQ/iAKixgTyCCq9TbYGqZwJLPnCUZubGIY3lfvaWuCKJBWvBkauJK/IhtSwHZVwQtIhmxdTq1/gIT+tZdkjY6XyyQRE1pFQ5cN9totC+T41w3EIZ6w5QjALE9aAgaOsqCFuCM+f8BB2jxasn9xntZ+3SFFLEcNxeq4yWj/pDqSdWE8gjWzfwLsbyHWuPkoLexZE9CKa4GxliLxSbNM9y4EfkoAPpqz4yeKc+dbs+ce35BVyTb0CisIdC43LE7R/xSTQg8vp9yzeX8DbHEpK6w7XH43Seskz/kwuI6t88Zb0robyBJ+U2WP6ttN7LjNQtzoRmLwJs75rw1qm1D+rXGJIv8UIDWbGh99tvRAYD+BplSPaN6X9QWG/ni9OOYfKdyf/4MBusyM/gkCzgwEqB/8nAgwv8YLd4K1GX8C7y0xhLu6Hz6WW2NRSClS4/jW3S2R1K/KhIH+sbW4YEGrP+hKgYc+LEKpQXSifNd03ZrPNdgqIX/NR2AzR5OUAH+CAvFrynMlpxvytBRDpzgiMUE2yNO2C7QvmxHCEEhCNZv5h/m58StAx1JilUXfCK5iqU8JgdxBp76wf4qBnTcj7ftdgMBwEWHBrMRCJK5jPq44JN/Y3kVjeX+Bsbr+IAk+GRdfF4g+KoDA/hQE1/icvwAgcLBubn5f4XCKG//BSG5z/BIEZzy0fcAOrEYDchvTUs46yWuRx4KQ3gycqaDx0I/wKhhKP/BEvU6/6+FQSFxECPIdBYd8JA6Wc9BOqp7rj6Xx+M0ggFrhAJiBzSDB5CU/xYmOmUlIK3lb/Ha0h/kQVYS/ywHeT/8JgZP8En/lBOpMeJGVaW2oWan38ALk/vGtHhBbNr3h6KAlg34ODQOsKDKIRbZ5QD0+x3+Pzaxyao8LfzwM/l38UidwUB5ncib0B/ArTLR+JAOXvAdGGkUWUu3BDiJR7lxn5QEHMbZK0UIiWqNOCW+okoZQkJ0CmsIPDDv8IyYa/EHv9fxUJ3YTOfTTHAYHudnF55gvqCKCVLTOpgIHNSaUyIP8Ui0BaAC/UeecBgeASHA2j/AI6zZbLaGLbSR6aDRxvXSP8D3nn8xJpAuyyXAAfw2HIR38FCDs3G/6JgzvMXsyltmwQWl/4FcIqeo0flJrhh4b1hZTpQspXEEpIBCJcKghnLXnGOvhfsLYz/GCLtyaf+MIupKMsKaZqQuovQ3DLhqL6KTi3DCGOVr9R++6Oo//ARC92lSx7/AYCSf7KTncfKm/28nO4/rD/0A7NCk9Uvezt1PIP/ASKvMGUoRd0tNQajj6G4QDP08HUiZiQUNrZy0WZpVywW3xcT23pXxTqSfa7tE/ijzUEf+9HqMTf99X+fk61a5d+WR9KZLq0S0zH4Jwu+2vmeWVUD/fmj75LhMuihxQswQc0qQizZ6thhs83G+0PxUftL2uEKDjnzWaXUgrQogf/JCZrAfwoJK9/8CWuP8QkFma7cpYkvf8LD83j/goiM4/iIgtJjIgdr87YDA5NQF37pinUBnMGbrE5xj84XJG6347YIE6WEAqgIOoMD2/87BKehR/D2ejg/w+OhU823ZSFZaMchRND6iULUsj6GzyYUKWJMfabMIIzpwckof7XWOngmTFDLlwEqXzhn18H14ASgmTr9sg1IyjfPNof5KOoOf4DhMyr2Nq0pawNkyw2q+hpPNmV0ab1E05BaCfPV5cI2PSOTPu1U7pbRKplUGVQTQSRQmdKnw90bz/BB3oCa/+MF3KHAvywD18SyVezNAt4QLHyA6+vr9ripDnrjZtwV3QjFg7+BhRn/YieJh5U3+3k8TDhR//gEcMbgCCh59qenFmiTFweYvTp3Zj0Bb0GUmYJPmFX280JGo5MIiPqVY5X1nOPkFQSpNADs3gX/l99U/qvtbI+Vy/vOP8firJ9fhF6MksUkYICcDL9p1ZjapXVN8McLTNe1/eN8jzoTZ2oh7AfhtDkzuOf4CGZvd1zMc0H+J4bI8Q/X9pTO5VVZ6NR/j8Af3Q9EsZSPSpb0hm6C7cyXBPrfGaksc19Hg3tMKQgMsSWEisW0tg+nmxa4F9xe+ceyAZQheZ6EP5BKpQE3ImcSu/z8klzILM3ymjMIIgCPZMog+5PQU94oXhPBHhVSH+AIPVlGTannKOJUYTzME5/Us0X8U53eHRY3ql4r3M5aMsgYm3j5qpbkJx/iQZZM8OTRR+oThJCcn5Dwb06UHZnzri/gbpxBXta3f4DABOX+GM19wFxCcZ/GAwOPMq2DkkKIDdEyAHdldJSPC0kLSALw/iUBEhwE2n0ZxAG/+H86Ubf6pzpRT+fs6XX/l7OlFP5Fzpa8VmX1de90VYyIoZdi/gG2dEsYPj/b7mS+SgCLGBUUh7O2s2ed4N0ujg6dV6Bz1aM+zgN+Wr0U/gg2L/ggKc3GD+WEr4Gh/jAnLHcHQ8qgQkhlWxUYigljvC+CnL/geUY/gkMh/l71N/hREb/wFJ96X8DSM2OAf4wR+CwThlkNzEbT+BvoOzJJlVUfCsGn3NBrKqEGRior48IFe281b7W2VTkbaQcBv/hQDw/gUk0gV7Wt/+AxBbn/hlToCPsx+o+GgMBIm5rlqwiHuDvQAl9MQ+TM1cWhVL+KhBIOxcB8aIDAWP8BDVyaVRbQ+EP8AGXabAlBgrLksfcWQN/wOmSfxAJOmClIpaMAfwN+8Vv4w8zQxItggvj+87+zeTbdw6ZmQo4IUjGKz+b3HulAY9/jIqLkCYQF+u7E//4VFyI4p/sYKq7PBlh/wEMRFiH9GzgymYpub1+oCQdWfEBm3UIVTCkShNtks3+jhI1L4f+SCRqWLNRoWtGTHdp2LN16yaWZB3malBM5O4eYpSgx9D+GDoqmy5ywOO560gtw77H6fRBJbb3C+znBEHlclIFj36hgv45/noqLkCv4GC5f/8FRcn9/hcv+whKV7CtfYOiY173I8ol71OiHOksDzJtV0Wq5dxgQJziIcXb5gCzycWNA4kSvCcBab1dLslv/AQQSNQ6YiPLmo+MwrSaRn/ICQwt8wzhxHwqxvgvu1ViCSs1VLyKKnjX9KqR67bGFIbK4LBAcEd6dToLmarRHf/gHPk+6ZzCz3k1B98SkEw4CDJNelqJ0b/iIsU/hkh4df4KHA/5YWBEX2gLAA/gXGhLr7oWO6Idon6/J6EJFAdZHoFAA8edPDMs0cjT058szUovVGctm9K11/gBPct3a3ArtIo3Cd0bUh/gdyWhwzH/gDABM/+Gh2cN61iGjYDAMTSaZX0ryqWGxXcSTqGzusSBScv/wYFULfxEAirU8pdUgZwGB/ChNf/AxBz/EIEXFcI7BhiCwH+MhIx/E/0QFXmfx8MxfzAMhfwTTQX1o13DTWFEVGgNMvnD/gJaHrYGH98k23BwWKWH2ptYPidgGD/MZFODeL/wgpwb/z8JDX/L78mBPrr6jrwL8wsMeRGybM73C8dE8JI3qH7vBOCZRFH/HBIa8vboYebvndzXZ5/KRPnMqPkn4sdL/QgVUn/Af+9hJt17UbWT37znE6ojZhqNWpiAiTYCO2lJV/8Scqav1GC5uEKUqhoF791YhrA4C8Fj5eG0mo/4MH6SB3/DCHunp/4qKJ+gIjeWWlh2r/Xxzx7fwMAzf/+P49v84EFtAEPUKiKK9r/nQon62BSRL7XP08fx7fwmE+zKr3LRF/gIN2MkGiOBkkBPQJJoCLIxkIUTmwFJJ9WmU9nX3/HRPq0myjvile0e6CO91RUlPVqZDU6/ap/FQ2uIIeLvDP8Ddj+NmXd+0QfLIQBT4FR5PttGMicBYcFfoKcWYsUkrYYzkcy2V2CmOl+gsBdWDqgHv4hctr3x8V/AsDy4fwUOifyw75nSkinn+NDoPWx/gOIPjT/4w9o3/iw/Xkw/8ICgt8x5qqFU/I0ftM7vyEG0OEJDYOcN0Mm9CxUJZh/gw/XkBWnt8sr84wgafUicOuZczwpN+sklzxzu6tDBzPwPgkX7v8BJx5Fbs/44Mho1b/aQCa3AckG22X0ztVuyHDgbv+Agw67/AS/caHyCVdkNUpZNcQBxKVZVhPn+xhL+v//xU97/xJPSCijjYkv4Kcaa/w43/ICWlyfxADiuBZzM1AudwQ7WbvoVny4/tx2aRBtlDonqNAYh0tM1H60qOUxrgxI8RXfjHwc1Bq0cC6oDDHgGV95QNsoer3Ntdsi8r3p2miZnc0LhkmL86+vPWijKUE2Xf4CC+jGGa6NgBLNPSEjVnvzLNvQL3F0SsUaBFmwQw7umRrfMGCjNMYSFHk7OD22v+qvnUUuyx/8BCJEheHvBSwcCP09lTPd8vuh+7Q1iYbT3GF7EkKQZT6o1XiZkroz7S7gQSzt7gezuf5ALIDH8i1rnxaXjpLERpkrjJJwuyOiWh+wJZ2p0HO5iKQhReBzKt6begXB6muWQ3IkHjdEK0/zSq09mVa5WTHAK8XbPzXNmxEJLy7VvB2yDgCTvTQUq7dGDw//DU31s6dNSox2hsM0hv7VmVjBBmWa+frNyBibmqfd2Bh9sKk1CVJz9psy/CDtYZsoP5gH54Mppl0HHkCoyAkwSxKe1giUap04CUwGSiU9ikP4AhWDhrZ0YIQeEMfKq5FJKSmmuhI3LBCNXTVnMwAJrxpAQaT2WgnFUQwFIdSVdDhaM/4pNnQVues6vOoDAQYJOcGR5+LpenVJDr9nI4TZGbKpAu/+JQEWGnu2pKqYGgMCTSuPifPpySI4QaWfbJ4U0mrH4Or+KjigLBmX5GIDA/hQcq/gP7GcfyIgNAEV7WZTmcC/gbrvzQ1pj7RnaPp9Dbj/ASAddzgPFzZXkC41BbfbFATAKEVlm3e1Prw1uHS/H6e3YEGxx/scz6h9s1EBfIGMf4KIOl/4IK0gNAdf4VK0vWe7DRLcHQuQLl/lwrS/gYBm/30HKgfwbQNB+vOQPj21CW/3EPGAPK+xqzGJPD/xWAP/x+VpfxYLr+bxS2s51DH8cS44ETIM/V0Mufx8VpYGLrB0uUJFedabfe/wZTj/wmFC/xWAPz+WEpRvzbedP8pwg/yklKN+Su09wr/ghCMBEMBaXTw/KVpAYvk5nd1ojOlGifeebT+KBCaf98xaa6j+BvorR1ARfs+HZjZKt9r9N7+u85fV/wEuEe9jufn932ylKeL4CignQ+ElD2bTw5E95E0dOahs3GmOrpX8CfwRR/EQmuevMWuIonEcBgFCYYcHuinrzzOLROK5W6ZlIyJFZ/An+Q8P8Rijr76fCRrmX8DfnmreS7bNipzSSw+w4DMtTZ6c2jq/Nw7ahofO+G5dQP+2EbQVDpTFiyHSgVZP9noV65ffFZjpd/FIsSWfP+AkU5zrPIDA/PzejTGqe96WLOD8B6gnAiiUtRC/ioXPcO3lKq7+Bv+7ZAcY4jAyXwu6Q/clEx4HOPgqZfvDgbvOGMwJAahLUIfU+c5sKROr93EK7V84K+ULeWCXXgT/ikY7IDwu0zvLgMCpz7iEiGVXFC/l6kkoepoG1aFg/8GC+zz1/BQx5/LTqGCtjX/AuLDuh3RWyglQsJzZx6fIDCXlNBGxb4xdH7n4C6XhOkvqH6/6x2ZyMOU0CxGNbZSa4GP4trUndBfwMpCAwPYZH8BgAmh/DI4WD1NnpXaJ/wM5ycOc0lEgqq1cSW8z5NhtUuJlSPLtEBkhFVumtDQY99bhxY4s3/nCoAAsRFm6xLr9WTSE6yZ8P8DY5v8RANYKjrXNcXUBgLjZZhSYMdzo+JLBzfIvzXcioiGjfwLdZFv8FAI38tArlZW/n8Kg8P8B5TD3/EQGUART5NEQfU38DZTHI4Skq7BU1HWwVjUOBZyIft7ydTmeGcPaaPjmgBD1cDpPvy/hZ930x/klXuA32Q5GLNlpEtE/g4p9A4oQCXzPE/gMAEzf8rL77eirf4CLOpkhx9FYnQTWvl1jIbW//KO2/hxPVFZN9wMqxnOMrkomNVyqkJh5CiuWig0nPvKCAXSEAj/plYMBgkmUtpHnw3RJCQUTaxczTTx6ko6FAY1hgEeYbaMAITAEp1D/wkEpm9BbvF/l+CCWdUXy7b/9O1wMRiAEOXMxqPWdOuIX5pWxfv8KA8v8AefyXfwGEV5X8BhGGl/DARWQTE0hsAoQQGBFSVXdxkooFX/gIZqYy1ONvQ18molv+/8HMCv38EgI38tYsYLI6BgXXmRd729jt5ZIsasPDKnDDDVG9R/CIEJuKP8saCABBKl/AsP52dPppXqk+KOeq7AVWN8Wy0B82XYn9TQBStUU0CmpyXYTxKjgEFA0JoMP9JzQee5fg7dD+EH/gxdiSxj+Aw6PJ/gQF3gwJCOv8nBmjv8Xp5AFwUN7BqSpQK5/BNdQ4UczFt13fu2PUhmEw9H5Gt5GbXkIkXhgUhng/MQgCp4stgxkf0Z5Y7w3a3S/4OG7yq6EAmwtAf4DABAqDAFMTP8VDd5QGHFS1A4H+Zhu8r+EwHr+KwB//LA0rF/AMBTfxwRsgYYtVE1D53+uj/yfyV/4esPnf/aARrR/jgsVADP/9gUm2mVwqdlJyDNQkonktG/vRIxZ8AkP8eRiwhDY1/AujH4VGt5Cyi6VnwtW6hPg6Y+sYmUtC40tl/ByCMb5p0f4CCrwK79jPXQhwgfY8JRsRV35j18ID3wMP4Ex/AUjhb+AwAQe/hohc3C8wSs2A/hoU/c/4KGC/5emDD+GRkn+ZAIT+BdKKm9m+HEwy+8f8BLcVs0qJ1hlRlvvEqyfEWpwgFIQWNBgJCF/yuLKAA9xo0wJUTYu1++H0pILUfh8L6DiDfsg4QcCkGi2lKVgwJJ5H8Ej7fp1RnZZQxz3NE61oW8o5wXwqOJGWpu964lt5RxBt0gAHB/Cg4h/ALygfxCELAkDy/KZOZ/8LD7f8TgIoEhN5DtirnAYH8IhK58gUx/AhK7xwgEsDuh/AYAJswYIdfsp/CZJtDD5YCBuZkNyBMjQTwDf/Hc0f/AZLJ/Dqs4nf40M4QP4FAhvPHhT21/n4OhrBv8rBinn8JlW/+uiLGwBgP4+2eAH+P44+utezBK5HQYy9eHSzOyI1DQ1Jj6GrHgipzB/odBraBZaAxGlmXM+ddG2Jiwc5Y+G/gDYe4FcYNkCMaBlG09LHuVXduob/yMpIbeXvwTSL77/e3mKM7zxQH5ClctbXOcYGRxXJHL2yKbOaPZOTfXTwpCjtJIMCtrobV6qSHDJ4hf/FIrwTOBHQ5FBAYASkl7CU8utVZKYg80NH25pgjymG3+DDvsz+IBb6GwA3pzkgb/gb98++IAIuxmGP8BApncxHVlQ6w6btEdFM9WdPF8bNgNVyJXMVTL8TVI0pbp0+Sup/sYDStOh/P2Rn+EFsf7+CQGv+Wy6ihXcCQyEoN0zA1PIhPVP/AQMCyVN4WSyGnR/wN7JQP8zAQqB9YH8CNod/gIIKzgNc8osz4Il96DUfVOi36UOduYiy8CHz/AQZR1fig0omDSBaFuDntUSXIeevfH3xdC5YDUAvf/FQPCyEO7ujZAYCmaiZTjV0+R9vWib/FF1X+AhCgvKga7+BvwP/iEHhHbF94wewgMBaGca2fEWtrcalIbjLd1onkAm3CP/BAy24D/5UGW3N5FX+zsL4iQyJi21kF8z5/cPyMS7G9wCNULhrEieoIOf9LAvff8JgWH8PCC+X//4Zbc//8MLbn//hltz//wy25/sQZbcyv/iDLbmb/0QZbcfR0Y/3RDLbmd/ogZbczv8gDOxX8cUiv+JgRN75A26CAWyEAnrH+mDCR7E/xYU5qgaJ/iApzV/04CLVNPb88X8P/I/4DYUGQWoIE52yI6HaW1B827pgrP7ZkyFnD9oFYfmbwJ8OQ8WdMUrVAHCG38A27dNMHd/mhdvfSEjfcGSUzTVtJMlPx8HAEgSXR9J/D8X8+87GvL+RstUYFRmIORdRinzS4Y3EZ/mNIJs985xJE5X/j4EZusStPfryb5Pe3q/ikye8Dro1+2Q/NeG9MJHyhsM3VN2H/4scT1/wmBf/x8NyMn8L0m9/n4UHY/zwmch/4KY6BBtkRPyVmc7Ahyt2+Bh9WomQ2QN64IDX4QCSv8BK4T0FbHMpCtbzJovWQe6YPBi9gE9iDfZxA8ZAQn4JGt3/BFm15gbeA0AeUsceHTgNDmvHTsLzzgN6IOB5Q0opjHaX3cuSbf8MmyDt/BRg1/LyXc/wobIf4IB2zoL+IjJ9gUlS6W2D/BQJIf5g7VsJ0z3bBJqLkZ5XVMxAHUlY1VVAX261oMsq/eBaK2uP+OC0F3CP5oon3MBiJBih3T7LKD3+1ywer6A1Y1dYQ/Sy9k/iOoXN47RaWw/2Mk7Sf3/RP/0TRPxf+Agofr/CiZl8I/GRz5AQ8K45xTT/caT7SaQXkIOlA1f0e/AMIO2JyWlDk8mGepJcHfGcgq2D9p3mEHWmcNNw671gdHWHXFoWdvx0yABIsoH0Cs/7UgUlwp+dhy6toPFtDTC/ZI2qW4ttCTGdCb1HrCDTq2hc8sLwVkm/IWdSIIrGdYOKXIqP0Wb83PPG6XpGGIM1UTF6t8FzFsMQRd/qWRM86WD+b9IQH3j89ABwA6hkHIKTUf0iPQed51gW5DMy1T5PEvLSstmEaLy7xScwfBGKP4fUgiXsVyipEQiYJOn76HFrFHn7FsXkEaiK6BItN7YL734Twvxmo1kuPrUMBZ6te1M3gQIzPEtRlQ2oTShVuffBolPDwNJfO/kEo0aAcvJnnOMbjVKfzgvWR6JSGdisCDssyXi0mArciA8F8SNM2nMhHPMlTYi36YcI+N7xMd0UMtpNay6nfzaMr3nQMHvYAr3V9Nhvx9xtlwcmY3myOLU48mx2UIeEHd8USZMGdZiBz2aiPiCRKeyOSrvvlcPTBN2LsDFcB+UvQcaBnaOD+kd/BwxeAWFmEVkH+KWRWzSptRnPaAwP4AnvsmxB0ksjBUfvU8iTSQiRFwSCSo/jFPi/hRN3/wUD8v/xCzQgaHG/KpINX+Ck/CvzAtW6UmfIyBA8Xntx6YyUgDmruUEUPz7s2lW1SRou7+vGLkckbS1Ikc6/NIoJLs0vI3AiBqsf4IZQv4JTXSH+WENdAIl//gRNd8w6vEtrNYlHsfMKy6C0WRgxwRiepuWNgbmFTfRWQZZsKUhnZVRv7/MMa1fpo90qcZ3dHmcr/gZPsiiEAnF/ev4DABIX/WxNncOqf3gfwoBCfP+CEaw0p88KEnc4GKMP+LCbO0Cxf9gE2do6p/f8/aK3B/Neit/DyCP/5iOJnBJK/j0uQQbH+PtFaE6AF5/wwo83/wmF9/xOhEYklfMuRTtny/CcTBFCgDoZHtomX/vaKWKoh2S8987vqAT/DmLmXYC4cFN/O9FkRMZknnXK6vUja+y36/wPDG9EHpf8BgAin/DS3WhZelifEBgZv4b27k1s/6DiF7tE4Ta8gEqiJN/wOJ1fxAAigcvwaeHk8kBgf4CQ95UyIoFg89ZzFxlsunH/AQPnk75xoR/wPNhygNS/wGACVv8Mw2GCM1QzMiV/AtsX5iX3ZQlHnqCz9QzNYMg2yCtYZwUL1xS7EiZQ/JhBfhPItv8MimDp/BQHx/LBcGQWuE4AGAf4qC6Vv8ELrTc5YfwQwSjsFOIH8KmBX+Ola/EHrWFWQ7Azv4FLoxZHetsKJp7LxAKYMbYQDQ2n82kHKz55THqI0RWRnNx4T6a7/8BB0mW2ZyK+qnKIKzX8AXX8wzeMMIEl/BIQQmfwGEGqn8OhBDl4QGAkrFIIG84l89Tyqrg+BTsq++uloR/A7Hp/D4CLDR2MokWE5QGB/Cg2Z4fwK1XP8QBSYF0hjKfDCBAYA5i3RCuB+s5fX4OIQJ3KHbk598fwLu4PX8RgIukMrDymsn8DBJo4Uh9bsmlIqBiGMKv+AJU7zN4M0okjetE447cY1ICuehbuyWI4myN1KdGa5MzNXkHiynsA1aNX+P4JDRU3+Aw0pV/hoE8RTOponyAwE8AnIU2JKZjE3mDaekasqELAyzD/4IbqP4IARv5bq2nhwf4FIbyBSvC2VpWzW3VHq3lYXMXggZJbU6Kjwj064qfMbe3neMs1DMi31xmFdlYd2e5as23VXBCd2f4FHxHH+AoDx/lpvNBiCJ/gTkM8yP3HqnJByhBh0ejtLeAPaJXTCzQbOoOWew/wEPUOt1kVVlS4sYudFitE5g0KQgZOzA8Jx+QSVdk/xSDNm7AMyjxM/w2DNu38FA8X8vxUf+NlFwmnv/2FMapqIgpBjBom4yqubtZE4yteUVtLm/54NTn0CHxP/BAjMn5k+3QWrLSaw25WN4o4ulq7H3hdIAUG16oORzDL+MXvGv/lgOV1RjYj3fp6WmKEBlMhAhms4Xf4KUcNN15P4E+nv8FHt77DUSgaX1+p/AGgo7HiVh7CnApHTkDSWEZgIQwOHkb4RgEBgfwmAOAXyCGH+CiJ0P+CPuoAagQ/8giPEfwmB5f74B84gYK/18XhjCcjKjQKnUUR80I9NwZvCJN8y8X6f4CJSbJXgWPeeI8K2rIWgN1yNDv4ANji8O/loEQntWReJ5Jc+KnmC9hf4XJKB5bppL4ntDcFnqeHIkXEbb9up/YoaGfyYP8BH/e20CcI/QZuB8nOK4Z3iWoj2SD2f5rLtlKS3eg8YqLiaQmgWlbv2iuaMVLTgP09mzooSQWwhAuzrzuQ9PQT7IAR7QAKsoM4mAfD9P4Fxhgu35YhOEE72tVXa30po69Z+B+99e1Qqx6KKKhXj8zcQpg8h7emvXOlRHUsRB0jyVhJzy6VJD3YHshj2hLLynWHrCgkrfrsMdFmxyZ6IrqCB0R8WJM/MYWkICvM9KkMi+1b192Pwtvj5i/LdsKGeNKuVcKT0wKna1WIv84XW80FUPOKSDT0J3tZDBFLQ9NPvW5E3TdDBVp2m0gisoUUgZha0eMIpTXUyqZuEUDdjXAmTtutrcGs4FbESG1La1fAsA9jZmANpGi2bJcksMksd5ib7J9XFw7Mp3TTVpHh/gCWIJvhXcBDG7JIuLwss2Edj0EZpbaPzFJPjChOo2tST3bs/wEXuVwOy2MHEUu0d3RPSB2XQm2LCZdpUc9AvtgLVeJPXQ2qWR08ALB/FADMn2lItdfpWjFT9UiPsNDO9o6SJG7bBs6c3BJXEgDJO82FwLUvkoS/JyfVpcRiSk0gRL+XsjMsZTVH96UHV+nOb7BGQiHZpCIOeDh/wBwS+VlC79imMjl+4oo7W4CJeKZkdK6kggguqrdMcLPXA2lgCA2bMmASiOusPA15Bp/5ARHiP4UER/8GLbC/8ECI0FbKB/k4iuJ40eLC7GlCEVnwxkJZDeSSUi+Gf4eYAm8IwHdMf4B7NP8BA+uojZQV6axUw4DF7nWVzU/wPlEWBr2/wEgAnr/jQKrsOnRQN9aAwP4UIX/D+BcFv+IAEUERQiJYRzz/4WJUP4GRgf4gAiwWvoXtCsjeAwAALae3zK0bVg+FvMMD+RuIz9ANSrV3Au+sD8n5bxgEGmh9JMTWFf8VEENz/gwtdGTP9oFro0EQf6+LXRv89FzkP//i10b/Xha6DbpCIf/+LXRp0O5V1tVDGN0iqrdS4wKfiC6XGTGGZF+D249c44X7ZOc033C+50KEw5DQlRHX0HfHHKrMvCsD+AD2xunl27GYJS7p3CwRM/et2dJ1z+TZ8bURep4LgYrGxrf1bWM1wT1vHjaBFEg0Kfm9K31fzZrcr4wi7PMUvQtAfxhGfTSojaBmP8BA2pEVl5bfuvORFWF5aMfq0RYFPE65xzwi0u6FEvvBODDKNZLdVzTd2qPnX2Cwpq3UIXJVACuBRhITs3/AOyWFOFErqH8PsgYMFBmrwHsAimOcRceGOTA45vpMsaac2k+f88U/1HZF/La9MwOzzFATcNLdkcsJQjyyCLiRrlpOCwH2eQwpDjq1+KuTMNWRUocyDkaYQWZaldwyHstnnGdzk2DQQk47bU4v47qclvxefw+l0i/KxaewlA6dP0G36qE2r7MdKXFKCk8wQJK3Y5brCqq9A8FIBGSyC+cntSFqrTlPOuuFSBzU7Z2OLK4g6pXFiqpshaIXkXrZvZhddphm+lNRctz7qiN2Dzgf4oVdXs6DHkxEoQCR3C4/wQYiF4BxMH1mgn04pvyQfysmSQ/GuCipAUxSMcsV54KBgAAFfwHlsAm9qj05j/wW0aAmkO/wbZC1Tqpqz3l1s+H1mES+kIpXiTQQz+/6WPrR/8YMpVf+Kl/MxHkxT/wUsbPOKR/CoHDAf5EVXDriEB2Px5v4EABIMAqxU/7oquHef+LCkNAJ2+MbGcsJ/hu8Q+29n4dOi/yAid8WBp0dYsSyHmjIYsLvcAqS/BhXmdp/8UkRpu24mD9I9AYFNi/4CExvDJjGxJaac4LVhhjmtmkO2wC+QEr/gQBkC1ohALzT2CEDWk6P4MJ6xGUQcS5XrI3pDNrVBLspTxJxYdZ3twgLj9Af8TJOfH+ThIR7/FghzDYGOqpz1ENBko/xoK6KH5cOP3vBCLzyo//L4tpgAABqP1zzFFD7e0a+dJL5+ZqYiQ4fIDXQQJlfwKH6CT3oSlwgCFsJh/xhCmCetjApoZo1vb4Zn+IFi1f/NSFcaefwqP1lIAogGeAH2AfwDMCAYCDgIuAk4CUf5QWvv1UAkjz/Ey04Iuc5N0J7xmJlCPxaDyFYplzbHN1H/CYAmef4mNKZv8nLaMYBiYBz/wmAqMPZwB9TomFhJInYr3L6ZuB9R7WZ/AYBWAAL/BxGyoxbHxALl/BVg0OCrPCkfQA/w5GPOZsA7mR2/kkjxPUtPCj7ZfIq+Jc/IFHQQf1fwKLchth0/4sXadwJhHKZCV7/WBxpUz+7PNbcB/jRMiZC15iD0j+PMwEBz7SO/4rJuoYSNmB1Q4TexOZpc2jhAGz2vbOB/0seiE/4gYELYYEv4M0QW6bwamVDx0Ff2v0dsTdsMx7EluL/1IliDf8BgP4D/gpLS5Bv4M0wa2Z2+1NXx8oZwaW2H2k6WR0ErGE/9UDlVIEAAv8CAP/1/gxhG2B/yWkroXaHX0xMq0IX2oTLeYXz1/wUagbAn8BhUtv/AgAJBiYloSkNuDRlyUabu2kVivNmCvy0aq16rcBcfwHcFfkB/AwABWT0RyYFi4hRj4NeQwJkAfX8Jh2ZkB9H+KEoHMBAQDAXJc24JsgTRQgBW7kOUybvk8BUdn+QLOggEF/gUCGC3GwlFL13gtoA1ygwPglW5UHG/moi/eF/8TMYTf+oEoNiYb3AA/qKAAP8fGny38Ygf8szgH9VAAH+1iGq4fQibVSU0N/hjkMx86ITs3WBGN/wUuXWrnZtBb0UIVkXCGwNFZ77J2RowF/gxYCdatb+cQ3MAB2nt8y9yTnkngvCl2H9VgrCylJDaWHrEgEMOKFD/gFKdo4DdDSbs/xwI4/Pn+bBHH7/KQx9xAvcEdU1nSkwAAGB/ihe4eAs+DAb9EhAwpHDqDCxQ/JStp64T7WIzcADnJLhmFccRZrsJsv4DEJCx3JeP/SMfqwfxkYCiX+fEY2H+DAD//HyVA7/FgoIGRxO/UX8M4W8G78aCG89G/w9SKk3+PA/McnM/89Dy4xOIf58KjuwR/h4C/7v4eAfwb/WwmBM09vm0/xEdqXMID25/p/wIACEGBR7v/XZ2omfwsdqb7oh1JX/jg4Cj+5oGvP8BEBF6Q/xAallX2MVMOT59fONHwFhd6QZ5W8hJZ/FK7FUc+WNi3eQGA1jbJrI2Y7Af1se+B4muWIa2ubQv+DBCs/+Il6IlDeP3FH4gMD7MNw4l+z9bRqfNiV2GGq71koMR/4Iocj+CF+/+W4Ag5SMBOrH//ARwaT7/4CVyNbfzN4N8rjLBq8aOG3+CI8z+CQIT+WqP2jj/4FgjTPPld70prhjLmULVX3oQym4iJUMITg5wzwhngY+QQMeXT+HUdu6hAe3CYX/gQAEgwKPif67R20/+OQwZpskBK5xn+IEFLyt4fXtSTq/ml12/mbhpmX7Sfw+v+MUdv/Ewkhj/gxJTOBDv+BCV+/gSWDgwFcyv9QCteP+FhJIT+BCLU3eiSLOz3MeSsZRb82PReCIyOBPvov/2Ryv4AJyKVWQpchpnC8TOueWjrOHGQkbKvGu5Lcxi3i3j/BwJyl/ELYviMNniTb5AYAAxU4HyMkpzpE6ZYa0HjZQWLtwdn8CfTQJ/ELdHDb5900r8jH8Nh/P8Fhun8sZYpGE5yoAfw0H8uv8xgQhdXBOQAGAc4ByAhn+AgqXSA0+Y2l1ZK0k8gNzkh+QEugkAJ/MFD7iQO3v0ojWlZlSSrJ/XfPcUHej0A3+RL7bv8Bq0sTED0hf5OXw9P5WVxQD+NSsH+CFcVd/hkrhPEqpBRj+BnJ1lmDj10tWeP4E1Ov8TAgHHw2QQUEQgE9WAlBezj/BPN6/5iBAgQAD/yEYTjrX+VDldf4e9Xydk+eDo+JjpAj08oaactTgc2wXqzsOWHhUiiyEf/fRyuvaGv8P85QATl34oiZjZXkH71CHAwpiRcScIhagOAaG4xzJtiFyqwSGH6yFBW55ZSzcC5lWCmbfUCQp4f8BOKlkbzPDO8G238NceRh8aQ2ry+gVRbAOs3sPIuBdT8rHv8eyBGWrfwLtip1JRUvWu9JX221ug/1GT4X4B5+fK/61xBjaFcYYUFfB0tNv8GA20sIZ5AMgWkfgV7quyUFV38VDP63IYgHM/wLEoHh/BwmB/DFVMxQtUpJn7AfwqSA+H8C8KP8QwuvZzTEb7BHAf4yOR//8BIyEP/OC8aB70rNM60KR/0Qcj//MCc4R9cgNp3AgxLSQSDmeGTvBpgPRvNQCxxb6H98VwcvMrWtbknALzFEBi7BypotGdGhtkjbgMz/8UDoL+tVSzCbj/jRJPzsv4Ft9v/jJJ+f8B14SEJXFWYoAW70sSPPPgI26T5S36F7MO7JvZCMo4+9CJQZ7tn90cyFwL2ca102GZbLX9g41yJp/GIbSvy2b5xqx9aTkOoOfwAKZyrm8h3p/hAFSw/ggG0/1sCoVyyRgAlt5XXhPs0R6CXdcOraJZOlTx9X/wiaX/y3bxqih/AuJEjQGvTDiOcOU+x26DU2lhHdY7by0FcTTyldNTfSSM8FfF6yBidndX88SUhMlzLmXMLZy7NcNgXh/EoXHDgIvefEuzycBgNcDT69GOgDEoc3sfI0Sb0j0q1TjfHbBAVhCARs0JcGB9jN/nQvTn8D/HSukOHsH8P/eJ3K4LlTZIidZ9SPoBCy3ZJf/hCB/hA7/pOyNM64Y2bXIHukyookKA8ei+M6rcDSq9VDk6NnMFzIG6dyIm89xM53+UE5tbzSmPPgDKtxrkbofR/xm98gtBuTu5eDdwYXPB8QcMAlP4n/p6CGepLo2yP7aP8hPj5Maj6dT7v4xMzv4ZC5HP+Yy5VdPveYAP4aC5HT+aS5F/gTiNTpkwUjatf5XWnaQAZ7Je6ANALXCJhSvOdreYkhgtUX6+P4UEFP8FBLMIOl7tT/AYAJY/40CHZEjf2f8ASNj3AfwqHq/wLlY/xCAigTalLgM+j8BgbHcbn3EXngLR5oGpuA3WHCgYu8r/wLTfrP8Eaxiz/r5PfL/h2J8u8+A9Yms/gQAEgwLifv+6IF+/m1fwsqmizGJy5O+WH8PCrmJnHQAf56QTdm7/qRlLqK/76V2cUYM/14bdoiv+iEEJC81GMH+y9DZv0d2MVWPGycS1fEN9P2udSVqqacdNkq21fZ4b2YtfEwqs/BtZ9QR/FvWBc6zQ0/4G+V/4iEbA3KbAsjbmAwIZPIiGlONZvnhzBZvcalhadLPNj/ikBFAhCefC2JoD+Hg+e8/gTn4/gTn9gwKviP+6IJTHm3fwsMqr6VctMIGH8N6ZTkj9yB9mJb/kRfrCae3zVam0Rz3ymi3BgmHbKIhha0pd2ccjFSWcWoid736VgJdCFvegP8AQUdOxW7mIcKrmlM6JlERii/D/A3I1/EANtDHiAhPbslMBgAec+MM/B41l/gJlmIjyeLvKjnrf8BCPchv/A+0DfwSAi/62BTRYtewP8WMELP+CmdAxf+f4IAzqmDDpsyP8oMELT1yiJaQR4XKvBo3NWNn/Pk3ZiC//44LEU/3h4T+HAB//ATM2mEfw6Af+nB/DoC/7fwYI7l5MRTC5I75iB9633l2mPzFB3FRL/0rCfCSiMxaJoqSNavP8MN6Vl4N5ubE+s/yBPrfw7VVXv8Cd9/8Cd+kGBqxj/XdVU5fwsVVXBxUaal9gV/DbGcvZBNICd1/ENJD/AZVestEDnqsSFTrhSzEOeTm+3UcaJiQSnGpPw1vKWtMQoGjzC7IvRzD5obFUuHG7mrhCXA0jpaQ/wSL+VX8Bhu1l/DP5GZDzgq2WGwGBhxebmWp2cEujGrMdkIYyYmYuszP8DN/f8RACLLzhMqwwtAYBGD4DfurFSb1pMZZoWxVo6Ip4Zxf4OBNGb+CRNL+Xi+b/HSFYZfQgOPwJJBgoNB/8kQrDA00dgH8IHUSv4afelA5HPw6Wi7+HTuYBaP//IVhn//kyv//xghWGKBzFlGdCCH+QEKwz/FCzrnqQYKVj/8CrXP+MFuEyJvXIYp8J/yQtyZT38fGEv+HGF1D/GQbNUUTP+Nleor/gNibXazC312rsUwI9G0XCRvV5U+w78WNvcwtRe5OjJiYMXS7rTelT6g5nfP1vetv24f0tl84PJl/ikYmA+xH6TSXv4WF4P8FA5jn8QgIoIvTFiVvHV/DZVm7/wUBGQ1s/ys8Uf4IEiiP4VRW/4D6llr/CAIdCt/4yLXFkvLOhfM6//BgIy2wmr8qIagZz2eFS/2jNJbF2uIkmFsU0+xMALgGamqXSjj3Lxcq3OhPwe8z/+AgzLeZmbGsIWOY/gINoFtujf4DABJX/GgfA0yBnLAt38LF9X8B9pDx/EQCCgWx/gJaDkTjeqgP4FilWRJCZ74UxWvkTRXE9i44ErpBQp0axvS6mjJWNeT/a7nAv+HKQx9/jakM/ijQQAbBT84AqqMM97qjfAADA/wET+Ni7awlnnKGhQLqvWwJiy54x8d38Dmkv8PgncNjlNrQ9sNAYDUUjXV8hOR0WEllrZfPlOoreKZqQF8gf8CO5INKQgEir9r/AYAJfQYw2zEpasKhX4wRYf+AhLSoQ2SDIqw/Kq80XmP4DIF/9CLVc/+vAsJCAmD/PiWSRif4OZ9vv8aNAFn1/pYdBX/j+C979geOwyuagiLRhC7D2Zx1MsMeNI/jq7C/wkt+xfxgB/gKyEsB+mQqgk/+LFVuQVdWx/RnuWU1t4/z0oPeIn+diwubzyEuMy/wcDv/wmGcf4oXGZf8OLhyf8VgD/8VYOX+QmgMt/h4Kf/gV/JaPaybMATXH89Vf1EpMNhuOFWk+lvmvgUFPC5a4srodPAk6LaNgk1IBQMZN6NZaEVQWwiwLo/jBFsL+yASNLdHKke6GLkWeCGvhTL+tE/gdxf/gkS7/1wAowcPgGOmXP7XXAMmTqZlHHUgx5G+vH4D/CIEJ/rwJE3f8cDQdzv/+wuF9RzWIk+nb63sO1InFo0ErX+pOYGfzjy1560ul/gV3JbF37ZRHUyfUW05vZFRchs16kQ/DvANKmqWw5HLkx+PvhKoAricpxduKE+ufpCcUC9cWSAJD84pIkCaOhXFoTID+FQlrw/gRxZ/iIjWSuEad8n3/wYCxGLggEV3SpbMTawMNSx30P8xqiClA/BEHrQOVkROWzbFDk9/i5czip+D/4xJC6SAoOABASAgQIDAYL/uIYn6iYfhC/+HBGWlMP4dEhaYeNP48AH6fKCf5cAH6kS/hwnYgmreP4ckqomtz/T+H8bKCN0TO45SlFLR/6C1ECJWiB0Mtgc5gRig9MvBEDIGnTZshBq4YXCSfUgh5yTctNXfPAEUdC6LHnPWqnGQTpc6cYr7FfM6wnjRkAR9sO4cWrZhwfeEytkM3K2yjzsNVhnnpn06UGYmCBz7EeajJIs9U+VE49BgtYWR+vyVjVYUqXv8gDE/Skz2JHIrlEf9vFEdrYJxKhRgfWwLAgSfwz2dWYupncNC5vjbPOFIeXcIUWGHRIQKzK1DpYhoG2OIHWlnzRRoj4CvtxTH091KOdXDyRINkmm/srQJZEkNd1r+F98QMcJprrtV/4CSFwqU7nLCGkzP4wN22NZ1HZ+g9wDtbYpDI5ScgCXr/wUzG5fwwOYfwFiKP8vHDH8KVO/bBBQT/oYZMImv4ein4r+H4p8t21UK1aykaUz+1VzKY1uNww8timQ7Y0z8oq9o+k0Ix9MMSosBq0xIcJEUfm6MnDFwq/zlFSWcifmzof9/RZbkf/JCCUboeW8RAGnWOvTfiD0fMuAhs0mF/f8dsECjkIBN65wH+egowAAm/4ejoon+H57WwF4mKyGQsvYrqI3Lx5bXIgkT8BRwHGtIu7sb0jpNcrFDO4dauYFhZwrrMvZtyjs3hm3Xt2DfoziUuyuDnznwQr/JSJZ0DNPb5unIZQi7mYnC2teEg00XwOpGGI1sjN1cREntQzY4WmPuG55Hgh7SbQ5i3fRRx8caV52mkwOo/8Un7BkMujjG+sBgI+prmJcozJTU4gQIpgMtBWALjB/AtB3/ESA9DLuT/gJmYvc46AwC1WtlLIIEXf/pWJFeeb+tjeWLjhv4oPJ+WUyWmtPYDALMoOBzgV0motW/LgHOgbCOFj+uY/wcCni/zKVY08V/AqWbfwNHA/xChz90+AOIZE/4uWX3QLz/gWFrb4QCVusD/4DABBiDAuIn/haFrAsT+QoWv/PSNhmSbot/D0MWL8+1iE7b+Moasa/l6GrFv99Iy138wAb4v/ohUR983gUkQoQ9C8hPT+AHwVzN0isKWbqsTc5TK7xXNkxNX9gr2afK//HBpgDM//sEgqzfwJ2j2iWxpLhfzo3qaeDGhNjac889HU9bd9/wL9HvatQ6GF0UVf4dPdrCt9HgUuTMUrGaxKOyzkw1qHkHgwDVZii/UxPnW455LwAVbVS7CxHBvc/ODCT5QX/wswZvQYCogv8T2UP+Ai4TIL4/yckELcj0VQOtMxNIdYDbl/h1BALTHA2tV4JpVBo0vIHQ+jNnQpzID7/Ht/jEbkVrRt7tt5kb3DNryYHTugjdGj9/wC7v4lVMeFrXQ7b/OAwO6+TjRazCY3N/8BAjzz+nzIWZOUy4Uc/CKGX/LdpW9Zf8NDkf8yIqBv/gJCvtWP8bHs31n/AeA3+kGA2Ap/1A9m+/x8iQG/5kCTjgP/HykIj/jw+0FMPPJCvyd/jgnqQfv62S1v8/CO7Oh5A6OQXmhsvqUAtZ8cWn0UF2I/5yIwkwMsLcPI0P4GpTn/XOnefT/R0zn7RQCiGqPmPyIUBtFu0rH0RoJvlBIc0dH/HCATNR/7SBsVgDMvbfz+jm6/2vbrMY65WMFmH4L5G5H2OAG95rhwBjgkc//8gEzf/aWJeYO5If8KH42pDiaP+PlvbwGAUKrfwaQosVsKMM/MZFu68pRHMM+hJocc1akdLLS1K1Cpr5u6NNGMPPyZmnmZFPfvCxDlJPwu+hiUpc78Bi0wjZSGxk+Me/hkcTrVMbuQuYetOhk9lOEze0CPh6WJB93gze+0QEwGfkwuAwV5BMG2Zm4GBtDM+ttjbhLHSW2MQUZvil+9MOSHgEzUjTfssymDeK81jh3Au6ldBwoVA48NBErbINOEWdEgHwG+ulM0o6cCbQvhicwBzo3OkOHdVlB3U/h8DfSi9nnPZzKC031hhD1KKvcU6CpjLYtWO1w4OIZ267r2yD0P55O4O4+/U/HoyFZwe+yaM+KKnEPZD/4AaOykNJ7Im9FcaloYDbDrueo0OjBvyRPtaFovoHZN7RPnPa036/zAQnhWQbAveaDkjGqW9z5OA0Wxn21hYR2Wt4QByUGC1BuUJsTvJIhFwspE5zjNYAnrZPXHo1X0LiqQ06s6ZlKAXMZ1r86PZ16x+FcMfScKapFnyADJaJrHaexhl2yDERCTi5qP3freAyb8R+UYWRkfInHVExu51xJE9HIv8Dzl/PEL+jLUMo6PO//Cx7d/FC6Aj1Ic5bkP/gwGwQg5uSqcBDruAe4RBIlaCXwSvkn2RcJkxmTpCtM/CmOCc/P4ZJDtz/mNoH/wUFz7fwIV8w4m70Lap1T9Je7TZ4zRjlpnoIByxrsSnZp1Y+vl1W/RWf+MjTY0LhAjBcGT//wVVRy8+xhVWQ018DhL6ZJmUvzE7BHz9p0yI+lRY2GTrop5rPTK5bjMD/Rw5v39f8kCqqMoAUtyqrnTXW+3mxxAORV9fBEELi7rIll8cdoF+OkxkR4a5xHQqHmGDuIAC7Pnz+jE7El3FZORv8BNJttnEiuWjIu1/nwKqoDP4GC5f/9BVVB/f4XL/sAdQHppHUMGBT5bqCevDE5SWJGYOizTmIH2GIcjMJeEWlsQOSFMBHRVLXePWJh/2RRtTTj4za5czcoAtqBVUxuEhlGPHCV0mpiwB27v/wEOFle0uWXqNfrWF9x6BVw11RWeeFHiTxiMFhhRk87F+yVP73glPnhdHFIY+JKLuyGZG8BwcNnTPcl/kYKqoaxzzLKnmaehKxiqDEUAeWiVeiOP/gb3pCAanB/4DABA/+GtNskknIQPv4WQOPyBedT+IQEUCZySk8J5kgMB1hIMVMt1ha/NsKOOcAe0/LNeFoX+B9WfV2tv+AwATj/hmlDMfI57S+FgMAEOq1cIx1qaiOoWc8yYGzPl+2WoH+B+wv+CQEb+W4hXKVv8dGxW6Wf3tDILT631HKI+KVgYNonSlyXQr+DCI//nhXBKJFBs/4F4CItJNQy9fkD7NuGVl6dGXvlyj5hso9aob2hklYMl3R8L9r/GBpwFjfwGR4tn+fhV9/+Ph8z+YB6z+BD8VNsJdyftY87ZUkLYsG48OhiQIcP8HB1Nn8QB9UNOdMMLEBk/wvT0gLR/AlPTKwgEjAPL/AYAJ3QYq0+/+KmIWYuQE6/y8YvmBQVeevocjJmDgA+EAIjiObyr/Th/jIwXzgAAAf4XnVgYCAwcBGLxkzhAB/wQV+U4ShrAYUAgIBgQCABQEAAgEAAgaAgIKDf4yF/BP8nGL6v8J4Of+JkEk//xMC5n/w37Nf4KBtJ44lRun1XJsNH1bFoimZ5VggDNjEpbFTGA5QxGQAq7vPDT/GRpPnj/wKF3/98HPh/r2A1/SU1osn/3yIM/k5uqYGgeziYNmLWfa3gs0ejQMn3844OfDzDlzMpk3ZYiH2nnyARzFaNf/gJlK0t/9BBW67DX8goGz/iVWzSYa6SpiOdjNGPhoF4fwOHoB/8Bh6ov/Azg5jm8YUfDywGAziRmxPWlUN1Ff4rYd69u+N1DQ9v8A201/EQCLLPBR2CoPAYDuJJjvwmu/Ih88NT9OuFOmBAsGfv8DvbP8EgRn8vbTX8Mj/Tr/MYEJ/Ap8H/C8W+FfMhxb/8Pw7lP/D8W/3Q6TeOs3XvhagSyWxVojioI2pi8FtM5Tta/qVkzEH+EexJ0eLz4nrGgT1IE7Chzdk4fhOhS+0346/ngSmGogYn/lBSCF85K6hChP7vj7VBkzmLpBPOIrcE/hcyR2tY2z+ADauzfYYD+FAjL/BQNjkEA1OIc8BjSgx/DOZkeJqTY5B//CxPd/goOE//iEBFApnDaqBCXz+BqSSWMaqsZ4cfelBHWqOrQm/QzndL8AynyxYRilEVyF8rfyf/xYhJEqf+CkmFP4hAsJJdP+AwATcgwzKXM/FxeiUSf+CRgAQj/LCq1B/FqxAV/tg8nRN/jwyXy7fyW7EcDLdj0/34hJEgH/vkJIn+IgWP/mxekVyf6EL0ipNAkEA4P8+ISROQAbPwAcPAAcZI9psFP5ktwBENsPMQAH8Abgq7taG2/a02kejgHfutPlgKXAzdUhryZMDbtYqBdws6R7Rr1G1cX2vwg+8k5ORDckz/gxCSJ9MLIUFWM7NI5NT4AOD+AwaIGAwDi/4cjACdYMBgHeBgHECAHiAQE/igjBW/yEjACSgKAEH+Ki9Pi7/wUXop/5CLcr1ElSIiRHVdAwHY/wQXpF/P+REJB8K/wQi/tfP+MEYCE0/yUjAEKj/AoCN/h5GAh5/8FI2GqAAuIDAAMADgBAA2oDd/wCCIYDngOr/govT81QHY/wUXpwgIEFf4KRgKisCNP8FBenUkASj/AoA05AT9/AoAi/AUP/+BJCT/1MXp1f+2RfEGdxfKhj4qtEyhaJh05UvBDFyVMkVKe13CYykyfyjHwdkWviNVaImlstXBHUjeXcPp1rBswO7UXgiUMWU1CubTeH/IBAh3PJnjZc/tk6x86CWhlmNQYYttSvX8DqS4vjDh/AYAI8f40DVi1G6LAdugMAGTzZR843wM1cU1wZ/AYjTBhrG2D+BzNv+IAEUCAnh/j8S1gMD/EweCd8hqtBATRwgEXSyNBi3y8yn+JiTVWI/wcHgd3Ib/iBeLe/x8HhEX+Plzr//DBqGoO5vk91FWDfw/WUbA9UIseQa6NjmmoOBc5YH+ehlWL+MSpRg/z0k6wAyB/nply9Cv4eA8Av+B+xIRL0Bwv4eKmPSr/1kHkjf4iJMYf8xJO8n/AiTGH/B8xG4/yzFsgbJ/oIk2hIyj/PjI79yfx7dmAUJrq4ci88txvv8NhYf+phRif+K5w1JeTiwd540/0wKNj/40aDAz/Vh/yAAkH+1hQ8f/MhKfaBfzrMugX+hCUUEJxP8QL9Ep/hxdpb/h//A/0s0PCf6GQAoP4vBv/4iDv/4/oPP8UzDzt/FZFl/SwR//hpIFP/xMjGx/3+Ef/ywEfgAD/PwQfgFJ63MjE/x6Kd/waSv1ND9z7MyUFtf2PqTkn69kcIESRi/5uJ02APPHLThsZ2U/2sSqw/xYZBmKkPahZu6f8nYlVh/gnoYmvpct90BfJVeBV3igTgBjOcTuQbwPkK5BikksMOFtMQLetl21ECe5JctskyZ5j75ClNpArk/wcNBzgrCASu/sP8BgAjVBaczKf4mXZxf9DBKaoAVV+B/CgBDfIU5/wIBDi3+FCuogJ+Wn/FgSmuFIv+XkwiX/Hy7RLfwmBU/76C+Wwlg/2sNHOfxiCP/5ACosFD/Pgtiov/OekwiUIBP9iGfIUAs/IWnq3ZNJj1RHofwyTXF6x/cH0oQZ3b6sldlHhNZff/xYeNF/hMP//xkiYoSc4qf4CSCCSu//8NHOfxXba/5+CpVf5xDg/8/F8lIGCf4eCpVf4yBWdQ/isTH/ixG+DJBkHJa2BP9dBUqoDyf56FyIAM4OEnrar+SwUP+LQSPVUSe1NrL/nguRA/gM7kv7SogAF/igZbSwH/AguNV/iZq8T/xULjVFSgAZqqSf4yZp2f4yCIL+RRX/+HxZ//TA0v4GQEmuBYv/jIl6iy7WRhAwo7P48UxbzR/4djeha5/3sNL+KGAK9VXSZ4oKhcNwtq2lQvfBsKemsj8bkEAhUIEb074wYg5mEpkaG2siRSVsSprJTyIG16y/62QFrAXFmBAGC74WOMD37zmdrvLa0DY0Gp8A8Jg6aCuRKsH8Jmyf8K9YAF/A7beN/wmZ7fxSAj/wKlBDw7j1IDyNwjk/+CBxJox/gpelufjSJf8FMPtP+NhViOMYQIOpYs/gMAEM4MKNrD/FQqxGynR4kmP+CBViO+AwE/hMD9/x4kZLzrr0oGR1dWEuRbQB/5APRA/4RITwlDpziMNMsDWDOw0PAsF1blKoNcTf6/NZonvOfQ/RHbFr+BP71/glrw/1wDaVpn+KhVqgEDoIBr4QC2IgygxCmAf8Ugq1QB8n/RBVqj/TCnIV/xwVbIAnijb5Bvaf8BGylX+DBMy4L01MQflzQP8jCrZH+MGG2mTuMopwsD+FHJz+FEDsBQABAP4UDWAH/UgrlZ/voVhIAFPGANeuaHf+GAX8sUhTIDJSOf48Bf414FDr0/hUF//hQR/Af5NBf/8JAuhzPVJoGEbvprQu6BWxj6sehYV2+CDmN4hc3BdnYGh9esLtxLtahrzhy62DgS6pv6SiJjqPCKl/FL9+XEzuHefZAGBrE7uTlSKewPglE8WcJYLE7V+2u/8UwAoNlac9CS0AYGwvwAoFIU2T8VeXtJ3xPuseM652f4ODKwf4JARv9cAU9XHf48XjDXD/qRywiMf78BsmyT/Xh7baMf6IXjDfMomsf96yBslGSGszDTaDQ+WEIVCJi7uLZ60svd4JPumAk/4ZH8XP+Cgtz+WPjA7t+X4/x4kBeyn/7BLMUD9iaPFqHiFVny6Vm1HBllskBpif+NkhwB/wEGrtM5Zh5Ih/gMBss6UfXsEmSgNatxkm+teLOg0RGVNN1tknWAuAiT2yDj5P8djw/x89pFgAAGAlURCSVp8jLZJO/4qk9lQibwLqoDAhFb0mAvGEXkl/CuvdcM12n1YMvr+KhTbF/Xm74aA/hUqS/gbnU/iGU8FQJ/bc3P/hYV8/wUGcufxFLBSQqgGeWrH8DXz9Tukjx00D7vD8jwvlF1/H13Fie2/DPcDCTqkVzsNBwmHVaF8CVImEWY1wwo8SAIA9zrPX1a3+AoKIedv8EFthcd/AgAJBMNRKf4mDIR4YHD3zyieSAwPyEwBz/BgUQ8CfwGAgwn8CAI0GA1Bd/5AKIe/z8GSr/6IhHwG+If9fJ007pFrYO4V89fxLm1gTFNPa97qx/lk6g6fsQyeR3Hs8peNhMwzIsR2ZCiVwqkrrtAYFkqbxzi0xgQHZYgcF5kcN08F9YMCcdUbs5LzT5c9QOh087xIg+nIKPaTJSlZ+i9C+hmORm/2rvDkEGwdhEQsHxd7U9ccf4AjK+KmXl2rwRQtDXPmtjK6tPT3dvRVeexZGUWE0iw7xi+yaMnCEpgXEynF4rrJpL/AR0Vd+FM3Ckcw3b0nSJAEtp8REj3V6Tmu4IppUJ9iXS5/gIfLKeIf2v2CDrH9jAYEBqMk1Lgd+iQIp8OO85vCkF78+QVdzsTw0VMNO8+UBnaZHEGkhhUwybjNEKhdpa2JhICIUYtcoiFkc/6qfkcGWNDhOFljcSK5eemHSTwWya2FeM4VPgX14ETFLKHNyw4e8VculjtFphzEI6gHChNdysnbVt+GbRV2iuaKl8eXT6jG1AhUrq+vmf0C0gAmndIh/PFgzFjU/wEyhZOkgQXLVR/GP2ws/bGBfTycqHE6+J+k3Xn+AhVPkWohZSLhB0a9CF1oAvCp7V1vzAY1QbxxQ9CZ+7KTnEvBPUOcNWrwj/Yj6vOfUiqXO3/h4ovlfYODRggmBMeNlxMNN/xIj22KZhjpl2x7lhz01v6HyJELd/6PRyBZJlRIg1qTfpKAZ0DDA1RkelYaKaYVlerrkXkQB4N+v2XoD8qjPU/2AFEPT8KCI/8CbiYL/BIiN/roKIeHMD/HCTMV8/wVpZAIn6/OTkmYoTmiGvoh89cMsEC9ZdRJDDW8ZDjv4dSnlnznKkKnlX2AmvVGxeo5eXIjf/ASBsT5wPw5EmkfK9k83NA/Td/goLLMrYNJg/W2RVphy9RngJHNXENG2L751LzLcEqNr9TnwuE7f4V1ggv+BtYL6/goDPEPqM/hbWCC/7C1ggK/x83G8/x1rBIzZDVydv/w6qHewJ/rg0NIVAKK/n3WAgYg/+Ai0GfxCCx/01rBAgJ/oRFoMqQMyAYD/PiSsJ0H8B6wW70AG/+gmagrCj1KunZKihe18x1i3DZDNiQbz4+lIp/BBRzycuFP6L5y70Cvc71rW3yR8AB4zACZXjnOwOoZG6uXx+LSOfwLrBbPQpvTr8AHQqAUV/ghqnP/xMiMkoAuAQBzAYBvf4IRZJaD4K/8bMqFn+IEWSMDAwCrAwBB/4EAXP8GIskYSCAIWAFf+xjE0H/ICxRaAqP+CliixIAQBVQCA/xQTHu/xDrzfAKAIH8xbWX+PBb+L+BAMsCB8FAIAEAiEAgP8TB0en8CAVX+SFivnoVFJSAwLNBQEAAMC7v4i2c/sFASADAu7/GCxXkSgoCgf5EMTI0L+BQES/wQi1Ff4KRak2D+A7B7iAOb/BjALiAgf8C7PhsAqP8AYAmgAOqA8gDyfwHswjIF/AmHYgAVCBWAFYfwGAI4gEZX+CkWpzKAs3+Bdo7yAvv/BSxXkOAZL/goxMsOQNC/wFGJljGBt/8CgCLeBvH//kag3//yS2J//5JbE/hTeCJMiL2B7CQetSew94y7MqeUHUjwGDMlu7lJZ4ULnn9qhA2bKnjG9ju446wqyOQVt2H4dcFQxm1G4MsEJ20JvMgEz4g3gjF+J3y3kvySq4QoNHzNb8yWOZEQ/jFfCzN2CzKNfEuxMsr4GvQAEXkAvGj8/ijeCAfvPBCPhroDAi+qBEInl1qsWMxpvu/VvJ/M+7fK/wgCIvfzIvhWJ7gCaIHawZsrrUcBZYVH+AjB6mtqnbWp/D+EVbH+XGEd8/gVZcsG+9W3vOhA4trT/ATfTkXOdNctk44nUlchQKPG3kw6sgjquu0D8KXAx3YMPkZu1khWQTkLo4F/8BpYRvBc4QHfyFg6Pl0n/8VJ2YVP3fwAVv5RAMf6+LS+wD+BQGb/3RaF90lj/LIkLTL+E4gf+dFjhQBjz1ZHPz/bxaX3/CYU58xWAP/xbxb/wiCR/4+Yy//8OKraaXmE6XT/xWDx/5iMjIL/hQcDt/i4tL7ksCEMWk33YhFvaCfIMf6TN+eAqR2FH/Bgsgb/ETCgjU2k4FpFAfwKjw2tCgClqP3q8jdupktGxmmDPx+jv25TGmhjUJOtpJRAjd6FzO9kpjEexv+8g5T4NLEiNryCsf+DBrxn+ImLzmftuU0oX+GxjBy/mNtHf8FBMln8MjGDp/MYEIaMS0OgH+LgT7H/BC7poDS0IASQrU38BgAlfBhZMef4sB5mwFkbXDEFf9fMC9X+Oghnj8/r+HQB/1/hZDM/wYCgdAh5/DELyAguvov9iQ/wsJzAQORktOu1nOGH+MlpyXCZA90ElFj/y8Dvr/4aWANEUkv8dLgD//7gdcV0o+WBiRlb6OyjrOSVkbZLtmbg/z0aRYTf/gIV6g/gV2MuWf5T4a3PFSkeZSuc9ef0hY3yb4WA0IOetrBIxs4Z6Rbv8cHfiQv/tBexodAG1ETQQvU5PcNRBwRU1+eDNQTrJvdZaSihlqu9YlH7SC7/Yxz9H//5RJQP+KAxd3+AhLFA1/gpzGvwf/IBT52nL7G8ZLI8yyKItk6RWQmHmb3RIC/E9oyKktxrgNNDgWGuG0xUMGWHYsFw3K8Kaapq5iPOD76Rt3VtD8Yah0J3piTc++N5SpPhQEM82RZyYqk0W1Z9o7CbU2cnBPQswovs7P4bmksZZ0jCmtXBFBc/ffTGUzOe8IxrAIqXM6Q18g4UQoUq+Xeg+oRI+BUOLI7jpnx6knpA5Pc3rIUPhqhFlDlfGc5pIxG/TRhNNPI+Ypyz+OL9yMU5UXGiGj0Im30X8QAb5RXQ0fhO8oxduTg4KAWMeaQnGc/NDoBYCxZG5iZBZD8BVecOuYfAZirbB4jVRrZcbpVezsSIPN+2bwiI2hcrmLgf5SdpSBJHHbf1dMLR74e17a99TJVpNVLkwsYDDHJstyQ+qGaLQo2PpxNBBf6zif9yeqrcn3CWmTeL8H4ZgU8qhUn6cM9T9kZyywKTGPd1JOqCjWw6nd3GnH39s2xik2yis+eIRrcvfu5AMG7TCaAoECk++IZ39oP+AilWlK4IOF1orZwCDP6E7llOjOexhJRyffxqKaDmT4f8xAjdv8KHknh/B0IofwxDYRFTd1EcfQH8Kncv+ChGCn+IIjyGmX0f+CYlz8GBlCd57tEkpZJZdFNFQYUIfqVcXHEuOiBCPEXu9jNPPHEKv9KuVllA+KhSmL5Dk0XiXiOAR5jHl/8DoDeoPEl/AYAIdfwzFT+qNQhdjCwGBMh8MXTwJK5CnEwp8wcnBRpSRenr/g4FjX/iACwpBjk9yT9/wYEHbYG8ZQPHlNWHzVh7jtY5nd9Xzz6seEPOsYiX3FCK2/cNQoeiJ89IkEdYAj4R438C8qlasXz7f8DxrH8QgWBe5A4mQKsBgFQrAJd7YLo22nz2MzK/s9eiF876fwRL8/wQAjfy3b10hxgFEylP+AitGu15gJy25b2HJZPnbWVWRf+CCpv+ZAISJAn+BQXXEpUrUBngjk45yLtot/wEYFP1FOL3hhRxEv2EjCo8pHaRLyuv+OhdeI+/2gLrxUlj/lYXWCHvVybEscdBH7sfPBa3Qhe4frgAk/isAf/h8Bf/x83gQ/5eSCtQY4b/+HMuIGOG29/jxDpQdH/HgRqsHs/8PZwWR/oAXXiLRnB804YEHSnNw/Q9xdcMyFjZA9/gcQr5vPq/gMAE8v4GBNzS0vo+QjyAwFxa1HsHNs0VOqEkGFbvhd7Hccpl/gHU0v4fARIcB+dVki1eB/gYRxyHe63EO4sd6ieC3EEjDvTStiUUo0FawcFbaqeg/smDr6h5iz0DV5MANgRsbrbHSijTbmCWGv4Hnyv4hAazImNiEd4P4bEcXT+CgPf/WwJLPbdX/DQji6/wUC+fywfJDNzPYn+DAnKTMh8HazTgxfofp9wT4oV/x6vVXGCE2pGRQUX7/wEJbL7SeL2OAzVIunCoM2NRiHQcKN9Icwb1Fxn+DAymfY/w0GUz4hsf5AqwurUCmBMx/cd0IpNvhVZQwX33q6q2EBgfwmAOfwLIAH+FFAkf/WwarzqB91/CoEJ/goNV5I/4LWChRg/3kGqPOoH3X/ug1XkAOP4riTlXTmDJvmQWf93VIUkaZkCezEl5/x0Lr6aNY9fx51qgTGgfx7M0f4iDVef4TC+/48jUjRrHq+0fIa/wEFO5cVRY+aAMhIOlyw+70qM1G0dy5sKP9DNIC8I/w5nsYFP8SXMx/T/DVkcb2kY8r/J6Zrq2fwUipQKZ6ScL40B/CpJv/gwFxv/iABFAn+c+Ile/wcGlwuer+NAld8GtwBQlk5Xq/4CMpHP8BKhqUsmQ1yIZLa1M+0l6sSbpyha6uZr2PHtteYwHC3nupIAexkS4F3/xKAWMNIpQVOY7//DZTb/Mgrt/hQKIb036UHUaZXbSsh65wt8GiuySyNHBxeZpFDyB6PvttC7+AJVGfnIBPCdrjgcA+2GItQCmoiEbdMO9/wQJF3h//jCmhkW5ogtR5I6Nkw19ufxSpMh1/1wpG5TymEy7r87REnbF/+AwST/ZTChByel/t5hQg41P/4Branwy2wLQqFGlofZbQ54Wv+AhqE2Iutwy5NumQsJBMhcVyJpAmj7we2cIrIp8Sh3SJ3x+b4DjR4j/FG/gA9kQc3OQ4OeVgOU/z8wnQXuuy+annAtizH+AnBXEPw8GQohzPTiU0o/4APxMpK8rJvS891HlZUVF9yhWv+OnrwrkDoV3gFOaOG2lw+mH5lpYog37c2P4/gJJUmGKvMgUPRziprMi8T5vczjegbdfkrSBsmPzxq0Hif8BHQPUuIXvtJGd3K4czNVnMQXmtZ9jNV6p7aMzGF9JjnOg0f6AEpSW3eYbPKvgC9uAWG4+jWpkOE269VkFRcS99yVS1QGvsE6zWYF7L1MRREJle1sp1gSapDClfohKSucBLnJZhfo0/z0sU0Gv/jDW1M0g7fd5UVZZKlwDZE8LxDIYVOaFj+AEgiSxRr5cuzApn9aQ/4GEpP9iMKEHnoX+3mFCD/ATUxf/0BN/xQLlWcHmQQtj0La4P+9CTe0CgaYTZwu6uEZwkKA9PsrhUcSi2XciGTCLMNknGv8BFrxGZL526+pI/15qquZT431A6Dn8fyO/VBk636Zgs+pJ4QSk9tZZIlRXzAkQcjDs2q5Ky1bK3Ia6gc+3XRGcSF9yB4RmORhNEITuxkVEjE3d5D1EYFt8h/8SBslSvUGqmd6F2YwsR9OQkUjbokCIjX8Yk1ZB1SjBwCaiI8rJNkKL28lLhYLu7H8EPAn8EEoH+uAroDE/hVPt/icowA1jtKmVghf/BgckrrmYpc2YT9+epyvLgd4LK4GF+KZhDq4FvHrAM0EugvxdLqGKAEJMKfPzyIeYkAKvYvf4sgUh/FIJsUgRo6H+Aml1f+BlQvbEokPRHdygMF2Lg4kufG88SEq+hM2g4BlQUR/UQ/BAIKh6ROWak6CxkzW0ZWegLCnbAV2Nb/ioGiMWJNvm6AwIh0yu50uPY7tRA6Kd/SgmXuyssIP4RBov5blU4bv/hoInc/5jQmf8GCO1v8BrQO/tetuPS+8yBGms9U9pS2A8Z7qYj2q4NUTlOB3NThW59/XbUrjA5OwsVhDtDB2U1M2PAzjcX/FQPD+JQiybkB/i4lzdk8/IxLm7/CsCeEmX8CQJ8vCAToheX8BgAn3BgrVCf4sJxrP/7iwAwC+yh+abZafwwAH/6sQlKg2h5LukdtUYJIxFkm6j/v+ABw76oyGf6I+N29gYS9inHGVVF+Bn7tWKfAaA/nKQEgxopc02SEtYTAdzpYNyDND9q/k/0ssAMfxfAh/+WCM5UFqgAjYnT+CT/0GQqHWRGVW/hYFvO+VwDMqIvj+EtFHs5GVvohX/ZiwAx8QH/llgBj/HxOpmyA6tr88nBf/8c9//8DAP//Fj7E//MhS2MAf5QWjhP+CLQYn+iIpbG/20fV3/xYJf/w6Fvncxin/a7USP/3I54Z/iUO/P5fjtLD/DCEcU2ubK5/4iPq7S45MTgzNzb/Eh9XWXHIZGJocGP+HkI5MxMDQ4OTA2Mjg2NDUzOTc3MjUxODP/A0tc2spianLKYmTg8GqIWuLiYHTKZGhyZGBsbsiGcpLBsKKr/4mWgjDIx/xIfV7fwLflcdtp90+O+hxzdZvRDolHf5QykPsVeo5ytuQ39hWpXaG3n8MlkP8yFTn+ChYzB/hQ1J8P4OcP/4YMxAZQwhCJs/4GhZObK3Uq47Pr4lEFXR3PD18QBrboUs6D3xOUDb5hoReXIjH/GwLe2LwgEsaMofwGACLMFat3+qDFmIICmSu9qLg/xYC3tgn+Ch/3gGP8LGLMjG0/7qMWZP4GAqf/rGu0P+mCtdQH+GHh6ZkIv453JBICzQ0Jvv9PGLMn/GAVVv+HwSMH5TzN3ehn/0dGu0Pmj/x3Ecf6+NdoenA6bbQ9vjUyqL7PK8JWcEb5jVRfDJ/jFKt/gSyU6h0nd5kJtoVoWB9id0FY6A6C08FLzgxVNIzI6j/ATbFx04RlozAxjvDQVW/0jRBdrAk9UcSDQd9/FUQggQd89DcBgTZXrokMVVA73dUX/gJufQJ+zMbYQwUb+EohD+W908l/hVH77/4lGVIbqDkLdnvSAwPPHBxkZA//sG4AijUJMqEH08dCNdtSR86IheDyFWB/z0MaO6LJfwLSRd095VWAD3c6KvlnA1Sn6JvRMH7vTfLrg5TNTpunFgDQRf+NkG4SOhAP+AL/tC/IQD/wEQSuBUGALKW/60QbhAj+HwB8fuPszK7xpVQnREpF8awBAsp7c4TsRUxUBgtRYMPquYj+H/JmF/wEoaWTkpkCnz2X861mRH+XCPr4p76G1OXbxBl5rD1BaK4H+NkRTZo/14jhEAaILyRH1IvopAAAYBYZc2+Rb29/gIEmJdkQSLUY5k1lgARK/gasdCbYhP/4DABEf+GIEPyPkpf9pb+BtbnyJ8KfS7pJqrkALA2JeyNwOQ+3nQCXFNNrpAczS5mDqjS/xs3LxYWQnsIBLGgYfwGACHMGAlgL/lAWPs/x8Yntf5SMT8r/IxnMD/hhv/f/1cg4ohH8QUU6PgVH+Al/7iFogNXhD7A3t4e/ddkyNX/ga4QCcrZI/4DABCv/GgRIVZbDce5P/DYvK6/wUD+f68CQkyz4K4XEpW2dj7OLITyfygrCKsL/wY1GePH8FBQP8t2NERn4H+MmF960/gONK8f+MD69x/j4FxK/1EPr3AP3/kpY2p/gP579stvZHnAPpm18XWLCjywxQkU0y0Nb02NIoSRbLKviYbynCYUGxt38K6tXxYWJyHdjLOsF6ov4pCuwdrsv8BH7ES1/CxFD/AsrP/EIAigTEY7mMsb/8DX63z89zwttO4dYFvnGOiC/DPckuyKOBWDcq4jAYaoLsQXGQohiDtCLTNJGsW7Zc2qwQhmqEQZj/FIFiWRUxkOXlAYEvUebKXhLhh/4CbUwgz7q9KiFj4T0hL/gcVp/h8D3hsx0uZlAeYD+Ggy15/goBF/ltsPIX5/gXce/dH+AX/QAap5gKoLwwVIl2VjadbUC4cdUigJPWcWOdylDcU5HPD3fMJsKBxGdngz7ms/2nN8GP4Gxj3+Igef57KniZD+AwAHSOGi8ijtqCB6azNO0Y5fMo2nIP4IEHv4IBNv5bEHrXy/x0bDxJ///jkL3/SRyF7R/wseRv+Cm5FoGmIQCQssH/gMAEEIMp66n+FvI0CRP9IMwgj/HLH7/kAG9s/yEFr0vAd/pYwqT/04FrU/4WbHm/4vAL875TpGchpIxbP+GXRy0g+OD6v4XsUy0+odKw6bsoA/+GaAk8R1JESTX+HvVFx/yArfh+gCpDFlDxS3UuE6KAbQawTvONvQhLaaSjLFTKrXknouqQVQJ75qQc5JIO04E01jrn+XQ9P0TB/FI2ZeAf7SZaX/DYuq9fwUMN/68CsXj/HQYyL9fwQNSEcGIrbh/lAMZF1CCWzMFcr57hAANhIhxwp8ZYZfD/h18YAzT+HXzgqWP8eJsAP8PAH4n/hhI8aJsst2vLW8JJeb0wduA7f2rNhRWZ/5wHPCGAZ/i5Y1kSMF7GL1eYCC/kCtI/gT7/9AXs2eSrx/j9rBuc7Kbgcrr2ewTNwFBTv02azC5tgXGyeKZml1CLvePGes7oPEDMIutMxVwXf+KjITZJKPqTYDAIoDQf0Klr2/1nORIcQYLvZASlgG/gS7EBf4iJFogWdBjT1/4bIJf4LJFpr1/hWMyUsRZ9OS/xULUMAedBBnJCAUyREkGJSEZ/ioWoYuEDpDB/OPr/XgtQzwfxWvU9KTsFNFc4ndbC8t9z7e6EkRBL6d/yWAP/6kS9dyD/PjGD2C/6WDnlv4fHbv4fe0OQ7rRhrR5O+LjvcaM2lMzYSvBRxP7/Av/5/Bf/8IMKSvKBI1/gInNS/VwvnxX0aZ2M1yCvfAXh0BuM3KHSl7RMEX8jEXC/hQy18P4OHz/4Yo/P8BCTeXCmL41/DZvY9/wRCjf+qhW6QUqymiteNTX8BSsmb/ARn5AXMqAwX+9uA2LHlw9y0VofwLxSKX8EaMvx/sBczQBWnt9A0vhL9a4wQW6U5GYiaGfed1+CM/gC3DHdMXILaR9wUQQdm2vv8cJyyt9wgatNu//AYAJ0f5WH68ft4D/LItDueTQT7a8hy1ACVt93X/4cyBa2MyCK998jDny2L/AXxVYhg+Hz0D4UkrKX5StSfI8G/wEivIRu9gV/4znog7I9Gjk2c48m4hqRsKFsT/xSiBH5hhwOd9wGAwkkaEIVdJJYzykdq3x7tXMVgF/L/FA5yZo44XlIr/CyCh/AddM+fxGijyiA0qsbUsBgf44X1Orj+/zyD/SDB4snwP8CkzQhvslSmYG0HsMBhI4XDn4hmoIDBDguw1q9oZczBjXbAfD5dc2NKyFIlnPgJkQh3vpkKIIPu+O4M/4oSa/MaKx0IXIDADCbBfK1M8MYQJHTCLY6ZCornGT/AjiPb/goUK1/mVJruwr+FRHy/wUc+ffwSI+/4wKwv89bzGCCgoD+FCZrsggzlhAKWRIngwpo+JTt18vJxHVH0LVS1bwr52E19JwVsf8FN4sf8BIvsKcZsOeWYNdj3YaHxU9FBnAUK+ymCMrBVv/f8AZCXhfUTKnDT/jhQERVv9oMVQEEif7CIvngD//yb9p//5IqgP8cKq4dkdxmckTK9BJAUh4WHNi2UsHTq2Muhm3zwRNQ2E4qzuRwi+iAIaxzeWXti1zauAQ9DGswLzwb2gcxe5Vn2QJzGh1sOQAYJeBVozjW/ryUitkpJKF3IHE2BBEH0NbD7T7CePQBH5Oglx/gJYzgA3KcEWasqFGxQ5STQu3g2OQ1Go3wmAnh8kFTqkqFF9ZJ9GQU0pJzmU+dymawUeTiZkyK6uZpPMzLSGDqiBC8gF3KvY+yiGhRjllrcuL8Ph9JTBpUUrSgl/h8DfI6JVLSSVPL7BtGdgqCV4YUUdC+iD1m2m/qN6rdSFCA8ALieHcRuZEoqnujrHzU9RmCCqbDY42A7s+lE3r8E5VcbMg1RUpch+ILAn/t1KIX7XCp9Cz77Oe9RColo7F3NpILAfBGgLW+W6osW87lekXDQ2t9YrPUi6u1NVhFs/uq9QC2VQYEs3fuyIPpvNJAyn2C8q1A+N+pbcomiiZHqotY8BgvU2B9LOlbRm1oObpTxCLvsy9fWAuIqiLnp6P/AQj5xPYuBdy97NUhr3yATm3wIEn6dLGJWCxt/FLMIfXYbuZ5PAYCVm/wAsYiH2iXLJfTlDSBc5eebGocwdfwKUNvn8zLzkRafwKnUDAUItY9bTMFmsAkUsCxWKwOoeFwAO7YCBh1c85yjgE7dv095o40AU3B8dRgj8Tkrk9aJ4F0f8VNOsTxQUqPQGBoqr+VqDjQHnJOU2Wm8SfQ1EVtFgBd/8SsacMruRR9s+n/DY6u4/MFp9v8w/uPoOA4sUahUrjE2KHUuYoyNgAZCtAkaZP14tr7DmjNg9/di/44IUE37/NhCgmg/d0uWcaRWWwDAyAxKJnbr5L+3+LFIRN08NIleO8zr5gAwBOVnXWcFnuzRaCwAELLJkQ+WcN/wcOPvBWLEOn8BgAj5/DOychNPPKn0ffwsRpf4KEKxv4hARQKLmVTGVtT+GwvZ9/gtuE/1oJTnRghf46DJ03j//yCZB/pBBMgLUb+BLYf0IMwbMktZVNdUXC5HOuBclW6xN8+fdPQOPBftqI0E4Iit6gR7/cBwrNnDc/gjwGJiYcjatbBv4JCoHv+AwqWD/hnjvv5nSDjqE4DA3b2RxoWpqnLAVIVT3QRTKn/dgJH+BhGD+IAEWBpHG/J6ij/4FjjvQjm8+cL3tOMt1Mdsozv9n2+t3l7g8FL6axdLzbp4OnDyA+lXguZ+VHB9zg/JTDvm7PqYD1i//ikCwQmnnkwCGgMD2ZQR4DV4O/wEdVwavOWP6d/gIEfqJiFzT3+B4b7+CQLH+WxgKxmP4aGh37+CgTb/XgjQef4uDGuAv/QxLOD/jxRDwEED/Hwl5egOIwnjlc2J35Dw+Z1Iv0G+v6RXC/6KlYrVRLRDlu5ho2cNlgExffKqVI00RBqrn7llgm2A8mrOlNRbS8iZsIhB/oIr/8h/7/fIf/SDDpC7i38CpiBOkJ0k60JmkUsYbuk2CPc88M6HnKRQcOtOOu6Hn1LadorSCca6ZZnwYFYHyyv+r1smT4yZyH/FIfEDnfpiB0V/CxPL/AfuS//xEAign+AmRfvgs8GkoDAM6BJh26IjPxq8cZCZIgPpe3Zbzn/BjjlME5WySfwBuLJb/ggsQ1Ce0GUs0WOC+/vjq3OCXSLMq+aYcBa2NN/D8mIDA/xwsr00n97V5y/ml9hF4y1CVOQLS0Wgbddc37Q0X8ecjtohqH+OmYbB1//Yxp2d8oLLYKwiB0qSpiChypdveuAmGg/x51Dlpbt/wK62laIl+dG6Uajhu7DfMCYH2piR3mli7s0XrCLdKzfOaH5esyCpyYSoTcXqOTQ86h6hc0cQLlCn8Uh5cCRVcpqcf8Nj1zz/MY5T/goMM9xfXxOOMUJw+8hN9mT/OlsFICbrN6xf/BRS3o6mEp/CYAPQwQPlr0qAEB/kRg25cf+pIKnoz/vwOvHVv9eItTIw/6IUuCvQvb7nwtfx7TLvhWj9HrLas3DGyf0DGw6ufOSZ0jjsqPVfvUEVByFjueBuXuFZpCC2n85zvpv+KCzWVE77ou/ygMAapaPiump7+I9BJCICYdoxXszDgv8HCkn8/MpWTEyn8Klp38CuHH8RD2KITSSYBusB/ApliZyhoNDjuGiCzAM0P/4CI1Vk2RxRMK+YSqmzdLMOKD14PCpnkInsuq9kR/wEK/712thJdMiCl1exLIAf4MILqv4iIHZYTZ+soQeAwDZLiTvPPvrHIeJHHbCvHhEuOV15Au/+JQBFhvvNNpoFuv4F5+fQ3lE522tbIwAnHQ/8BBUUNFR5oL/bTZy/+Ahwv/Q83OCc0p2LnW229N7QuCA9/O02Rmy4Z2p+/g7C//CAokv/BJo7/LdX5KO38NBE38zgQkUm/wyjzjt/f9Uh/pBgqKfVD+BZQo6UoCpv2niBYRGvexev6OIDY1BLWYfRFRQuRuIBzB8KJt/xYq/nf4KOA+/8BEF0j/NDPwj/3+z8f/4JJlwKV6P5/Z+ANM9IunnrCMwttSxcVzeXYhcV8ciQeafrummAmtv8BBjHyFJV77aKQOzQxKQkCZ1vSnIZcjCiElEdZhRVeSvRQk8yFbmyA31kh97UiYkyTRJpgH86LSspxG8YN94U+x0Ns/XMvAioxYd1DgR57ClLoOfuru61nmsViNSnDjuhfwArkIXwvjPJrnUdAMFJgM9I2E3qHw15/S/UKihSfJxuwcBqJ/stPC+2R0fEGNz8g3GPfDhmQ6GFfKREfLRIzD92na0xDQaIX+QG+px/ITyvrvwUL7g9n03QAa/MSMFASs6wmO+kAxMyglmolFiAyotQ3L5P4AvSeQBDeeqwYyFmRDt2ZlrFG8ZzOsZ8apnTbzsZNtL3IgOWLewwTuen/5BSdL6qZ8TNbdmDh9TZYpnWy/VsmB2HP74jpAXXa0PIUEKOpNxpaUHZP7xUdGYEURznxEGsOtaRBehhaO0JL3iBDQEKetPAi3JN4D6MWQ2Pkx2MKcEO/IIopekuB6nHe7LWm/IYFTUmymvrpeR2UFLcAWGVsaO+Od5TrktuIdhrTvryICPf4pQEsd/tuAyCGAwGnTvoBIXa7E/wEyLtmWafyapW60PE338UKU0kiZ69JV5AYB5w4h4zalXVfq3KP3GkeoFoo8RF/wJw9Ax/ESnWc2VxfhK1/AxHUfaF/tPgsQS/tUVV9lkXv4FMyGp0Oz/mM4st/gI/TPKA2wQzv+GSdt0/mlQg/hki+/mQAhDaYmJz+BrtZCZdWq7Wr4vqD69rWW7HnSQCVPNn0ajmP4vEWaRL83d5G/hQsL/wUL8HfxCDNVcSLRkFsQH8KIHf4fxKAiQ4FKmG46qDJAYHral0h8U+xh2vzuPbVhNwQ8w3NuH+Dgn8onhAJkfXT+AwAR9gsnp/1ESGzgFwoPRBA5QQf4MZuwvRGpem4MsgKpuKernM562dQh31GOO1RBIwyZEBP8pGJRaZNMhD4nk6d3KPJdVwYZI7BN9z5z4Au/+JU3CGVE9fK3gPgMC9Dey0bMvqGwUqL8lqbv8BD+A3TduopfwNLr/xCGLwSZOVKPMQH8C24SUytmISAPb8INdo5d0aoFPFYvKsMZ7mrVHgr2EWXFUK2H/jgwv+nf9oQNmEB6bf1U3Ii/ukL5Vi6ZQYV8q0S8DnNbWeIOJDo/wAaSip0QkF/sRlkr//8GfY/8UZZK4hWdf8KKwYREZ3nPkAyLq+yhIS80X3a26j6MaSrj0ZNf7ENTyNeHHvYaCw35DU2WF9if7cu73bQIKX4Wmud8soV94HgqemcgSj/9CMS8Q7QzG7jt/8BNtHGTc+3HrYNkbmWisY4d7mF7ykYFqRHzKGUloFiBPN+k59K/+iBzEI3nppzMcUpHA3g3pwG/UuiDsvJTKtLhS6F1ZZW98jfSkNaeYGrXUUnWhCGbzkwN+l/4COa0wjqWkvtUAwmQ5yvD7bg9ZVysEenR57JKUoCABL3UCzSCw5/yAZdPFkSYbXcmobxrwUKxQc7U5oSaKaqHhnQzv+AhqWeZWwR6ny4XcfchsMNkMkZIDwu6S5WoSFre5nlCx+un15e41H0AEIosH/8fhRdeUi9haNNPgQJncIcFH8JjBS5woty1Q5diuhogfTfsLebT3Uuv8ASUXSejSy7sOFIN17z9q+T9WIpM0mSjGvnQO8TJ0/gliFJl+ndh5rFxrRf4CEtLW21MnJF8AgPTnI7Vyl3Y20dGeFnYLBxl8W360AWBQhlMXLh3cwiLvSS8D8OkR7it9w1iYTRcRDkffho0VJsz+B/mv+ZUV2IxsDXVnEpWglFujiaeEpNgzske7Kjkv+bgIT3/hUbQ/wcFMKFEIBLB10fMBgAnjBhRV7/6QCmFP8qD5N/8EDef/1gphT/Sg+cfbUnOo1M6IIB/m4TGfAZzdO2Cz3wD/PhJNj/soKYU6z/LREo46q6e2qYye8OiEqNfmqR43VS0nf2oyiZc0U8/4sUErX8JlJP+NGscQKHzYRA///Ek2P+1gphT+cQ4P/UQUwKfxUBh/woBR/6YCmFBG+fhRtkHz/nQUwoCRgVkQEP/CWCh/xYCRiQ/rCrMYWv/RAphSCDvQAAv8UHycQD/gQKGFCo4PDncJCO5i/fyJYFCN82kTKW3/xUQvf21ocSH8AGWOS/7kCmFP4fFf/4fFn/9MAoXAjYB2fTPuP/DBF5ozEAXwOSR3+ugphQXZv97BTCk3/AvvOnft4HB69HfjGAFikmoaN9RT1aDPJlBi1+YaQSBUJxLQ2f44ZNYhz+acsVSv9jMuqwCAf3/lq/8OZNYqOV9/4J6GBBTKFwY/n9jTA7OUnOYvPTJLhetmgSP7+83QPOowv19XtAIyEBrav3Np3Rl8izGpLuZLzQZqm+Vop8f9qI7/kCGyHoM3oZqOgv1DZazNakMoHvrf4NFf8NdBT8O8XjkDbXo8np5Fdb+zP/gW4mB495PWhz5HVmrcn0IemnD1piNYfRcpVPrRwvIwo851j2PvC0xOeeh88uQUCiH1JDZfO9DeZwpzMi+ec3QfaYcYeKcA4SN7RH0J1J/wEbgumVGJOj0EOQalCpDjHYkkP8Pm6QMWHy0aC1v5XEpBX85kbfYBhCmPqy7BDiBx9fnZnE+ANggd+ujx97zbaAaWWR+n+Q7brYn9M8uAIrv1vLbs/Kohc3eYUGJmzyTaSEtSUwOQqXptsGCrl9jwdd1/IxlEBuw4FJYhzN7wV0UUtimTE+lMmxIc4dhAitM9pdcPlz1Ce0eP5svq8d1ZvUjknwqLfcqwqzV9Go4OoTEvCszueDA54iBdeRZKXz4ONnNu+07uCRmTCoz6DUCAUgCVO9eFKsstfgLAD4ZEFzMt+j6YhZpEEoxA1Cwt1/ANBoheykrn8BgAjJ/DNHfSSbqLtqFAYDvgKbhOP83xc+Al8lSQ7JFO68fbgw59/wYIwl/xAAiw1EyLdIMUEBgR95DM+2+HRdYIg46tV3zNrilAsOfwN3OfxCAiQw4aweGav8DQ46izKbK0wzUkmOML0C7j2jEH1Alt9OLWHksM8z8gJHbhzL/DKGq7fwUA1/y5BIoH8NI7/8zgQkL5fwLT1Krl0gNnWyAnD6xRwxqmZwoV4thwKD6GccM/+C5YnJNRUAExvrNxOXL3vX/gJ4RYKlmPAGlc0h5I/gZek/iEEmq4j5OsxH4DA0CBtcTn6cbFKOrV5m9lVm0y7fWAC7/yJQmqGqFSVqmxu/gaL2WaEpIGwkQO3VIgEin82gwWYWJMgf2IL09QAc81c77Hs3jGKdeiL48Sv0DnAcDvSc5fAgOG3v8FOUIKXCATFWXn8BgAmLBafv/Ct27S/44MPtAiNju+2XgAUP4aTGnf+CgrD+WL0L/BAt5v/DShJ/MgEIEnNSUn+DARvxd3ycfL/ex+naSMhazCdJHefDXr+eFOUgYaO0/XFdVn8ypBI7f2kQr4Q/pOKK172SyQFr4Hz+KAJErwe2mKxkgMBAAR1vTT4bxkNo7uO7ykRQL8w8c/4oBCSCqi73lByA/hVTU/wUIJL/xEJ6Mwr44QYbQH8Kqz83/xKAiwxoFjdWBFn+CiXzn0YY6ApaaipJpze2NAHNbyceE/gDVhF03VPBm03ZKTrTB+wOyn/dQtyz35vwPNYjNKuoUBi7T5XU6AviB/gNvRDPiEEChMuYQUMl/wgwiR0WUWzd1eZHWxLQuRYtZF3iplCpVNOOAuP8ENC1TxCgAjmQD6f4mBlGoCAgIDAmoXz27yFhgXS6pPpx+Dbk5wH/MTKVk4CAQEBGfXpNUprgelulK9fNG/DG4ro/u4OROKgDR7b/4KdQ3YIH4f4DAl7j+AwATBgxQH5P4WAl/4EKoSo/hkCX/h0CKAAGAgIC01Mzv4YHcEP8WKaJwl/gxTROPz+CgI8K5nD/FytlB/uBTROpntjqTJk5EuRpOGlb0fwbh0lg3y/isAf/xIrZQfw8jRiP8dexipRzidTf48GcEQCbv5bvyhL0C0//iytlB/uRQOM/zAnxrf5WLoOP4mBYP+OAFv/YytlB/0QuXAYP8+JKG9B/DYNf/EQN//oAG8MFkYXvXWkPETQjxKKGNP+/l1xDgdI3klsRz56axysOTRHe4D4mZvHbLZ+PVf5FQiubbxSGQ3SlkuW+r0H5w+LPM/8+Fy4FB/nxoVT/ixlHEgFx/wwFeSCfNSvP8Khl4l0gpZD42Nf4mU3oXYwzK0Fjf9mL3kvyHf5WRcvP+yMImCwf5cRbfD3+Tgx//PSLP56Baf/xZFt8/x8kUHf5mJl+H+Co6AfsM9MSf4Ib4OOPpx3Hu/94MPen8DgP/+Fk2PE//8DhWf7+RbnP+fA4Vn8XjL/8Mkn4lR87eHaSnMXWNz94RbnPQLT/5+vN7T/CyShuytjC8/4kRbmTguMTAyNzEOP8TItzJuXGJmYHJl/h4uXlODkzNTc1MTk1ODE2NzYOTQxODY5MDX/DTD48wnDCZspoZGDMwvCMqrLG8nTMyhqasjEbMLO3pyc1MSEZ/4oBwsDI3MH3//yh7v/u4VWrP8/Gg+P8Ynwf+glPC6l7078/C9v+5wkJWVbdkMn9pHQYCi4ANqYxahRegXiR3/Wynhd/LwD2gyRAxWzgUbtm5DluUDWirocB0Rl5+Ia3zelgylg72YWNkUDZ4jplqcRFDwKQInftk5wMF35oN2Cxu9oR8JsWOH0/4+CBcr0XjKkyYrPY1CtK4b8DYf2FbAe8Ccn+0CCTjsuna0C+cAvsgi5DoNcghIhsjyq8De8bILkvb+BkrgIHv4DSKLf+AwATAP5mSuD1N/a9LytYDe1NSYUf85CWVhXT/Bw3GP/ESkc316TAGlRf4wHoM8T/CBao14R9DTxRjmt2uM6M+e/4CTl+PcFgppNIZN7QMMy2v/BixMeDtPb6MlnL+nEb7DwmATbcsutWFhd5bKIHhUfQjzQRTrc+4p7KEmNGpMsJUqoQyOv3PNHtg3/aWvSH8Dp+kt/OCfp/jpUxJv4QNCJ2JgMFCv+TKmJIbqRvT85Dfxfw2bCLLu3WupXfw8m3uf//JUxJ/6sqYk/++MTX/9MKmJLCs9Zv7Bdx/kASE8/xgy0tH8A+JIZ2hRr/zQ4J0g1m5H/8dMo3tGVgn+SGW2hv/wBMFFuxa6nFCwRJhDQv8nJMgc3c2fxQ7SWIA7NZbhQGAVr+VgPoaJBV0FTvNEjFn79l57Kt/A+Dp/Mz4yS4EIwhyW/hujwxOQfaKitn4qfLr3t/+DgbuD+CW1r+XMPn8z8dDklijCATKZdf8BgAnlBhHNZf52Tbzf/+Dj5n+wE2PNdHiTUM6iGZUs3zhq6wwPSSPSCNhctS9GzdGHPgX7ECDPv4eBX+j+fQV+rpDLuY5We4Hm+O3icMsjZ7rJWkthHYnoEFwLB5T7da9d/4Am3mj6Nw41COTMboXULn5Ko2C2oWOU0aiLX1HXtvlJiXGbYjUrRSfD8MLKztLKvx++oz34bhe6WtWpspmZLh43zupTkcaSXz0QTaH8aN8iEjkividPfKxLIEnoTlx2vKn8C8vjGnf6Lctfmt5CLKeh9bMmd0wq+CjZIp/auIITYFm21EAxtCc+tPXuyVYfmyObsgebtw6Fmnsf3BP6T0lPOVlhCldUcpwKOFadPGtQpYtCzKfAIcNAEfVaPPzvkAkZvuBSlSbdFzI5uapdkWP1hJI0LE+n9H/gJuQYVxQn5QTKweG4P8cD1A9M06WsKFg3QwbWHX+ANEMV3oCSD9qAkgihK0DegkKiRAKw4gFLsvATFbsCMP6b/OYeVeYxQeuvyIqd9przA81iBYJY1J1CEQOPExWFuRUPWwklDk9t2+fm2u9QTEN06ZLwy6WiAqI+QQ3x1Sj3PnSEOl0E39Kz/AQO4ZZB8UWGRn1oKqKXwrYDOxAtEa9v49RoZdMKPNvrCOAqZYmHEkqMmdxVWQxUY0Kp7EyHABRwJ0AP7tDrA96ymAJUULRescDHqHIGH0+cgFAHGBZZieX/gIcbymsGcPIhQyP0sjVD/ASbMAs5+V6RQrifkJmQmRx8Do/eHIFdzzhGlcGVwOwaju+Oek/CN+IuhF9u7tz+fLQ80btZIJSbIpWv5YXs7l63wZ42qRIZ0ZOVhcSgdflvjWhBWHuoH7Cg0IIO4/uJoZ0eKeMVt5cCFvAEr44tuKU80H84+6F5drFLKDQFhYcOl0Bvy3dgNoPptsCf4PzJS4i34n73fmxAXbr46wOqwTXuTw9Pe0+3A32pPlOSc3GS6ozMBCxF9+oVM/M2h1GUbHvQcS/G5uKBYuprjEMu1tC1wDMTB6XAkCi6GLijCdOOkko8iqvtvZNpbE7izxjuZyp8KYSzGFCTl35ti/T7vUb8DG2rgv0VI5NCxSgKFnBf+Aj/m2SZenFIGCTwcpMZU3a5XPB15OFRhRUra+XicF66OEhBR82VVJ9vBBhRdrvk64o9kcM/JMCOVPKR3u8QRa/Ajf0UIZrMKCexIYpEuChMfXI7f8rToTbx3CpwURQAbzgODNwT1vPlWCxnr9aEhgYKZePEP28jVeB6U8hJRomcDS06by2Jtk4VEblRjBXCND0Ima5zUrBrrIzm2sVNYrvcZ39V6kaxZoGVOC5sNbouiUEBIoiYYBdASNI/9FP8BFs/R6ViPOITF7dXQUrXvSc2/q+QUWMzWOXTQluIMCvoUwC1eJLdC+mwyvKIibScI3Ix1tjud3P+AlZG1yczO4VUIuNQogQQpThCA7WWq2ph5pudOre5AuEQTSWXUkpO09Zrys+gF9R0ouWo6uDWxuFdXiWi8rfhfsIL1U9b+nAPxffQTi0ZbCfNCB3/ATHvzCPSBSlsEjKP6xSwqqtJE2Og9kqfOkUU1jeI/S0jPlP+AizeJwwcIiZ1yejMNHqjHaZRqbVnis184AEQiZrKEbSThPl9kyhwO+ws7MP/AQJOGCwvZPNNVFtW+PfVhHFZDxuNjqhpGaJsc8g/fINgM6AimZp4H7/IUcGscKpLSl4D6CBraHMyoe401rssPvTsmvJRCjQOlvW9AZO4iz9dBhkVIaAldYBATNss7b/gJoePBZoIlXBAP852UuwRXE5+Kg0uCqAQx8D4MDfESGFIJ+s10xqAMpqIcfIHLjeiGkKNYsIPaPZVwwVM6nmAt6A+lsfo3bpu8lmbUSjEwhnFeDOEMFBok7kCvOMwNUUr1FL3xtVLjzosavDXEFy0DjiK39jYXSDyu/GezOoBroIF6uiCvv8BC0Gk5flRVBTVCoeaTSPJhPjk8l4yCFXrOwVQO0uxjD+WWvg2lVxRhl8hEUHCwvmCMFsgRnKLFobuo6Tq3dG2CdweVcXW4VMfoZN3MQuXqGJq/wC/XDQBGD4kTJH+nlmeuAJ2nrou78lpTbxqFYxrTsisw1lXBJ/jZay9uf81LWXv+UgWpYshQKo1rPh3yAAwI0WC4pJBF2zuYs6WkW1NsFn8txcf8GAhYf8Qym1MauLQ82LAYEAUYFuTELC4eAijWYEc+c8gt4uTAXh/EoCJDgOzkO+gmtB/jRdaxtf4D7YIL/5C61iCtPb6Ngbu/koPUeVRcjEIO2RlboSeE2rRD2D0L4x0A1WqTYKeH+OEx0FX/2INNEDqAsHCwWqobGa7QNvKJqPB+nPoPtWvVN/W+EA78Kxgi8YVf/+X2+P/tNNEEKms9R/BUMoEcP+QCaw+7G6CSm4gfRQ3yhffJGuYr04/T5xrg2iiDk/vBM9Ei/Yrv2KbuqieK67ZgPQ6Y+PgPg+VwKOZg/zMruGaE+Uu2EhoIFEjlKrEb50SGdPKp2p/nGewYt394JQoWQwunwIl3lzxijkOkB4n8I8KkUeHhOprUsX3/ASz+49vHyq/4CSL0NK1xYRgt+T8JeVBUxo8mMXOuP5mxYMiFTTzoHU9lcbmwg1m6q/DYTRfeT1hBQKvirYXyw92HpfQsXkPaY1suYrhY3Xf+H/kAGHhKGRVXV7r4814l5HWbU9xGk2XQuLhXsJknqiJumvZnS6vPfrjERORmz7pA5HDqflFuD6JYPff0wB5kMVxAvXDfDBS8WyAN6VybaeMaZREdHdXGsbIDeYA+nMUozj7eroOxGIv52z9LDgWrBl1KnAQcvDrjwYDKbADOMlAKnaJY6DxBu5Z6uYCi/bVF8U3ri0c0ebEd4AZorzR8Hru6FTOzQjfUqAuWb1vgvZM2G5vCpMbo4/DySJCN9kDVSuC0xuU/hpB/dv5jtVP8FBbMv8MoP7z/M6IIjGB/iguKL6f8FHaAH+FlUCs9QVCsB1K7A3yWlif9LDWQxgyxHWAgMD/HCNI9Nf/sI9yC0SSotJaldsSaCXYwAkSkkxc+Wcv48+j04HcP4FwknCQUJokwPzxOqj/AQ4QV4K7iKO2vunGZggrDmk1Iz/KiCnwVxzP7OzGqYHTSK3oM5QkBKFAUjVkfxRh+yfDjIeRqQGAfeQw4lmBbHQzuvox8evrCA52qkT/FF3619/4+drT/CyfF4fwdhbfwxd+6POUMsZW/8DYxDpEpSv2ZoNC5y4QYDFg73cJbRx+/NDKU46+n0oxfuCbLFYZ21ajwYHkRXsjNNJTxFVMc1P4X8UBPZBhWO4GUlAfwqoHf4KF5Bv4ivrlFpYYQ6koDAzo0wArMT6Kium4sZIIuQO/VA2Ty/wgclflsIBMLE4fwGACMzBgbo9/4qQ9ih4ytHiXQP9gHJX/8DAM3//jkr/+OOLiHe8PsSK6/jqA/Q6THdIb/cByV//CYU//FYA//lg0WdP4SBI/4s+Cx3aJGJ9Lbb/XRyV//sAI6k/zwF/hvn8C8DW8JjmX5Ui2VLzrNmhVkJfPhH2Tdolpp/kTCML/gJpDMqaiuOTdsPPXkecEw9HoDb2WokOtNlYwr/4Gf3A0InGH/gMAEj/4ZEvTQLG39VY4DA5I0wQo10PxyEV9Aull8R0W9Hthx/gdmX/gkBG/lzgsLP4F3fHv/AQokovbXKQxgsHvzWf4CSpo9V7GF2j3r31SfkjsliYKrq88kCWNKHF7t+RJ2gSHVUriqZnu5h9t3/BirHJ/EAHhDSJn0LY4M8BgB3+Am2UjYK7fAFaeHb902UCUnL3Hi/wUn0zg/8RAJqBSD2gE0fCQGBDe/v59dxb7GfQZoTZ5m97yjVTMH8A5Sb/MoJrD0X8C7jr+0jb2Z9wr9oZhwqPZZZ2nSjn7Yt0gGiTZu99CH8sQ0/0RH9hplU4rDXJqVvrFe+citICwNF/FIRMX1yZ6Yr//DYRN/BYHh/LHEmWLP7yAH8KguvPApDB/EIGUCeh2o9sO7QGB/CcVb/uRan6/gPpvg7ui8ZmP/IqIyvZMElzPLe7KBRyY/dFlnexyQ7gkFeG3P4UA53v/iUDxhsEuCR7pfoDAL5ZfW769294jyKrzZ9/pwiDk+lr+IuBuv4Fjv/qB/1gNOp+cmo722CRHHvYHNn7/8gJqaNkqWYEWjxMCq49pTlLlYc5hB84f8FOl9v+OGmcwUP/2IMSPMwjMI1D19KR1CjsrWEOhI853h/ngWSNA3yOBv+BfOWHD4hL0d1vxC8K78pL0Y/PiHka16VqGAk1CglSv04tjqfIl7YXT0oUvgV1c+2Iw2epgp9zeD+KRDIsTCyd1IJgMA4W5hEJCNP5fjWqKBsMixvFDsyfP4GMmP4IRa7re9KfO9aAwEu0AYdIErhsTQHAxkgPPQGsWaF6/wHDd138yjT0UV/wL8Nxdunc5VH3FYNix2cbupBlBOeMwqNaL+AiqpWgH7iHp1YkCTX3AFOkJQ66lohYgcrZ4uIGGan+Dhq/T+ZR0mEncD/HTO+06wgI3UUp/AYAJNwYD8kx/tALuA/x8zvtTLrES3di96yKWclqeDwpSmz+f4CLABX+KwB/+H1L3/HDfpN/qQd2/HcvuIP8+BdwBn/jwEiLon/ejO+1Tf4OZ32hX+aAkH+Hy+T+/wkH+kwkH/CQQcSRwPGTqfar0tmmWdOjFMqWGhrT4VprnYi9XFppmjwfhA2FvpndPPoz/MstznIIxxHfZr1IxjAXf/EozjDIMrVDy/xP8NjFbj/MZVH/goEKy/hQYF/wUJNkfxEYvsln+YA6a8BgfwIcAyQVqymntYkgip2bg/ouyw204ntroWGVRxayVr0dus/Yzm4UtUE1W0UWf4CObxPYBVI/Sh80FRBB/ikenAuQNq76W/hYdi8P4OGtv4YAsdlblG0FXIDA/gEOs5YiyYEWmFtybFkxh/4AoNNtVM4R26qbJyw2Ii4RAfCVc5VzgZ8x8u+t7hSt76//ARCIlNM+L8YkwIU38UAPPN12PHbiZAYA3Fkni5wxm4JNJ6qVx8ziY/eAPI/wM40f8QgewGg/V7MY3dAYH8CHsk0uzF1AuxEWN9hGk90GhE6jzO2wUqJasqIqq0bTlEPMFToFqpJy1dn4bsmCA7wNhANnzkA7b/g4bdS/mYh/HsC7fNlYuEaeSVq68UKOrwt14Wt/wEvIff4MZSHf4HVnWr/GjZ/JcPX20ZEBgfwIz5s8zsYovf4CQopLuL1wDp+16eWF/kn9KEYuW1QocQKI/p3NyIIrAqHR/ThGIJ3L62XwZDs7287pB/ikJTHaLklqjgYDAmi00hUfRCyKTxnUnL+XnonF7ar0C7/4lBooSgCaVikbRgMD+FAXv+A6nd7/iIQUZHfIsZsOgMBiu1KMtz24R09xsk9vmg2ns+erCL/BwgD8EfwWILf6uCMkBcsUWJgMAAAWnt9KJVW8SFJM8/eoOpMefW21cimLmRbLCR8/3MvJhE4Ngrmv+FAKzx/wY6BwhysjdX8BgAn+MgvBHoUca1fhUgMAD/AAAAAAAAAAAAAAAAAAAAAAAAAAAA=","commitment":"0x80efdbb6bb022aded4c41cf2ab0bc1410169103d995ed653f4376a5b020eff91c0654209e1ea711e88ef1eb689ccce4d","kzgProofContract":"0xa6bd8047fc490a2d551f6afbad3868abbec5973a36733bbc6cd878c29f43c75e6a0b43be61b12fb7f35c0390cb4de9d6","kzgProofSidecar":"0x8ea13a2765d3e7488424089ff6675bd6c85a395a97e66e8a42037e3e7efec691c1ef3fc924fe9053435935bce279d9fe","expectedX":"0x60fa24b0fc61dd51ec0a635f83778a4b0199acdf9b029640d93eea2138b64d67","expectedY":"0x1f0fd649ad202ddaa428553a084923b1934ad131553376b656af493d2419fbc0","snarkHash":"0x074c64f131c74498e22d9d6522c530c39351576e54a75c633c6de204483ab00d","conflationOrder":{"startingBlockNumber":30389026,"upperBoundaries":[30389093,30389147,30389213,30389281,30389364,30389433,30389487,30389504]},"parentStateRootHash":"0x331ffd7f618324b54339706420cb7f4361ee44801dd50fc901c65e013f8109b9","finalStateRootHash":"0x7bf031741eb6f9c15bb8233d183edf8e345e26c971e25d9278bbdb5e48bcca7c","parentDataHash":"0x013073c5f584ec10ad221035689b038577aa0a7b7ac8f0a6ea355c8571a27bd0","expectedShnarf":"0x27bfba217ed5a1e16e628eeb6699a41ec6aa7933aefd13dc6b374b4d4ab45980","prevShnarf":"0xb686e4f81a269c88420e67e8235e63faee8aab7b735520cd112737fd283bff4f","proverVersion":"7.1.0","verifyingKeyShaSum":"0x213d4f5d80ac94278aef1a2ae0f73a0e069d5756b2346147da1aae310fe8bf1a","decompressionProof":"0x012ad46f6c32ab4df2b5a324fdee1f19b7fe4d2475925affe1ef01ac59068d68d5a9837d3dc90b4b2ab348df5d949072003e376304ffbc8d267848b687febe6f4b341e2733e18536fb41f3adf29610b5651129b52f46f95568d14bc0c255b7770089f9ab7121a643287971bff26576949bddbd410e351dbf6a676b0786f0afb777a11532aefba3a1db38a84b4618deda002de464e96f9b319fdbf12c5b943548a1da93ad330534b95b977c4e4cb159ba7bdded55838110eaaaf2ec6592bc759c015ee9f49a414e5ff330d5bdef4b7c990bd06a67913e8b299e5fb1b3b852f4c9dd46561922f57f7629956b8b1f8de1cd007953a82bff8b9b3867f4a8873bfc637ce34a44a82d7ee0bc38c13f65526c0132a17a9eb0fab28002a375d1531d82220198b03382b49bf81cf373127c846e5307529bf75496d0c891ab9d0fe205d4780244e04cf9dd4d40d33d017b654cda5a011b9fab446a8c913b61250389089577e062bd391179f21467be3f0a0e82ffd95a6db615ff85f2fa1ef40068dfe3fbbf00cfd54dd5e604e83c0d036a51970421a942a6f9cf82f65a5b7d55db9ad76aa20640dd2befd37818f8e5e6f0d9a0fe1500e8547112dd1d1b181676dbcd1ddfe39b16bc877e5de61addcde9c27b65c4e235afef8dc32de5c16070c5a7db7643a100f092ff0ba193bf7aa0f0f928828cac3dbf63f8f9e6fa76398818aad2045633fd7a11ce88482b9ea8c0ef9c0688f0690089d0baea2ada9696ee9fd2119d48dae88cbf226fb9a77d0e3392360f3345b7edcd33f11e51f6af830b8649ce498c8800ce88d1b24bac34d53c002ca11b439de8ffa244d8739ebd350343e1e3a884d8b93093ee107db2d437288532ef8813760036fc425e2b17ca673e5c1c58a3b07a6666537b2426c7f6f77adb0525c830b784c6388b56190c00c81af57a16602f1c01567ce2328118a3a934aa65c502567226667b9a66fc587b65b422c3f695b8203dbe24fd5fd7b6615ccd4a0677cdb007014fa543dacdcd526fe85d96d06c457f9c38f0ad5d0f99a5f1dc0e2ad3d669ac73bcbd36e7b9f930414e79fe0bc0b8ae0000000708f49a56d1e5b22c2d5878ac5baba39be00a535aaf8a106716876ce7813e202708c34961cbcdbe52ac982a4d57983dbab671071dbbf5ff1ffe111ddfda93634b1022b19844e9402a30ec55c48e6df82b1da3500152f0e7406473275f52cfeea50db714b4c7580e9c2d9a47b673774c16fd2284f55b885932070d368531e59f6c0d705529bb6b916a5e4e332f04e622ba456c702bca124cd062db377b512b4e5607fc8271ea9d01211c6d1d9b9606f4a2efeee2e0260007c68b426505f8d0670b01d1011cd997f2eee5dd46bf38dd139e7777d2c64cf4ce10de77502e2649acef008a89994a96d14e139360f5be8b602c7d6940d8fa08b87ba159e5e553bfe7e552915daa71ee8df46394e99900cf521b0164adcbfe382cee152e4d9c6be754f8434b047008a96df621efac7e160c49249f71bb0c7ed49fd8163d0cc3ed7db48001a3b7a6801459580b605103510b454337e7029d4b8096ee076eb28e356d88e50000000100e77491e61d8102e3fd98d6c1ef7920fc4d96d5087bef3c08046d08ff49ea69c8713d6477e843f59103bf31e0c882a500d9e6e690538e82ae634d811c27fd85e0ac3ead9c8939c8450871e6297d9a91b470eb67ce8f430c8455c950458c1963","debug":{"publicInput":"0x10b752b5efbe9fecf32871acc16b7aaa2e91452e8b7eea9bfdb147fdc620cd71"}} diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389505-30390023-response.json b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389505-30390023-response.json new file mode 100644 index 00000000000..beb91c313f1 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389505-30390023-response.json @@ -0,0 +1 @@ +{"eip4844Enabled":true,"dataHash":"0x01575f7528b40c5c3181e4cac8c76d85e9f4e7f95f51e9235bfd3e4812c911ee","compressedData":"P/+DR4rvuKHEmsLIrjjTTOIiVEZYCKl5ZW7uf+DuIlo+qeAAYBBWQBVM0BW1sBfmYA4HgAHwYAAQAABmnt9KXQwvaAv2XOPxdTXACLwWmsNlnIkNn2ZrA1P8ABulBuPyK69Z48jrIU8fFB5FEufwAAIPP+AgyOhyiJvYna7/BAYmtgMIHKyN1fwGACeX+CAgTTGGVvtriPTh+2N8KD8OdDCf4AAP54chwEPz5CztgmAwFGnstIf0S5iy/8BBfPatGm6jBo17AAd/ggP9iAv4hARYbrscTrOd6AwOdTElP5cCrWXN2cfR7WUlWTe7Wsv8FAxyzn/BQCLdkP+LgIKn/A8E0h/wmA2f46B2H4eowFeKrIulCcZJnfwnJtBSNyHrO8paQL4dYLnCAWEDQidiIQRmG5OgwEm9pRZkW2oJdLQuwb+HjXHIiCb2Yz7MqGhL5LIqOsuERJKQz+DAGv+EwBWcpfy5ggsSSrv76rY1qecaJAqNWdHsWo8oMA/GuakiPfwP8VBL1xfBglXbfwGBFx0GAqutKNBgajEcbOH7Q705U9UroW9RR2r5wFyAghaoCWP/noBIFAKD/KwdJY5giz8hD4+LjMo2YzJnsBBYYMHH+GAbUZ25dIDMmjwU8votJau9ANtm+m05v+KwF/+EQCsQzP4dAPxz/BQYDGtDONYz4i1tbjUpDcZbutE8gE25H8hApBBQZCAVcy0kGEuvTz4oCST4YDo1KUQAC5B+RzbqwLqbBKpPLhZJ/x4v0r1hCioY9Rg8CAsDlPtBVmP4AC80hyJMv/PQOlp/hoKD1/gTAsf4dBKiP4/h4BfYP8fBDlLEzMbEasLKYHJmyHLKbhkzMpoYmLCyHH+VA9Sv/SQIgT/GIQV/kABTQ/wwBkWbBbgGdp8kNh4n4Y/gYdmshw0pa/fxWAP/yaAF/wKEzf4dguyj/IgW95CB0VzP/+fAIHX+DAGPAbrvv2UAyuFQkxUZBuKjNe+uJkv+Gwb9UC+T/KQcct/jwDzN/ykB52/wLAL9SXIcb4WSAP+wbT8hm9tD0SGu1/jYEeg/gAcWiFw/j8Gf/h8IDZP8HBoRP+aAhy7+EgKP+ExNH+OQQP+BBBj/gwB/wQ7pAx8qC4R2aoa1Wo2Q81eRD7T+PQSMCRLoPyFl/gwDDclmJOvfNvJ/YBsa6YdItWB1WZ/8fB9M/8kYKH/GIJGSE38Vgwf+YgVI7+QBSfHAZ3Jg5ScAAP/BQBgq/4mDep/8TA954J/igHvk/hMA9/hUIYC7joQBokKPPGQKqPpKbQAyUyump5K8djQaZ95NS4L4wD/CgTGSFH8BwLL/Iosz/gwD1Iae30qZDFHffcVcnu7+gDB6YqEn0yoR1nw1Awo+fivPyFC+pAzSsSSqmcztx5M9N34kFrK1KzLudf4FKp2/+ZiqeFH8AABGnt9K021/MMgK69ZtvMY1sCUSXvq5Cd3u1xhojVSL29f4ABVK9rEQlt0axxIo0DVePj8Mv53qcciuUC/4xL3focRDZ7DZ10v9l2nvAVDNAvGt+Pv4pMI6xR2cYPG4DAAVKqHNpIZmN/fXyZkc5p0MAs3YX/gQwjA/4iMI5G68QmhrP4bMI3T+CgEWa5/8XAIoH/HoL2An19hHQBgAAE09vpYxers2+VAPKaa7921pOauPw55SSbDf51cI120RfKsSr+/wyETOH8xgVRGE5yoAH8Kmqf8BmzDr/EQLolS6h5iTfAfwKB32ozEOwpEVsAkIciJI6kvMGFkVu7TBWDbpZgdQW9OTXmwo0pIxrahvF11mjmastXz7+SyhXN38CHJIJ/EhyT0bh8T60BgfwoErn8BnR7j/ER0eEDxofoGo/QH8ChjtwXczoxFWPqvJNS01QHz2HV/RFey95rRRhgUINpGKaSNrkwZYA7TeIJ+FLk8rWmkfBSl+Xy5/FJ33p+GWR4TXAYD1LHqR4qSLSHgP+uDIhoi+hOAEnv8CHoIP/EZ6DmER+ZW0nAYAvdGXHSqIgnHtnA7Ga2gSy760qwf8CiKjx/M535N4HqeCm1GMG3WTMmb3mY2g92/BYauH+DgobUrwYQCRo7EhAJGjyj8wQEIxf5KARdP8hAVPv8ZHPH8WFhv+YA7J7+Di00BKCCNMPM4PcIS7neH01DTm4a7Sn+PC40ParX+BREK85t7eLeU4zdRco8utK/j0m8qYmrenq81egeoyX9mrm5v7DKcPibknVrvnE1jc3jx600wSwV/ilGcIIX3G8GagMDpdPjT2bOhmgZi6OzcL5FBzUTLMX/BwFb+lfzGJFEFrhOABPwKNY4G3yoLW8Q+8d4TUnbCDGFzaG9KCx842apY8CwkLOTWmKOIGC3ATR+vWUhAxkWeOfE+/qpUJfwIkkgv8RAlhESycldHb/DYaY8/zGB3rp97zAB/CoLt/goKGAS/4I01stdNHge0mAwJl1iJbuxe9ZFLOS1PB4UpTZ/P4ACs5j+GschAJxYVWDAWVs/ycHa6/4wEEG/8fAQJH8CENWCV2TZousjFHEg1mqWmw6RjwPUfwAoWnQpwvhaRvk1VzP7R9jM+HjWWZ7k9GJvQRz2LL2f6qmAxv8UqJCFJ74QgEHAYFN3qORJwyr/AAdAF1Sb7rcKWZXQgksv/FAyIS+iIHH2+if8LCpn8Bqaj1/ESmoBH1sQhIegwGBOeG6+uYDMcAlY3g856LodzPZahB/hAKbD+f62FDf4xN9/4NQW0MrhU7KTkGahJRPJaN/eiRiz4BIf48Qm7ReRcAAA2ntzTI1mjLU5nEAqQXHThCZV7+AAl0etocRcS6gBrJ7VyQDmIWKdw0OlCL81424XSWhnAtOH9n/O5YFv4HEx07+aQTHRQBuF+whKmyEU/8BBBCaekKu7GYUGZ/wOELFL//fyOSfyMOSAJqnwOBqgsuMm1MG7oDAfUG8F9AE+0uuCvxNsEBEsIBG1jDQYHt7sptnRyUk0lA+KvSxsiUSML5DxgUPBAXGI+qLRo/w8nY9X8Np6LT3CHH/jgEQo9eEAxA4yS+hqZr+ACEswBCLSmDMDnq7wBXTgdXrdNTijm827PFMJH/D5f1sMtflJpDieUQUvGXLSKPDDCjJNn/Q78sih0N/h5RRHP4JFe8zXt1CSLzvkYf8BAT7xRj5VmS3YSaiiyfeOZh3poZhhp9jgVvSNIQZbJPsB7ZAmlEaZXsY1soPkfxiK98fSkPF92zr2f2etVScyUacZb2cj/FI70Aex02XbAYFe7nXUKFLl7LIl6vZ2o4J6d+tI+fwibQfy4eOBlgfwoIU/4KELtE/+IjPchF951UDb+BjPfP/wEIo0QpH649EK3Vh77hmxuwNWlh39e69nDpwfay4XRYpAPv/AAUm2/yuF6lKu3ro1sMVbLRb2ZdTgXh/EoJtDgJa7d+yBQNAYGk2QAB2Q7xvKSuYLk1m68+LfDwzgXyA/4ICoMwMnHKEAlfjaP4DABHGDAmNalG16xdIyZsn+ABSOhZGO4HcAW/4AKz23lMaeAuQGkRV7kqv4+Z9ID+PFmkk3Rr+D5apIP47ZSQGtRo3+PgP9g/j4Df/iBPB/gUbt00O1D1NvUvcELSb8GIFW4fUqa0D7305cVtn5Xf+ADJ6dM0USUt3br4PcK6p20zsb51ZgpNGgMyU/gYOND8VOCv4DABBC/hgLBh4IRi3N9gMC/GXN/gCo98X2/fN+14n4a6q7pvH/Bwf/EHKyN1/wGGGef8uJVUQ6fwqIJeH8DoQf8DAQc/DBMaF6e9aahR/Ax+lrudqtSi3P5rqmQuqTYXg/xX8ABBLPE6ROxt4BCnUKlBimYiRTRgyw9JNSoDxcg/4NpNJyXflE+t/Az2d/EAJrDgKFwlikpFkBgK/7atqPGUOP0UV68Y3F5jpmFXEQfwM9nfxCC9wViPv+zcfwsXb1Y7CAExEczBgnsmf5ICWBf4yOvx3L7v4MZyf4TMP/4kVDVKCO0zkIuuvexMSi0F1Pfovxzj/wEG4e/wOgGuX97Ff+M4YwUV1SZG01JTY8SsOZSdM4Va38eCeMX4EP8CqKG2bhmultoU/4CDzgQHPWw2hb4NNC71yQmaFc1HuIqqKuTELK7+GQ8xz/goKW/llRQ+Dl8AMDFGS+2Lp0QzFLBIk9EN09MwbeWeP+BVvx2/goMb/lwRRbv4FW/N8X5oo0GKzy/U0jzDIkoxu8fpfiRKxv1uSCxcqhfmhwUDQhhw6SKMOZRaZts4zc08QCMxueb+KhAvk4QOnHIDAkRIdfpOemAEC1hG1M+Smvg640jb/gZa2/iEQLk3SQrElY/hYWH8PyBFrX+IRJYHiduuNVSDgMBvM0XRt9hkakUAENd3sx0uPsD9SH+CDpC5/v9Oe/kZOeAh13FsD+BGgjiqyoHYQ7knszKdgUyzwdRZ4e+5Wwk1lEdvlzWsVilLi6/hkMLd/yYw5A6UkU8AMAABWnt9OYOymEDwhVd16sgrI7W4aK78CGrwJjLfA6RLEGAH3Dte4r6ahayY8Cf7aXqQNsb2rpUxtryEC7/4lHF4aXkT9+V92AwKeiqgks1+kGSCDA7JJCJn/AQQeQBsXh/FI64Bd8XXY7VAYFkQUKT6B7UBrOPvLOnrM74I/4UQP8Igmv8uCIL1/ComN4fwIu2/xCBlARsz7Xcr9NAYGIqCRynWkwFnA2nbxe2Bu4rqGsB/4OCObhZUYQIF8Zw/gMAEe/8aAi3AISf5OBFuP8YAqGR+6gavBvrkKGynsth/AA9B7Hf5KMCvp+mBcTYml8mNMoWYrAYJpX8fPzP8PSHoGgf4ICMT/84Bi3QCwfx9FMwAB/PoAAIH8fMd4B/G5pmYBFWZvpTMfwdHcxeztf4UEliB+308I/x8FfibN50dLHt5zHSRKuMtKWsdoJ/ADJDpGlF1Kscw6QwfwAyVvw2/YAOLl8VOnKRXU8i0APZuTjiXFw/MAC8Ig1aZRknoWfSPWXqBm/IXUnoae8ycSle3h63FEjZJPKjkIUbueGPOu8b/jd0NpAIMOsdm4cxfZTHNMOsYleECJvpNNxwMQw8421XliEms8zypBq/jS89d9AN+UZTodz+Ag6Uy+ErADdxRL/wEFznPViLIT2UlK4F0pWmMhdbYA0zG9zCe0TYYZl6LCDxlKvhP8Pgb4BDYiPvZfHn8JaIvlg4JihcE2pe2KclOZ1vld0v3k7np41DjxO17LD04ojWTfSWx1bHkJ3AWDrr1xbFj7v6x5CpHgRG5eRZSstPmEKw+qBznA2Y90gqXsJHU0XU7+ot/ADScCsu7Y7AOKeYtltHtQTmX5LgmntqhrJKytYeVRaJPQ4iBO792lSO3wjuQQUpMIjyrbahdirYWmU+UHbQnuC5mmE0ltvEMTjovAb2NQvKgfAJ8mwCSL7t9QuOy/UFCouKZ2ffwKqQ9g8cmma3+yCVTfhaG/ziggM6esJpHfz+jFoZxKFlr5EB0wzIwSQtv2uz3ICl3VzfJdFl8TBH8UEVaeEMc294cBgNlHyDB6D8o6c76hnpQdT2vUfTaCfwO5E/zKTwwiV/DQxIfwWLQTWz/KzViVAvkAfwM/z+FGriKb1raurRXBI/pHzBRsFXxLsrXVjHuWCMyx9meWjkYRWFs69Qg7h3F1R35Fj6hQ9Ry2f8UDrWKlY7bFan8LDinh/AjVh/EI60DfkM6wldJP8DP1HpBosMPBp9yWW/bQd1u+xiTvGLQLbthAhGrSNV0WmRMc5+HKi8JOhQXK/ifuU04NHmnopFbH8CEAWEfxEESolp8Dvwe/w2ESvP8xmRhrVFrp/AxH38CkhY6jYqqqxlAK3Ki0wuFjT1HTrLRb/ADvHKAR/rSXorHBQWA5XVGNiPd+npaYoQGUyECGazhd/Aqga68sIBfXhHfMCAAkEw1EpHQNpTWjGiZT3/ADxApZbJocpD/AJsWdekDA+yrmrXPQGB/CYA4BfIIYQXOECn8BgIMJ/AgCNBgPR+sofeexdHHXwxtiZ52CEemimcAbs10BcggI490rgfxPDZWB/Hqa6AoH8Vpz38JgeX81tNn8Igb0Ukfy8ovCZohTWA6R8CAXf7sYm7GEy50TpeZSDUtIYukzOWPWBkYtyboMnlf2IA7w6EWLrJQ2MMJUaU0ZHkHqKONZL9/nlwfLat9IlaTQN/Fi5fa4vIkDD4Pgs+dzlNDKhuve5ubRLMjIVWhklGK0EGGmHUdysMzhgcZrlXrAFVLRcXWU4n++Cnyrx6g5wvFQu5yuvrZMLgXloHupMozkMWPrb4YQa2MOwPS+m1iayLs78eOEAXXVLUwxtnSupCp1sBdyAa++4UkAubYnTfXd2ZlA+EvrpzPSIzzXvFfbX5GbqL+CvVZwjoOZmzsGCZaytnHjZ/DZuZzQFAnWBIT+jvO5V/OeRbZLnVGqRIAGbyH3FtYAyvzkFtGnF6wrmzKYN9NggckfUkMeLkPqJ+nv+Uj+r8BVCOVHz8YnQ53dqnlZZRtwzJwdQml1x5GygL7GvF510f6n2pz+xVi4KBJefM/fp8dCQP5PLHmcJUIg8LvcXGS6EFRCtDYByegGMkOsJneVpSSJZUbulcLJECgqgbVy0Vs6yhW13EARgViugApJSq/cH5ldyUOUwIqtw3FAlFdHLE+2RZVneq9Vkd4LG9UsPY+ng3AE84orMJ1F0wuZ+PqNIUWod/BY5sBNIQBaTempkKasgUSfYpWJqAZj2QksLBGGztpULv7ch7tCwOI0ZPGD4OLDDLEojNojwuIBP2Fn8AJdV+/nnZb7pj/ACXkaZGVqT/UGmCQif8AJfobckimq3U0XWbMhq481+75jEeC12JR8zUAd6gNW6FMTeERSHm6jpU6VITpxunbDS//AETOcAsHRga1MOG0fwBE4ijLE1FG21vZofwqIj/wJQfCv8EiI0FbCJKfwmGe/wkOv8aPFhdjShCKz4YyEshvJJKRfDLJ/DoWHvOXgcpbptDGSNuXObzcalCR5KLSnto//No/JEMP8CpDT3/wI/TfxCNEWYLr7JpC/wNNOfY2MuAwLcM+bTrtGFj/ALMe50mtMe+r7F9vXWH2ldXVIl21kRjZbVPkzoIF0ue17gT9fl0TIDn/FJ10p496o4FnAYGQmKER627BBS6U3GFXYV94HNvMT4F4fxKvJw4CA+fJSnvn/A005/gIEII7ZgDg2WqWMGYpEssj9gzQFDUI0v9YhfHmU0NdNx4zsvAdU54dz3QvzVzbFUxrmt0hPqlT5BI6CBc7/BQP8ApiWgSk97I0Yvm9y7cr6pAqY0mTSjzTwmwFyCMgiUSvPBlqQVmDRiWURRZo/FmwnceZ9753ZW4n8emqQCgf58Akj2NAQbudyKJy4fjHS5kWMb7XcyHs7su0H8IDWIuSaDrMhp9/4DBJP8HB+5IAu0UKq5jPyx2bLVQbJFSySQvcLmc51bBgJqE7Wo2lJxoYMhwwmzKYMLIMBZ1eDW+rbBc5Kckg8YBM9DHKP8PS9rT2+n+Lg0Fr+HDdIOjOMdtI5TkRhnSTZ2t/F8io024IFsrJc2DHqsCJExf4bHpJ/lW/OPzrEE/j4G//h8dGgP49zjwBBRgk2/e5R3RxjRIPL+cN5CI7v/YnUQzpoADmV/dSWq4icWkH6wf5k08Yl1bAIj1b/TRWaE7UdtxStvO84PcYz6p/j4/SQdwYkY+1mpn9KHLo5vcJPdhcMu25nh2ZMLVwHmEmjVmRElONiF1igIc1hLo+0V9Vnxz3LapJY39b+gj2fjYlKFdP4/Af8iPDezVimycVdWoZ3HFu3hf+WYtgdG5sNrKjc2jAqMeXI/LjCaT8AUvuNQFsBPQOEkm9hloMxGSkzg6Op78hbM/4/Ac+WuWvOHVTLMTrLo2VLxHCez/bPHSeAk5oQ0CRybV/NRsq4wDPD1L4CvVCYY4Nm9oByQrEYAI1hS8VTNMDdt8BP0B/iO8w7Y7DKF/hIVAEJN3BERi4mX/jSQT5A06CC24/h8SkAyT+pBKTm0dxqYek1h4LpV0reqgC+OhLgjpvGT3CCT/f9XVfacP+BhKQF/gQt6SWuiCSQVJWIEmXd2rSOi0j9v6XZgDDUjoamDrfNpNo1ZjMxMzJkMmFj/moSkE2BegDbRzT+AQDwKEs3HOFnWDgKpQ0xg6Ig/erCtjP4TEzcrAaQA/l7KNQP8+BW9SDQSjsUZxx/AKRTKzJ+b2aMdgSV3dnlL+y0NusQ5tV/AAJeU4mwLME3sxvWkkjALI14Yp/o7kQ9tTdNFDIuQ7fKqxjRDK5/H4bI0Dbl9mumBI6KIK+HHa2c8K91qZ59+fBMvbHu9kIPPNImROUXyHvFZVf3fnwZxniB1R0XLtPEPFokzlW65m+7NB/EmUrpsUwCQjwW5iXsdP3kM0siinFMLJ/g4MB+/mJv3J7HmHADzhoow/mQ/v/gXDN/gOgn9QMuTLn88TH+ALcNF8/WH7dGy3SgpzkAbnkMz8Vlm0nuWguz32pcc79NDoK1t9+1hs5FbkC3sAXu/4JYpgv+A0OQT/hkmlJ0P0/wEIGBV8BgT8KAuvh/A7nP/D4CLZOQgfRjBAfwqWz+H8DlcX8PARYDqypN3sMLwH8Cge4OLAfibVg0FM/NLo6MGKWYmGt2a8r/gIPBLRV4GfXBSIN3slxcBzxM1/COGvZ6b+wA12HLBP4AEMTWg/gmIx/h0D3hrHJMxN+tIDA/xsCOVAkc7QgCN1BxfwGACdMGA/lh/kIEcq/jhcTpLH/KwKah/D8HH8xnCB/5QBtQv4xwYwSoF/4cXkgSn58xb/isEf/hEl+0Q2/h2jfGP4+3QwH/IQWdP4Je3NPHi0+PntBo+IdRRKtP2/QX/BER+ff3taflGoSZUIPp46Ea7akj50RC8HkKsA/x4lpNnJafwJwGeoXUFnMjUop8pg9WsUoiC8nYyUyHRXgy8MOJlrg36VMGrK/F1gy8w9pbuOhVBSF1+8oPp11Ib/A3AYIrISJ/wGACDf8NCi8C4X+2wwGA9OQGRRfrosolgpE2+FnM5POjLrv/A+8D/BICNuf5QCCHv4xYHokMwCF/JSf8DYkoba7vm1rVX+w6BeU3v4H1Lf4JAy/5Yy0AlXdo4AMBUMFMuj1FOb/JCF3KCrjNC5RD03f8GA0r1IQgEwoBb/AYAIbQYCmiv8Lo5X8H40ULLohSzvhU4DXoyFwXmIxzoYPBN3+QgZVv+AwfwOGvb+ANNSc+CLgtERhO7NCERQDwBxuG+ZtS7WpfQ9hN0gIVf+ACj2888Upd0oK8xodLsXaPITEapspAL5BP8EBhE41w/4UBkYwehIJRgbdJKkCuTEA2KXA6ONJIUcqh5+T+A0FfeXguaAArV3IElWPd8me/aJ1pWTF3qrC7+dJIBpTcYRnzx0AT/x0H+cEEy/x6T5f4+E+hAOAAQEBAAEB/0QHIhfyk9FAKB/HryFAMB/S70UI+26ghrj6/w9uNJmt/Pz9UDck9i9K/Be/IKdZrTjrIGvXsXwVMZnLbbGLhfWpH2caxpXT/AHA7WEO7tmEDpgfnGazOpgAWQtSxjnJzwK7YKWbGFvW03H8Aj4jmjTkSbHKAvXcmj3tUlD2psQtbFpsWMBeALCWj7RSHmvlNM9Fdn0QpY+lltesDd68lsspDtM4D3Qf8gb6M3s1QOr1Qnnj3rezI9ybMnq5RgHM3HElUHDyAZp3BB5G0BJ4DG6OqAfvuXmimaGbifud+4rhldiKILwVmpcWitfwPvdSs0K0H+F1yWVhNSa9ktXoceFu2N3U6OWoWg3lX4Aq+URHplLcYMHFSb0hmpYOy1KVX2eo+uGAk8UObfdTfxDi2h8n6SgO/e1VMC+HzpXcfKJ+zskaeIyyo4XkjXYLJTpfqgyTfU5o16p2dsN6npw2vXrCD1A/wAdNEhxERvAMyjWGJRQHGgpD4Cgxy+tXK/Z+t+Ba9klzezAh4DZKwSYPhTNBifEV2x17Jo5h+5KLr4nYbIm+nKkCuqytP/gIBgyKXrG6DHDlzAu+eOHQvoUM/+AYsrzfa3IpAzZe4HQv4E+szL+IhqYEQkdm/gDnd3j3+Hysd3hASFI9T+AwAQ24MBhzT+WiscAOP4rKx4AbPH8jc3vF8k1uraQWhTIRJO5Hv4rAH/4fDXFXTmDJvmQWf93VIUkaZkCezGXlV/wrAmgX8FqANrx14tYxH8PKE30o6/8Vgj/+IgQ8/D5f4Tiscs/mArH/hyl2uv72KxwUrQQ9egzPlhVKvn6D+AQ3BDlccY1j+PHZpXai/AAAZp7fUSVWvvSPrxcDqLFP3sIfOJUfkhsyKD9dFcvLf7Fb62iXW/xcEHfU/D/hIJibXji220WYMy1GAQXBtgGj3vDQ6w8/xADXBfxpMmfxBOGAAAAP+kA7MTAf8Tdl65WtTLjexUAlU00AACXJwEGAwcAQgJAAQFAv40HS/4fIAQr+HG7XUnP4cDsIjabUf+PABz3Ivc/lwAf3pzw/jwAf4PvG/hxzV4VsK/h9m5Ysj+AFHK+tdEjWEph2Ul0vXsqMGt/gBXaifxDhtjCAPPEfkqY2VNGQMWbQeMmsbkqkHX5C0Hij0slFM7Wd7Sn39wYkly5pOIh8horD1AT2BXp4MyaJ9eeL0CtWjiHzq9Barf0zJXbqBFlNxyZ66HzDd/MibuF3k8hAA8ZXs8Urg7wSTX03UP+H2ll1A+tZlr0pL5NFDANCE39+7DvUabo5QcyOpz7g/8BBX1mxhskPNlVTKY4UN6x89E/l/gIOPhcAe/AKKlB00UjbdHv4A6NAiZqW0VTzYJ3rJz89WO9lUi0dtg48BE1Flz+zVKsMneZjVZjTOfz06m3f7xK0FHb+xkxfpQ2yrALj/ZOAWyZUp8Z+p/4E+dJ7l4HyAlLIPzKH1WThIX/Vv/IluP4pOrAJuLSgNIv4WOrP8FAif38QAIsMCdELAlRBwH8KnNn8Cyzf8QgRQX7Jjp8wW3AYDaqrQfsl4jOyrWOWMDis9ySrE/1v8Cfj6l/BErqMQTE4SnNL2PeuBYbaZH38KlvAWWaQYN4+kOC4WPGGMnnwH8KnOfyBf4IBBjelhAJL1x7+AwAQngwefOJT+Ezk3/BAWUPJqyZ/LT49tPejOn8QMDAZ/PbCx/D4/oAkH8eMbAOFfw0LM2X7Dgt1fn8fe7f8PjMCF/CsryA/wr7zAv+UAx8L+R7RAB84fBH/4FX6hYHmMvuJXt9LbHjV/lk1v8BAnw4vcxFcwjFJTyyKukpYK3zl5BpiTjLJi0hJ2dOmuB+L6Gn+Ydx/AN29/xEISjG1ZFttvwGBAh+POz+pqmUzNGw0d17vZakU/WmBfIFdfwJ4YE8IBOHKm/wGACK0GBBo9/ioSeeHBBAhW5wAC5AoX/Bwk89/Kl1b/AwDN/HzZqHf4ICgQP8LAZJD/FqAD/BjVr/iYFcF/isAf/jKjD/jlBBG9IjqqfMv46pIj7eaxHy/z4EkAgH+IAWBz+Ewn9gQ0Otu5kvIIxH9nUdbFZXGKZGr7mh53J91OR3BFYY4n+HQgEL0cAYiHnrWEmbgwjRqAJub0ooMb+CF3oq/v/Yp/kbYpAyR9T8+BV3oalCJU5DntEj79Yj9LaPY+T0UJZh0kGfe8YogXTgGiRbScDpCf8rpJZRgCml+2pdetBnqsUAC7/4lIzYbGBwTKhV5gMB4Os/MI2VLvoE96bV8FoQD0glOFv4E1fgD/iMkS+u/LewT1gMCReytB6UKRgtDVwvsiXl33sTrnD/+B3k3+Ylzr+BdrQMclOK7uOIuSeD7Q3oKKKm/GKD7P+al7/+Cnk0e5VP8A5EOCQERl4bikBOfkPVm+lwhhFw97mxctbULztf/hQEv/gV4H/iEqIAhk3mPTKGYD+FQPA/icpDAr9kKkhjjYDAk4vDlSQFzoUhGnLKFSPhSI9Y7FE/wYKLeNOjCBfXhAEKAgyFWAEF5CsoLxPACGxGUFq45j+QnrgeNsXX3YJ/CdR78OwOu/WqpX9XzLuXA6Rz/gCBCSgZV/haXyi4/ZvfR5u1j+BeHP1IsNQHTxhRLx85VBwxVA9cU9KVUwhl+RLThPScOBo31QL8eeJ3lQjJ8+uI3I4eohzWKV+HsPv4H3HU7o/yQD5rfyHuOuwCKB7acIw+e/QU5Ef4EiHE6rquXLbDCAo+4KJFzGfBAWr/L+46AID/HY2p/f+47/KG47JyH+FA2ogQ9of+H/C4Gf9V7spAPgVeofCO5IPGLS6BYMhQMesluvSRmPy1GJKO/TcL5wzho41iGGDyuaIx+agtpMfVIyeW7EwlM4WN7jKazTUFLAwe1Un1YOraxbVdD3qGlLQnGTDBOv8gS65tKaoYe+NIns7kGVpNITufvaLSyph3/AQSKV80kI+jnPHjD40uIPTSvshrNRllc3POsV6q/45cb0wwAmADRWOG89311AJ99J9Jjlx+qZFS1sftC+ASId5HhRr7dpkQ3dz3i1es+/w8vL4MUsMpWjFHhVJFoXsRUT5QoJkMvymj5ktTRCEO8kBHo/er9Moh6I/WLZ1aCJPpNx+BUdJz8f3jwkFQHPqZAJihOKteHXUdIUg52Os6jukZpq4PMxGXI4XUUbjmTiAIN404q6bb19KEG0AxVS+nn28Clgr6ITFSj8pK+CTu8QyGICvcYlUhvE+YdWn5OoU8o6THfgna5jQFFmAh1qoagJgpGTOp/CaXT90bQ0CXRVZFCoW570vpAD8dxvnnciP4l+x3nrgM3HKrIzgQSDEFo/RRnKFIGsSd37/FA69bJRh9FyUQGB/AE6+uoy4qeZwpeEWVuCrnl8+GLaAt/xSYlPQGtFADFkBgQtQ73IqTmIVyANxcRXHxEZGd1JJ/FJkgjUvA11I4wGB9vAf5ZFodzyaCfbXkOWoBK2+7r4F8hPzggVHaC/8BCQiP8EIGEGCqCZ/iwKmTCQT/BwVMn/E+YGx4haNFg2N+kKWy/Du8QT++iJ/Mt/Hzdb/D03B/j4OwKwj/JwR9f/AtknWxmR/DYAf/GTob/D5U+gvEpPFaOPTpVLtGEujzlGwhyeHBwwmcWY+1jBrAS2smsLbcVgxM4HMjcFoo2ko92uFa5p90gwBEZU4h1rYMmYW3Df8lXAn8CYRp/Hz0bAfwaQAIt9/BlwHn6el/DoL+7H38Km6P8BjNmD0JAD0JB/lAJ2r/gOTyWHj/GATwv/CtZn8z/+egpIqAAFh9ZjN/wgFuCxs4CADU5uc0dAvpj7GF1iHqFuP8LyVPgfw+Z/wH8IgrH8lhf/8ebpv8BgP53Ot38D+zgCCl6yjmMavRHW1Q/CzZQDc/Ng/HJ/FZF1/GQk//BiUP/ImBeY/8/AxG0B/HqF0wfyXCG/w+eP/xjzGgLB/FaQV84+Bldv4fyRf4vEv/4fC3wKqCNj+PcsUC/ywDKJfxkOT4D/HYG//MUaTT/5IE96TQ0LjkzNTAx/xIJGp/wGANGEcHP+GhI27+BABc5MjI0Mjki/wsDermrIzGLIxnDK2oW2ISybtbQdGLGwmTGymRkbt6Uimze7uX+Hgb5w3NzIwDI0MTV9/hpULAmnt9SWmWjY+/gDE/WOlAWtx6PU+b6MWIX8QpGS2YXPr5GVXyf4ASROmEHN7dHqCJl/3kXJnAQByGuqgkNt6/wcC/V/wS0Ufy3yHQe3gVaqRzJVAn87yhcUIGXLniAecy1r/wijM/y4BCav8Cgl5SWwo9FqY6uLednrAq1u8S/wBk9agrSAxz+ta60ufRzHTO/IuklzAMR4QM0IxzsE08S3d+aipfMP8UFuEL85NWDGsBgO/y1siiJMNzmXN9SMCndwsJ9tmrfxQAieefZgBSjfwLymepbAEHDR3U24y/fyMvRqH6ZK53zmz5mSIPC5indpyUpRXYajgH8QqQnOtOitdQbhCPVby6Uv+Cg8gcG/iNxNFFo+hEm+IDA/hkIlcf5pTOv4ZCJf4LSkP9eAhjP+KgUNsehACY6U6/ygHNw/xleb/wfeL/5wBQ2/4EtMTGBwK8tdQM1lmjmlEhQkef0QZwaW2nE2Io94Z62X1XGUGebJDjEXSsMQ1fYR6tGnZxGwaTqH+CQuYU/gN5DHP4YePvKzA86D8aAwNPMdWmAMInM7X6gjuCkIp87OFvV/gX3vNP4KAQXcYP5YtMBIzk/4VEW/4D8/3f+IQMmHAwNkfE13n4DAyOSTQf62OUEJ08YjfHWetJX9ShP54IDlIQE/gDacz/8BAUpn/CYBB/gwSOS/gQWINdGTf26t9KKX2OczIONdts6eZ9KDnvATk1tN/aphNgKXRjh6vsGsxeJTjbKbl8GgQimL6AZ/4Jb0P4eApwHGiribbLL+Fg6j+A/e1y/iMDtAKFUcGODP4GEWjicFOYN3rEwKwAbpb75waubK3G1W5cjzREtNHq9pn165GP84JN7XUWW5br0sehaWOZoVkGyfySgiM1/goJpuy/+WBfoIB7PAgNEVo0EpU/4AZU1D+AbJT3DnKyFdcbtQ4/gpeU/gcCE/loZ/hhbAdVEMrt+kjg20uVtBGPEDeJGn1sK/ghq5NP72aue1pg7N0cswlcmIh2T1UFY7OJOVGP48dRAOAff4FJGTzZQpi4TtG+2ykuyDeKhhBpMiulf4A6aDwkmelSTlgTBpJtnJdTPASf41ZUjMt3gRHERGyPwASIv4GHORMVI53+AwASl/hkNKwiZ0Cfze4DAixxRxFyOq+nAOjcABsPrEiS/jmz+B+D3+CQEG/loUDBVjW/gXMlQF93YAeRg+14ZbHbKut1O9/mKpCIEQIrZYaFzHo5+jbX4wq9CfJPsjFc9o6C375MwfMVRsTf4GI2v4gA8IbyIgttcfb+FhfP+AyR82/iIBEgOIqguAt4D+Gg4/+CwTT+WOXIurgnIAMCS3FetfUIHAS2j2OIUBo6oUs6DSj+CHkm1/var8/jFNp/x4DB2srVv/wL1uCIIs2lAI9Cw/wA9sZUBTpCz+7MLQmm+ZEyPVc62OsSkEObzWIeBKw9ZINF8Pu7Otyuw/YdUBPv6fxQHHgOXh8EMcj7/DYce9fzGEtIvtAWAB/Cozf4fwcBB/ww6smnq5hNpN0B/ApvAkT9qMTpkhl/lSCCv6uyH3myLLeZW3CeqL4gB61qMbO0obGPYSWxIAcFJ99b6g02rIiol/Ip/FAcq9RS3TIVjfwsJi/wH4sPf8RgWAJBWGYFbP8DGWKZhFzC/wB/oG7ZIraSY4yijV9HNq2qxsDM/IM5Q3CqNS0vqN1zAeWLobNsotx/AEHFdfMyUKrvzCqIfAXf/EoPPDQ0Nt9xtdsBgKXFbo4tzqDCHIkfl1HImiirqmi9/ApT0ZT8yiuy7/AGQzc4H+AhVi1U5CtqnCyglv4AhrUZ7JFDwJoim4D+B/r7+YhMZDGV2VAP4XpN/4DMDIGmhAJsKd/8gMAE6IMBTGSUENi42qJtMHSJgD4QR33mnASSthCHAmidNZygAC5AcxgnqD+DObH/Hwls7/CfBN/FYA//Fk85/AMBTf4iCg4v8OAph/+MhQ0qrRbRx/h6zCIQX/GwSGZ84eED2P8SAhZRv/jwGV7V/4sZ9/4wUlv8FCBkb+54G9zwL+sp952dzrnwQCTPeYvrNOAEDonImzHaNYFrcRipvPAGm1OvH6ZA5+F+SLOJt0L/BNNzrO8a+9lCB7/FIh6cVIDK6tAYEmGLR+UN+/nVplQGHca6EJgApUu/wP2cfwQkCn8tF8YFbmX8Khxn8CxZv8QjHwnTXQFmRsj/AwrmpJBc6NuH30A7Nlujyx3BP2N+ybtXXYftq0JDjfyZQrlAnOiSPdA3uhAqbq87XXboerCdDBe/xQK5uc8VPUhSfwjCYX8B3A5n/EYrnWc94YIJX/gwEFAVybNkPUeuYw3q8PQH379mkKZuL9egqXfpe9PuppSGP81K0xNSobyYhreY/1yYKfjA6TGC2Tv8GBAYv8QgWGl+fb+g90BgMQVDUnRDiiu7QJqryeRCnlbnjM0/4OCAxf5iJtVM796oA/hoJK/5kFc/8GAuTP8B/Sq3aCJWZDYBWp3hxWhL0NYSdMKEfqBH740BhZSq2U/DRsP/CgJf4fwOGKz38BhFVN/DN/QDklaM0sV/wsGufwQFr/w8AigaNohAnWI0B/iwJtfgwEq7f+BcCn4m/8VBAt1xif6ECBbf8sAzvDh6m1m6lLWwAPFNw3YPISM0J9/gIGob/j2tWHUz+BEab/HANm7Af3jVHJS0d1eQkgqu+CbXTt/KmRcQ1DMQ/x4zsg/3K/wLOIi+6O5UYcn8AVY2FhfpOkkLPxmfB7R2SxNXQ7OViXloKuBebof+MgUi0WhAJhdUz+AwARVgtyl/ycCAxf4mCQCP+HfZKqlv8XArfSjBc4QLwgE/s7z/AYAJBwYNU/H+LAJCDBCX/LwSEH/GPE/VS38fXi4m3tzKktzG0Kxt/mgIcLgNA/j7lfAP45A3xDDSRluHf46y1CYSD8Ns/yQEw7T8cX24EEf4eCyW0w8zJjAEQO+crz9KeHSRUyY0abj/iLBQ/8fA/7/8PgL/+MAjBT+NQYPpKm48FP4+oyv4fecI//hanLoQAAiwAAT/8bAhjwhptKBT63cx4PKAszP4i9wm3i9UxaH6ZFFaG2CNMLzfwmFYIID/CmgyCf46EVnf4BsoVh/xQEm4PuX1f4wDWoPvAPtAZiSyHnGu+7Ll5niQMrmAYBe1/gkoy/h0gMhnDr2nZn/YDA00rKsswD5mZ5TT29fWuB+Zyc/Q7/gUUyM/4KARf5aUbEPN7cDecJgb7JN1Ij3ocV0FhaxlGegElv47YICJoQCNrHfgwPbyf86AOV30/w6afiLv/j4HK7rOXF+kA9xKrjnXHxf/escAwVxmL5xGpBphg3ibUTq8FaqoUJjfnx4kAeMQS744bshlqOKrWNZ+MMwcMNaU944v/yUDld/wHrhMQ4YAemLi6MTAumCkXdmIgPECQ3SPPS6cQ3VDGe+Dz4T2wDRdWqMHm+B2cULGhqFoizDUw9f8UA+XkryqLfJ0BgR16cGRPRe0uPt7ODRLj25WDqpjqfwO6KfwUD5f60BIlIgIwNoI6B38BtZmPYPGm5BCr5H4H0BSv8HA0B38EhXs16/ythqFeDzwBmG/X8DPh7NxJHPEUSNLTHBrITSdXuwaYI8IT6SXhIWI4iqD/q+/wBmaxykCwCBvt5d3zE5VXZGauTlS8MqcP8DP34p+05r/AYAIe/wx2Lt/3YgW3J/wsGz+H8COa/zEACJDgOJf0wIrx8B/CoJr/goGeJ/iEBFA0wtcNWKyvPHwOZ85/9SBzPhv/fgOZ9f/68BzPhv/QQOZ9/Acei1cjgb8jnmtvStjJnZR7fe3QdRjo+eXl2CKwoIwkj/B/phWHzjtLhLrXYBvCd+uxsRur97spf4MCZbAqPhAZ6kejPgMAE9oMBZTn/Iwbqx/GmK0GHoT/D4A/RYE0gu/nF44QFJUSgrrvCfn+GQtMXxA5QpI6wW9Y7P8QeA9e104l9BPqU8YmEog7rsP1ZFgMWJ2+3SvgWO8kKMT4M/8NCjLvPwUH1fy2k6jl5/Au5I3OJ4mA2rz29DSaoVrLUtB3lSg42dPfc/4j/KobHsvNfZ/4sFL+/8EBY0IWQn0IBLTpnz8BgAmxBgLs2/4uFL+2T/Dwpf3gw31IoaniJZcZ9fAvJDCN4gkX21/Fat9/OWZ5wfx4Eq/wH0wHSfCRZL3PUhQOJMDnBHuHhG55bg4eDqF8u9SK2hbk1CsJc5/x3oKYFyfwAgOTNBW9WVupifpFwzRpwTeDHJsfxQMfnX1XTb2GMBgVFUHFkvBrWvFryWIpwPFOTrZWFJ/AnGeV/xGL89owE3dkj0BgbjLL8Y8Z34/maPY22gX9m38Ajggs5/wiFwH8scJVgu6aV/A3CU5EhnfBmgD5onHg+OXuFh/AHU31JQBZ/o9FcbcRhwjDZkD4sz/CgFV/E43UAlDFFyBbEfwNkdf8FgI38vTABjFOvRF8eJX6BzgOB3pOcvhAcNvf4DBgzpMhAKMIuj+AwATxgwFO4JTlMNKJYNSHCMzz5iqnLQqgLxCeu+HNx+4RFXAALkB5Foscc3+K9uD/EwW4X/ExtwfwMBDf68CRfP4/3f7h0PTB4MzE4YTc1ZTg4QkUzSM0YUZEMDM5QzA3MDNiRDI3Mzk3YzIwMzg2ZGUmYz04DQ1MyZ0dD3/KQScmTNp6YmpibmJmaGpl/BAAm3/xYAgLe7Hpiampibf4mCT6v4F9JXZxdIHJ8dFUsnStBG+eSylrpPFGx6x8lWwLRoj9/nTPYIIX9dqQ0RumUxflfLt7E8qCgdNg/+BUusE/goRW/lqIlVrDKwG2vzS4MbKHHnkMxvoIfhdcDepqBP4oElQsAZeyhJgL+FjJD+Be+/+IQzqAkW2V1sZF4DAKC4L0tYxSsDsE5qzxAuzaN9plJH+BLvvcU7+C++/x5P4n779rXPRaPscktj51EAJ6jU29f9c6/hjofRWORgkT0/ge1EecNcQ6QRZdiRmD7vnSI18s5jw6+3LLLQbZgQdQwDFn4KGlJ4J9BYIqVu4c5JPlY6dxE8zh3P4SEmP5W91A/gweN0AMD/FQhUJQ4v8BqU0B/AYNJN6mfACR9q55qLgVU7hja43XnAW/DJvboXgCQwH8ExQ0+0TS78D8pGF219zXvtfhayWJQMzJov1P83enQN+oFC/OfwoFV/wLvz/xCLlhxAIAz6Ey8BgWcNi4mmY/v1sqx2fGjR/huNnXe9/wJajlP8RDo9MYquhdH0BgO8xS7AsTkwBsKgSCYMRPe6b+imzfwO04/wUTxfy3cb8OB/CgD+fxOF4AYRY32Up68B/ggHd39YE4RCkm3kwT3xwdW60Gqv5reCbdblz1Hum+HV0c1LPoaIasxJknxGAywrubhhpNzfguvWHdn+BRCdN/ghxZq4Khlf4nHAf8TAkuX8NeJn0alKIfwM1uwoLP9mjE5Txf8CZuXiHnZ6HPdJjxY2c1beJ6j2BhTK11uxN5SkmAlF3RzUqiPiR+I+KKHGB/xQElB9TApkVTMBgFNhNdNa40uqiHZPLGmyrVMsAGn8DAHeU/zKKLgSay4ELQID7xh3ni0pE0RYH4F+wTfQ94/wIiziZ/MpiQCY/B/DQXU6fzGT2f4MCMyP4DyUok9FczdR9ngRc029C9nPVJcLeAaTuR8NAyY+WUEGqNj5j+FA5XPwUEB6/xEGITpjEL4f/AZPY1Pw3m8rORXRyFF9c05mY/kq/5vx2wQKewgE3rkRBge33/xyojAWL/DyiNR/44FgI/rwgCpnhIy/gFx3twmJbd44P6fAgsOLxHDuYvApAiNtFgo0dz2Q5WpqKWGxO67d4culNqGxHkE34/SaJPRvwO3rNc3gz+KlEaMbIRk7ZW0P94o2IYax9n+AfatkeGRBuNItpaZlwdDe5ezub6PdVRG1b62dUvtwAkMhnPLtRjEP4oMkSXUzrK47/4WDaP4D9vCr+Ii6AsFxwj7YfNgP4FQynw/gQSd/iIvmUpB6Uawp/gaCykKkmA9k17NYoy1CiIuBUNnKwfqBSODJ8Tk9PO4dfG0JXn8xp61fxDVmSJOceyJDf4Ac1BxVjQen/wcK2XgRfTwgvtNkL/AYAJGwYBMh3KQWAmEkSac7F5Z5u59OX3r4mPmhnAXICCf4OGLSj/FQfYISl2Adf5ABmnf3+H+Okvfd5cxIhvxGfGCqlADs9JnzFLI5PBc9YTmBMTbhJpej+DPolSZJbRPGfYSII1tXxypFJ7r53Lf4+5SQ0VS3GNlaWsutfa5vr7/hNwdDVO/wEETlcyu1sIa/xfFq19HuQD2wpn8D57v8yjo0MROB/Cg21/AeH+nfwQyAVn+MAUCUGcETR6wXFgP8FAqo8KZXTjH2sK9myD3m7IluL1EtngiAVn4YFls/RnRX8AnQ5Oa1nHvqHhw13vr9OGllzsboWlM6xJ/wNriC4uCYf8AYAJl/wwGZBms3ykKkwGAcqD0v2yn4I+9wvJuuTuqcMUmQz1/FACJwkDvNbaHAYBuXLU1ZF/Hc9Eu6BBzs4jRQ/dYf4KCQOyn+IQImHAeUvbcFK9YDAW4e/4WBg/WGAjNFNXoIWOrHDOcf/gerUFP2nNv4DECD7/WgTA33OKEAc+NAo7O4/gMSIPP//BR2f+PBlVwbvP4E0LP4dXznT+qRfOHP5+Xzs3+Xl84c/kFfO/gPnAmDKY3RkjiEA6/wEKJ5ZtlNHMB/o9/AEc+LgVO09nTtT5LLYpk8HH8Mizjl8wUJWf68B6bv4UEYP4Ft//4iEpjrtfUS7KIDAyEg8dkTpMd84W29DTP79BY6c7tj/wUEsLgkX/BAlOf/wGK8YYB4J54+YMaKDK21jtGhMn4kzarw4fZ5mpezMpAYAyC6ESGaBkDcojRz79L5oP+j1zUv8EalJd/e2Myb+BO0e0SxjSXC/nRvU08GNCbG05/HmMyHUib/Ao6dOO8+kH+TgMHqlVbCbjta+FgD542EDTuJABUB/K2HQk4g/4sC/kASUACEGTBiSP8kBfyH+jApZz+ElKL/GAg/aMPMH/AE/T1gjf/w45bjCpFf4+FgzhD/FQXDn/A4IGA/0GCBtX+MhEEacZysPvND/h5lW3v4kBAzhXYeNsWScNJ8lrZkxz8xwEPkNgf4oJVqOKnAAb44DA4u7+AKX5vhQkW6iax+qSzgb0snQ8ghH+BvrD+IRGoCcDm1sjzOgMDXGUSkO7gDSTGznf4mfYS7chxuVv+BVVAm/goni/lpA4LJK//hUtX8M+BLQ/+IgMrrD8XFPHT+Bv+egYqIYPL2A7Cd9yPGXOYSb6jOqnsEoLJps+LfvRYQi4rF8ZKLbG9zLL3RBpiUoWFmaHdFzm9/wcFC5/zIB4GPdZ/wupBY38EKQQaX1+p/ABWuux4lYewpwKR3kDSWEZgIQyb2TzNy038C+lf8Ln+iffwGf68H+eglvgCxv4ekPLn+Hz/X8CeMeEfqigI07GBSwRd2T0BBGwAUFpe97ZAzqe+8g6RY+0JQ8z2P/FquIFq3TezqGeYA4wV+tC1s4hfkpI8Fhb+JJDz+HLIeB//zBQRX+kgoIqh/gWyHo7d13MGpF3vKGT/eBVmkdF+8NIFrpb+ALgI7mqhZ/GF5TU3+AcwbQTrn09uR+KRB4b1Dg4SssWTXWf4oOLHFXsmg58IDAM/jMmb0/wIQedispz7j3cj2m1Ig/gb/d/iElABNZ6HtwXxP4WE1f4DXeif+IiGGA4AkG3diLoDARAupijmH2IndCijb1bh3/XSV7Y3+EEpR/lz7tHcD+E2Iz/CAxq0qhd5/AYAJiwYCx5n+TgnzkNCQ/OFvRJI8TSlO7IHaFpdH5CXX/Hweqh/AvwTT72IdeJV6KdNIgp1dSqI30Hpp5GrscXQYj7MEmPMtcoDfwMirbj/BZcF/K028BK9kFr5BgfTV7882PGouFow1jFYQnM10yycYAXyC/4IEiVVVShAJnlOT+AwATrgwIa1P8gAuOn/DLuhC/mwy+IEZHieprEpr0uabste7snTKI8rO0JzVF8Mjwz6/w7DBQVIs/w/Ehf62DCrWnt78mjw+kdEz05ya/bhLnKbZ8T/eDVNiagWIVKUEGJ0uCj4T+PrIf+D2aGAYX9/h0C/80D+HgV9a9BF/auYGi6iL0ptDdkhN0noDUsaJBLaf3SG2mTizbce/j8F/EGRafKUhL/cBWasxm/1pIpKWmAq1B8xEOv/AQRfYrrjQUD1irTUtchJz2YBrMDhqt38AZlYwHYNtAm2pVayPm+QQdmz//D1CMAoCw3jVcPISEGeZuGjWwve8NwX+AgZUevUr8RmWNh25EBmxxi/9XC+eP8RBa4F/GQCuB/w2DftPb6M4AAgAF/qwV4MP4zAVwb/IgOHQBf5kGDvv8jDCC38ZgK4D/EARf/GoEXgVXaj8KaRB2gG7XspFSTLSN8LdfldKbrCgepW3LUwSFpX8M5pgO09vrV7rxmAPJTpEPte9QTy7RUJT4sh9KxM7piasIe95vpx9jfw6xP1EINHoiBfwGACBcGBMRvKL4PDIvgBhDV+S+k0OxvIwTPTOIr/w6xPiEP8jxG4uLpAT8PMT/qJ/w+wfoRLadRBy7RBT9KgLG0GuA3oud1kcaBN7ZVdMm7+Yz5/hgOY69NhCemW83wvFmeKpWVi1pmTEA6K/kBifFnF3kZWsus/X2ub+FGJ+E26vk3EY3pDOMbZRi2uc7t38oQ/+KThLtCSt1zheAwECB4wyhugZHK3blZPDJmbJ7Su6YALw/g44R/xgFav7zOjl6G3wGB/ChrP4c8HJxv8MRb3u3QdjpeH8NkwTn/MaeoZqxDIAH8KusLZAQKfwgE9VdRBe0t/E+Bn/CYCR/Dpb4hIH8RLdjr/VN1Njv8/XXer/L112O/yDdd/wHFv164gtmEUyWLVOyyGb0rbo5bXLot/ka7oy24R5TcbFDcZn8KGTX8C+iX8RI/jBaWRHnRn/A33nZ5OcUynUpnHEBbVYrf4CELLpeax+CEFi7RX6B16QB8NfUedHp0cD2reRhZeqeKVH/AQRnke/U4AJOM/xSbJQs/i/wAR9QSxAYB1wCA6AUedOFZG4JDXwYXEJRBfqP80mDYCo50ukwH+OgTDo1/v9Ay/0kFTMQXPAv+zbS2XZ4LvYCAVRliPn11Z0zahrX8Luk7CO4xqS4Do8CvFEbqhyMyrV6JbFIwX5afvJWegu/+CgZq8o/iNhLqOpxohVyP4WCWv4nGagFDJphgv8+AwAp6U/TNS32fGzTR8WQQMpWrNm2uv8HBCof8xDNUt+daQAYBb2PyAxk+fIZfrajbrHSyHUzknuYF8PP8Cn9ssR6H8BgAmxBNhD/4oDCWofVKa6ehgAAuEj/DwYS3/CYCsFXijhhwe0YePfSrS14QyoqzLWdGQSZl5CfKzeU7o4qBr8iap/wL9A00ABfzo6yG9NBrIpcu4S4XW7UA+kr5VfwAZojqYtLEyiJPrl/+KhXPqR/gOJhDdf9hB98es/+AwATYgsiY/hb6BGT+D6GEMxObqKkP0y5dQOKOsPfeoaDfv/xwG9YDzgwCIPhUAEyMLGuPBDUUtBZjtHz6dqBKKBQ5LnA7/gnPPC8fgHvATG4bc28jaqxafUkpqVf+AgSlf6Cf4qEyZ3gLJ7p/4WCKP8FBC+38RJLj7azUqjf4CGqqcAJjhtAcOes3J657lbs8VKZDU+VlH/BgSxWRQgFkYVx/AYAJgwYE+CF/5AJYr/hMDy/3wEsVlnEC/6+CWK6sFiSq9RBQlhElFF7Ykoxs7CkJ/gBAJvrrScMV6CXccLvBP4KH97Jgmhgdt7AMf76OVUa8WUFECwc/eXaU8vxt9bK52ZK/5wUJnDC7qXRXsbjfjK0sN452u10wrvnWGkF4SWMAw+Gc3R9lLdurJsTil6QTEduIuCTOAr5AJ+F6b5U8wWoJ7T3bupQdcvNRqf2CUbBUzjeQIjr4mF7wGtTidjaiYd7P41BF2z8BBj7BDkuWa5uhSbFFZV+M/LYURJMJA+1PEi2ipWP2Kj0MUbuODtIQlKTLiqTgdowDTUZASvihoat0HNyr+ug7xxrQjH6IDF+7sJwzjGjJ0IO5UMOe6K/kZn602HOOQAZ/lWE+T+ouIENcIwQ8d/nYorG0+Aky5TH6kiFOycGNLHgjHHulHmbH7lXJ7rmf2ZAhn1ebFIw8wKpCeBS6Iqc7ZuZuOWNdxyVWm0aYoKLwxk9L7NaT48SwQno0HeimgiAPEvDRl5XmAK52Bhqaa6afS1OA1iOI1LBFMx9G0UgJHxqPc8UCPdDf4CD4bCl2g/PSs1zh2SyhV6MnEEdJ2YUFcow/RmyPMKJvRjJgG9dNBdVyzU6XTYvBvLipOmcndiVkQT/AegzA4MdZ+s1x8ZMsL6u5k7SA33t4a+bNNVZzBgR3m7e2l9JR0cTEk0k2CLK7taHv2gvRr9kwSERuYdYfeMjnG7RFpOvF4wgMuwEf44fBHMQ5VM0R5PMIt3M7vwlS0vFMyOldSQQQXVVumOFnrgbSwBBs2ZMAlEddYeBrMg0/8hBLFf8LU/yB/IdP9j/w98i638P0/3ke1oDvbwIhUo73Dbc6yQFLkH3jb6vFFtx60m98axBdyY/xBnYYewCNlyVQ4X/eKLi6LzW/66CGDhJJZAIEkv0P+Pgca3/BIL4itPb63wWGYfXIXktzpOHgDeG3QNror8M2U5vMUUc6Sq/oNM2JX+KhpPEKE/4FGpf8BAzmP8EDUsGFNklKQyqXjCoI+8AkUU6k9uJQDU41G5NQFyE8gUVx5t/kgVWzP440KwGg/isBf/jEB/CQD/JQ5bJ/GaFOLvW2BjPh/Dos8fAr/LbYePQ+cLfxWpEfxijDgLB/lYTBZ/jJvP/j/iWv4f+X/4QaEVK+u5cGl0djE64rM4P0DUr9StnT/j4FI/44AW/5hA/wLEc/xEV/49iRwJ0AiEA4J7208L2DXuQAjXTKsfwPiZyVfIGf4TB7/4TIt+AA1OAA2fgA3uAA4AGquMmAM1Y76z+AAhmX7hbcJUPOQUSuKGkKWckDLNrhE7WhxSDYZARCB7GoQPcrd0sZ++ORJB8jvFazWDkF7hQ16zABsAAOBp7fbq4B44jaT4AOD+AwZ8GAwDfEorLCPgAf4dWwHak/j6otFP4fC4QEA6AYDAOwDAN8EAO4AgAlJm6vZQ3FLOzoSxvNLdeJl9I5xwAHCv4eDsgGRBQAg/wMC/Y/8L8xA9DWaMBgN2CgDABgO4AAQUF/w8AsdX+CBB993L6oEItahwft/gBmmFQfDoG4nNdQO98L/BAj+IKfOShH8Qx/gUBG/wsI/ifwHVz8gB/Al1qgALcAvoC+g4AEADZANx/AbC2QANwA3YAABAAQAPCA9QAAA4AQAQWBCh/Agk6BFAEY/wIANASkBL38CgCK8BM/8a/OP8JhUf8+Ij+A0k3aG4x/VZlv/H5jv/GRBH/H/heQ/70CLQ/5fActByFHiFx25YQER67BjKeBKDzlgyZxkdVwuSZgX11dkRLUN4XKov4ADx1ByQFKnQPu//gAdtj2AlJ2Kt7adGjwP/fCxw/11o/8gCaY8GZ9Eb40EDwOtha7ix2LjSfzqr8H8UYNgGW7loxcaPAYGODuACtSarn3vh886SS/ZBIluwBv8C+Vbd/Manicq6rdwGB/CcyR/AsR8FQgEwtVN/AYACV8GApbH/lAFAd/xYNfM/w5EffwrFiN3+Chmx1fQq8/gMAEuILsjv5diw/4DkxcK5tWVEPc594f1jUL7nM/kiHyfIqK5ALsLgtAi6+qvuugo2Nxzlhjywyis18ww+t4E+nO/wEOWD/xRl5abqb8MQhfwsD9fwH39N/8R3AwvwUUqwO1AYCewvyjBAMKBILaAEW6Qqv+Agr7jZMNv/BwWikfwVTN/60BKsouJ/x4F46u39U4s49/P2LPu/y9izj3PIDT0sgPhT6mPOgzmIX/m43cKYrmYuP3+EAtjz9/v+kUP9JBbHgv/Av2JjC8sYWLgKpq4mPQ6kFm4u4lRhpI/Dr7LQp+xwBGHbXzTbDN28kivHVxdVpM15Sa35o4MpN9PwKKak/8y2zZCrx38C/YmVqK439I6UpIjekdTdW3HicQpB2HUVnkZzCJ6gzM7OFr9w7uSU56E5wSEbVVOsViTyvndVRj/FC6yBBHhewjhbQGAH+a5O/wAeO5gtAIksI3JLw7E0Rfu84f80v9oTPBa2IAH8NCurv/McDkhd9I3AEgRxNKwCY/LsRbzPQpxR3vjAbID/f2KEIBLo9bfwHy3yF+cLHboEjaZB1UD8mFDBAaOOeJQ4IAzEM6z3rAfwT6CWatPKP4pnRnUS0hDxJdyQPyEyoi6cVgRFVMqFzMwrA93b/CgTD/E7b+BLZmoFrBTfwsDbfxO9GAYTZWnHtvvzCw3b/goLdv/iIczrpn+ADbkbjiT/BgMfXo6tGej4KYNaARDnYN1Bg0IdRth/9dHqXcTNi0mpUR/rEMpS9TEhk2p8L8x0cEYtHN8fhhzcfwSFXYn8B9bmZ/DI8/yOBuUTpKwGBWC4ty+uwbyOmr5jW2AdGrryCxKX8ECl/Z/f90Af6SCFQlL+BfK3Vhb3H1AGdQpcl6yDJ+gO7iJSdiP4AD65ObMKbnYvuk4qyZeC0L4EqRMIsxrhhR4kAQB7nWej6tb/BQYum7DsIBfXhEfwIACQTDUSjpZI314wWKpLr/OrLcr3Zx7fLy5UMEt36tjthAYH8JgDn8DNzMB/AYCDAP8CAI0GBOgN/ZDcz/CYHl/PjcyB+Wx/L7cyAC1yr/RBLAPxExxrPFO5Wz8o2QGVTbKZwqu9PQ6/UXBGSy4RqxEAe24YyZ0M1uuJ90L3LW/PrXLoSl+Nyi7qLxsfBaRjccZOqOVe2cjy8tXguOLGm26lufa8WTRgceC8BmnKI67gHr3wF20JdbO8PQ0QbZtdeDm3qGfI6C3Euq2lmpw8yeE/63G7fSC1wRKgpoPHlkfZ7X5xffUN0izqvRCcfwAlEQJaR43b58J/gIbDYOMYLbNGtEfveixpcXxk6oi2bzljUJaQdlI4FIHC2k5nYVpSZLgzxzZryNGsnePFkuvpmALiCej5CrtZov6U5GK0h20myNdNdX+AFnlo6tfT4qAPkAj184JlEQzZL701qLAv4YnaGrsFAFat/cFxvSvqhJ6Ay2uaebUyLu4uw3k5uLuDI4l/8BCUgetcFeZ4SoIzSmLlrR3+vrR4CGvl2Tya+vzXX6Z11ZfA2EKaiUOVAsgqU4OjGJwQ1vEjDZtAQn1g954j+d/g8TyNKUY42Uyi5qgyQqHh+2Jx4BGqC4MmzxWZFFtUCiogFoPCf/ls0VNgxPHg3yZ6RRqJfV+/ldhPYAnW4a4EAwvLpLK3Cieh8VedskpXaARPZdmApEdvgsxqEZAhoXg6mVVCtmEr9A2FH5yTYasuBUCGq9jmxZiyU+g+nDoj7YtMGuYUHUJyqKPcrfGUZpLgNl4/P33+wAscX+FBEf+BzvT+BxEaCtlD/WQWMix6NgeRzRAY/fShj9EhrShFxEzwfDgv8DIQhR/BQ3Vzy0Uhjnd3YCZiWoMCfqvLjpwFMuDcZLQkd4MS/4ODWuH+ZAIQBD9HA/wEHtDtidYeGtA4bKVU19t/wAzCc4sxIAy/wKhBv38FD+n+sgbNCdGIH/BgPTXspOiz3EmNb5pTjoDpcFpe2/iRjs1DuXxe9RAZXcJKTJ55/Hat/t5EhdNBph8qC4iOQi20p3f4MDncRlmHt/+AwATn/hlA+CIF4ClifoD+FQPD+AwEV//iMBFTPD5NrwK/gYlQ3cppqf4ABI4HxT4DfVOHj8ck/gASZF/Z6ieHIob9P4MWBFA1StIK/hQJm/gVMx/iECxCPF78LD/AQhU7Afw0ETvP8FAfE/L0sF/AlIpwh1uNZ9LSgT83skz0uOiIpdaFdlv6Rq9xuc2wplRtFjyYEpUK59kRVydhw/12H8AC1MxTOt9puYBd/8Sg0UNWxzPXkT9AYGc/vp3xHregucTH9cl0VZqPkulxX+Dg3ib+YgO+CCIBEAP4VDnP8FBqhP8QhNgHTPfCr1MWQH8C8EXIfoCo0p5XeNL4sm7Sbse+5JGh4+yX2dIus1a1iRmrDX+AhhyjTGEbieA5CFuX1zg0Bfb/zkiOU/8GAknv8RBEzV4jh30Uf8LAm3h/AneD/EQFiC535ZdyXQGB/iYT2D/wcOslga1TwgaEfVFCBoVAoUGD34P/kIHWS/44r8wL+H3m/q/w4IqRdX+HBsBL+DCWlI5sMLl3QjR4uA78+n744kMpfjMrQFpZzvfsXltJ4WrEj7moP+Whpv3Ds3sSN3NM78YXQQcAAwtSGgDNQ9bjCDrC0OBYc5NSZ0eyElzlQcnNb/3vIYWFrY0cGmJc2kvxZ4x2HxclQcN7RXb5xnrL3fMB4BLI3cRA5rHII8xlkZOkLaw1ml8yyLUwBuaHo/XcJDC/5VqQpozPp40v7D2irwYBY5ozpOLgArFSfNR0dCO6fnlNEnSie5YF0hyjVmFYJCG9IUCGAtSUQsxeQok9iHoFWFpG4UZBcv3aEpT3nRi/XSPC/DDibb8HbvQ76huaahHbZLACLAFM4fMF0cRS7zd6/3K/4ACTbd2EjLI8+OLjJE8+0q/wEOmm8lWbHwMlB2WTIAKbcjn0x2pDPH4hQvQJldVRuHLH7KqOt0ZUk0kdVq6DatrikAjfIEFGt+ZtoHiYqoS9aj5z0HYqdOVQR6asZrjhDgDgN0ArvfBIk659tiA0DLVGHU64W46nC/A6ABU8P0o5MtXkRdivmP060LChDggMc5WXsjeqRZKkGuy3BCOuOuCjdfFvglXhPzZSj/9jqyjIbEu1BOrzFOrg2c3Kmi2SlqDn8aM2t0BgAGIp38HtGijucIJy3UR9XgzRYoz8qt3BoWG4f+HhNxr+FQCP+GtWqKHbKv/hMvr4ZJqfL5kVXzhibp2LFZITCs37CI/hMAT/wsDqQTCwOfwNlif46DHJFKBAvZSVz+AwARn/14GOSO4Ct7hJA9nvgJQ4TZzmefqII8Nk0n0AF1ujBIvSaT27wEd/sIMckAAD//wY5J/xIJABoiVX/4KkywoNz3b+H+DJMoIvmbqCz1TL53pgIbC/rUIProxGeaBITKVF6LWbAztP03QRhv331G6HoH7GF5x0Al8OUwsOQlyOp+UCr+ktPiaG6lT33j2VGbGl+m4Rg2ORFoSHTINgSVJgrR9KWDvklc0bZVa8IzfaQmpKLwEsCjN11WBuD0fRLEqLh6CuOkfvOItpmj7vAj85AnScBNk1sXEDh/KZtn/j22xl5mR/Tu8PTcKusGTLwBbHPad9WnihbKovhB0PL52Va1iIXzbWMH+H+RIQFfFqMObQrZx1H7DtxDXdF5M/wENAOcNwzLILREh/habZVjjSv3bE90Jd8PbNzmH5biKXrDJSR4fwzaszUVQp2H8ASfAmF6lhJ2DDYPxuAEuu/kOUKvacE++WhnIlDPqhrg0DByI1StVJC9G4cZxmunjHD7rPtu8Rz+PTfBp+9EGJnCrGzci/hMcCwSY3EztrIgVb2PhwareERU3Fx7+S/VVN/YTXgaTR6S2B9EcUgbJRkdnZ24jwCbgOxwcmLpSKEEzDtLP5/joJj1Cv+9sgDzh+YD1LsWr5GCt0k9VCVYkdvhvP48gnzdQ/gGM6egqtWiS9ZotFvALQf8BCLSmY7Zp5pJVPL9CN0WGQwKHn0I/lb9BbPuo3Rz7l96VnCV7VBsdafYB/AzBv/EEZ08d1aNeGgwGBocBrAtaoT0caAZrIi/TFYNbisR38A+N1/BJcb/LbpR5KH8C6pnY2mqD2S5553YqpkVTf6mQQRs+ed6vcw0LNBn9ZaG3QSpvByXVKwpOvofYZkpxZBUB+GcPrYBfH+CAQJwDU/8ECPR0L/ggMha/wQKaPf5MBY7BGlYAG/GyCbqUxi0L/AQnchM1x/G9Xp/CtHyoLyRHxSL6KQAAGAvwveA+igvfGVBLd/esqWROYCbhP+Dg1/fPmIx7O2ppiwD/FgyjsB/4IHdkoCvhASe7FT+AwARbgwCi7L/FgyjsBwhKW3SF/roEoW/g0A/gdQPIfEqjw4yUi8y5VNci1MRktk/j4wCpFfs/Sj/SweV//CUii8fwVIrAXJAeJEPg98a8/MkoS4U+YColloYRjpCTkQNtDnmiUAf+GAvtootaDpxiGlutqlNiGbOiKMgWjPXf4rAH/8WBQYDof4MFifP4Mk6qC4aZHGIYs2wdGdSi6esM42ButI/M7AyxSe9nJ//lYGXq/noALgpyOGU/9y7NawGsSaZ4uw/HJTApBq7JKUbH1FZEgyFI/wILBF4j7mycPJmf5WVeiIQ68m92I4KVPaXuSfwAsUPAAQBVL1DfNqndpyR/8ALBtwSBL5khruBs/YxFxJs/oMalxsdKMuKd37MwELdz64olabgCmHamUdM6K6y7f0MWHk61JDYb1PWsT/50CUafyBf+vw6m3SHs4eq9lJQ+slIMA8kc/pCdU4/gCzveC0pBMhuyz8A9xbEUl14lVpg2ILMvExyIUJONZLlP8HDhAINjv+SgtGKk/0IFoxQ/+fAtGL+HuZ8DwP88BaMP+Qgww6/z8FoxQH+fAkbTA/n4vIAv8jDA0/8LgrhncU3+MBDHrZQ2Xc6TBfv4bu3gSn7ND+fxDIGea7/AQ++dbOynTAJQ5oh3JiYIaY6U7RKG24H53HbVE/eQrsdFUv388sbKyYENv6W3pmrv5wtAfcMIapWDEaMA1IxGEdUhEHVm78tPo+J9AbpNG99c8jGaewtM408rz+IdXOn1HubyQIJbrAeFjxJKTLblzsd62ww/OuAJnN7nfGT0mH/4lFXZ1r8ZDzzYondHe1osYwa0Hfd31wi8JxVD/hx6LAODbuPRXv1KhvuIsGtYdqXGVZmz+W1lRlBJpvoRvfN/X8PmNAM9m8VWJuHE8lrA04/QkJ3qoV2DPQh1z5dXdOrUWYo+g2ZqSwR2O07QLY00qv3/HtWdiMZaHck4BTd+07rGdczpy8dExVxaBCO3NdvjrIaepv3V7vcZsjaoWQqrNqGwrzAuowHzRe8q2Jezybd+aSuFaOOzy6syfg2vLJgA17XDLBFwDiWh0/Z/4CEcQBVSxY90LwKkbv38+CcrLqe0nfsfuWqHyIJj/H9tTthdKk+a8174IwxZBhf1J6SUG4VrxSC1uMT8ClJYhFPqBxGliU9VxXZgzi9BL7sgL5Av4DUXYIFDD+CBOkOgwquYJRcTI1va/ebcY8+g5mqvf8BCEbY5b8kHwFyBItvQsSj/H5Gy7sqKqJC+jKqciIKaipyog/4eCo9f+HwB/+HAaaEhYsBncmIqbD+BSWMBsm7FbfjNfaeCB+n8NQR746j/wEGsSZGGjtx+Y0MUhN3yTCAhux10CmpMYXQBIcOAE2U2DKAQI0JWeV9z5RBi13Fdlg3/JgEafUpLnMIvP/C0zQWdpI8Z8QMom9EHAoODGNzrk7nzPH8ADq+XSSHIrf1PKkCykcktcPUJy3llz/BHtTaOqByjf4CBAOdI1v5NAjbMfvJYUZSf213HaRvXScUaBJKMRdkIREm9SQTit5nkWhVUPvj9sELPM4eSFcSVd2viFO57uP8BAzUW5uotNSX4ou/sNgAH/gwJTONjczNTEwIzAuOS4wINkYXRhLXBhY2thZ2VzLXdyYXBwZXIAACkAAALtVwEeAAAwB/q9IRd8iVPcymstaCd8kSCsjgJ/ihiqgJIgPqER4DAa8KFi1ohza2YNEh4fXO+41bGy63+KWpIhPv17gCFgP4VMdP8FAp9H8QtbAEmNABbWM3/DZvc7fzGZdKbvIToAYGiWND9ulWj+ARxKI+q+wkPfd3aU/DhBA4BCARvwQ5BgXHTSgqaO8xNMY2VlBGUSOhF5EwgVfUQwws1oz8MgBcIMJR3f+GBhqq3oXkxAMi01C1+QkglxcW85aZc3kiY19iODN6c2duMAsAgCH+DQAH/CAfEd6+gf01aCO7jTuEqpi/sszrFztfwSpaP9fQX0S+b9OZozKf8KHj/+CgJHa/iIF2pK8+ymRk/wYGIxe72iT8tJAJAen8Bm5iINIzHbQANuMrdw32B9C8PRfTWT4Lcj2oqSqL8ifbRhgSAhXMvPZXD/8UBNucUfJr79UBgQs8YqkTyaPMxwfFg6fBIULjp+rB/xQgLAS6KwYvcxMBgP5T5AW9d2lSOLuCIrY5VBjHhrAM/4OD4Cf5iO4pUgFPj+BnfH7QG+FcYA8wajUMKJ6xzwKju4tm0l8YTJsDeUQ6R35cegsvY0weAWsPHsGrtT5NfI5rt51KH/hJDB/lp8hAU/+FQS/w/g5FF/wYBdq1lLl4BaJP8DOBP8AccY6ZMeT7aCaTqv+AhSFfawYa24V9mjJuTK30HjOxvzaQWyVG/xQIQ7gQfBj38AcjL7wgEr1Ad/AhgD/ioLKeAeT/BwwUZ/LdyaA/yss1fxIoSP8G5//+VhND7/IwjKOd+lMef68CyXv8VCpVIAZfxQdh/+MBCp3ogx7+AXkeEFyZHEYLDUJT/EwlzX/g4SHDCKYmW79SiklpdnORMRe0CCeT/8FB+bKp/BGOTo/4yC0U7QUID8nM2AwE4vsShIbejpwWZN1THYCHfQDi/8/wYBh4W1/tQMPCarivOPzQS9GAWWmQlrBZzcxiPlLrggONIQCSv8BCe6sFbHMpCtbzJovWQe6YPBi9gT2IN9nEDxkAQn4JGt3/BH5H7Ac/AbG9bj4mO1b4OQA7GSZ1Fb+1EgOrQM1+Y2qP9Q1+A3+NhcbbV5/8OBVx/++Au0AA/1sFNoBQBg4KBgwQAgQIEgH+5Au0AQokH+HAB8Wd8f+HAD8Gg5o/h0AfElD+HAH8dNef+HAJ8gM4T+PAB8lYHT+PAAfK1D+H+j4EYxtqGbEZwIYs2Ngk9t3oJ4+bdOFUDpE3M4eO4r2cxDuhWb46x5l0qmphprAjZJ8o1O5l3W8AaRU17G0/s/3j+kw7UcWaVuMmSEUl6xlorzySDv+CFOy2uRjrD3yQY+503UEsvPQAa3bbZplEDHRePWHJi3UsLYjCC+KUVdvKPx3+H0cAEXx8w4UNY/bt/FwOuusS/sUiZTYoUnxwDPpCd/H8KytpPOOx3Isw6HrazZLFoTCmFBMOQghKrrbseXudzdWgMB0X1kv1u/RHY+1ogStdL15ymULCHYoZOwAFTSeW3AmkGofFhrCG6mLEDzYdPF8Mm2pCkRgnVpgpBggsTVsy2uEYBwHvtP2v6AmhSRABDk/4BqFpkCuQT+Jn+KDBQK/gCqatdbvM4DAOAxhq3Oc3V9B6R4XQF23k90/bOg/wcIBf/zEwhxDTViwBgfwySdfzIYGzDFEgIAYHZh2X+AgRl/6G268s8kWtaqcxPAjPyz/BA0Y2+wIBJbzj/wGACaEGBURxKfup9WRns1VPF8XvM72lvIk6HzmZw19/gIGHJZyD/Bg7dD/AeNx7A7epknLUKioFGCYRV9+Xa2/jVb1yK6hHO7oIkKtHadzdP8KAvP+ChAQP+CKgh+uky5fO6gP4VBOv4nMFwMHXk8IlCr+FhyH/BgXjIbcIBJet7wgEl64BBg8+o/70C8cEd4//wF44AzJ/jQEm5KKMX7I8Sv//BeOH8Pgj/8Cb2vsFq+f9UNYZ0VMJmzGVB94gb0NRgT6L/uxSRS1xRatsZRTyxN4qwsvSTnglGnxf1rpXdPpz4mfwN4gDfgqmP8BgAhz/DAqy1B5oFCYR/wKe1+wdnmRJDCXVeDVd6hsFSTIKqNRk1LbdCQr3nh4St9Vk0xpoEEZk4eJcsbe8rhfZunM0vHZRHX8C+qzVPwUA1/6wB5TvTH41AAwNI8SA6abwPlj96EWh/BPDA9Q3M7f4HPZf4JA8P9bBTSf7jYHHOHSm/gIfOzXldhEKvxLnpi6aTn8EQ68F/W3V0bprwwYnl7Am8AyJTG6cm4HmHQnn/FeYqzZSLzy6NMIcLowu5gAH1yQQkYOX+eAaulQ7A5/gUMEEneVA4qm8O4hF6j0W+RelRWk5MoottF1iwgUBSL8s5NWP4UCmf4DuwWv+IQwSHAX3V0irHXP4GDBRGtMSwSWdlGjvKGXOHR920/bHgsX9dlUBmaT0EDyrlv0/8r8aHosp68TFDscj1M6lVMNXgrEC8P4lDcIcEasJQAUZS4DA+c++3E7VCmF7z0qfyMUXelCREFX+KAEUBPq7PgDxUqH+1gP4aDDHX+Yg5EDF2QwBv8FB49/sNIpiYCgZnejhyYbxgjydPX78dBoro0tCIS5HRVh5I1zBfwyjAzz/f6MD/MSMD9k2T/XiMD9Qg9+B0z/DyMD/Cgw1/goNLIP4iDmdWghAg7Z4DAxfC8w097yEqWX+Tq7jAlJ96yIAS/gXVHf/5iDUwG24sFi/x4GxUn/97d9aPRUtRKhuoCCCHhXgdeEsmVzf/AHFnifx5zbMiGv/wKaAD3hBDxhnGXzb/FlE6mgq2v29du6gO/9sUTceF5xCkcOwv+OA1qw4v9pBrVkd/sANas/z0Hqb//+DWrP9dBrVgDGQP8/xBCqr22e4A9Q9JoTWVNDJEcN56krKdWxCZTFAOzoY6nFaENoXQJtldyGleA/Lg6fHzxjI+0xQLMOODd9ZsgJ2OV02m9l+o/XBA/4CB3VZU1WHOu/Cl+hun2EYT4VW/nB78CJkzpRpdsvx7R6yrHKu1oQUnK9Jc6syG1cppquVQmhdOLtWh1tQIjKLyKfGBBRSljnxGq1OVhGp3g7DlFSNsxEIVidBjPw+bQsM2E0yFCkenIE0MtbHt49XUyAUTq+KETFhHZd/4fiCUvbJN+qeWzbzlmgTtpdBvCfk/4AEotYCLH8RyB16xb2pscIC93boWvaNqGG7yTevLCVIbjMyhdkPx3oNH4squB5e/2bhV0WEOMTGJG4pnVazwgdXhNSnoT4I3vcSS9/jpQJbEeiSnijdSsXlC4t+3wFCjynbkz2cGr7psfK9FtZZ/gD2C3DgwhedaiqC0RFN+SWMKTm2fGsvcsRvUBudsTcycGAYHXIf/wEOAuFYRQstjL1mRn0SniVCEcmO84tcb+EfCHrg6oFZA4FgdMQtZE3a1z0m/fI3GqCYyHKRFn8UlLEy80wvVUn8LDgnj/AbELAfxEVPgQc/OrFORL/AvOb7EIthK9+GZLTqGaTLDWB2djq/cfKJIRpITUnJbtdtXo6y9aoIkqoL/g4+TnXXOg/cHehB5ZNfODhIWb+Ch3H/VwSlNa3BFswGAr1xV6FR99U/T9hVpiY5DX3L/AH4eEMoF8gOv/Anp+K03SYfwGACG8GB6Eh/igDXm4cHoBBKXQ/a/woOPpf4QDUR/4SAWf4rAH/4fQmvPLAr8h/AwFN/DIA/Fib3tiDD/FbjccoYjTVVBscaCOwWxbyK8agmRaN7/jwGj7J/8eBMFyP/lYUCK/zQGuhfwCNUBLACsyveAkpy45Uw0EoRoK8PDW5fkoz1Ock916HjFPbC9/Crca7fwGGD7kITDJcnfwGACK8E/6P+KgYSAC8PKQ87wAP8RCRvn8Jt8ohfHWaNLlYnvYjsTRr+LYe+1B0U3mC0xNRXhm1Qd1CYmBTkQQiphBa7oDBJD70r5CC2P7vl7/FCPS5ePDngFvAYCUbZ97O+2PB8Oh181Y86vkweat2f8C/0DX/MZW6lT0w8AGB/Cgfh4fwcjhfwx2orhA3cUUUUB/i4ItYx/8IA7ateWznXKJFqbNF8lQaLLOmQGTS0msMm5ro7SA1/hIEi19imXdCMuqn+FOrJEve9Izk0QyhNj6pdZL6H2aGJK8X4bgL+FATL+A/x5s/iM+q8HggIRQf4DA/xcMHwash/AadOcf4IGD4P8YCt8X8xOcX8HsEB/BhPwUsQWylLadFtdyYzsmUmJcgABLh4KDBIICgABg4QBAP84OcQEdw8rQfw4APzIE2/w4Af0svcfx4APxQSyH8+AD+bQVP8OAj+q6fw+nUgi0xCZX9VPrtjil6UA/ZtwPbRqc8ixY3BNzlNkgjmpaJhySQu8dGxAo317/gCCp0fboBv4Bo7XpBQ5pkuFtcUyvcCebjS0/UenpFDMBYzp/cbOek1DVOReM6FCKZk4FSGJMLMw9WiEDJ6hGKSAm7SL2x7cATfpXmi2JhoD4zYtup6wMq01Yf4gdgski0BfhDI/yABzpbAg05zmdy6oy7p76Mg4RGuRfKLAJpQCRrzaRm8TS2RohOvz8QE1F8FmpyKxgYJ5mlXqZl3ZSlXHpGezX5gd9VveT20YjpEVUg36DgsaOVQ65kqHPCFEvsx32oCc24+60/KAfbqec92puzZzV8VYksfoyRhf2/gWCA/8UA7BgR/AhxMqqcIBMplCfwGACRcGEc2B/kAeg8/MaDI4To8SahnUQzKlm+cNXWGB6SR6QRsLlqXo2bow58Av2IQf+OhIdSBxej/fAOwZ01dIZdzHKz3A83x28ThlhGz3WStJbI7E9AguBYPKfbrXrvd/HpwMBf56DERQGEdHhLSACOLeCBgOlzqHlOloNy8pwICs51wYbO+nk3ZSgcn+HgV/p/n0FfryrYF+e2BebQEg27V5SjTXn/AQm1R5QHjs5nB7B/ZiPv9fepSfz8eDFgx7nbtmplsPjBNnxyjH2rkiE2oL5UsdQ471MwwluzOII7tz+JxqMy922RAcSQCoGJwYbm5OwKS+U9kl6fQqMcjhHBto9sbo8hhYJsNbB9qhcdaaY1lucmETz+6t5drIhDg/mjSucRq9BtnNx00HBDyZSAOgPHISwjwlNhLjkrTTsQOhUOXFSVrFdj5UakZSv52KrLFjBk8Xkez3x7rAo1eMgmlug+K0x+k82v/gCGiDYN78w5ahEM81+U9NXXRSrnlMNQGIGkYEOs7JtRwIt8gcNrClo1Hj2H/Om7chEjfEWUkIdqKKI0JFTAqEFQAg4cdOTaNwShDu5SaiQlMgX3CpJCQCHAQo9AcgE5TcyCL2pnEQNCjG1mS9B0Lm69RVvOw0Td9MCHmbNTvlLDhOsqT/RBY9OLW8kHoeAVTKpLCEEop8IRbDz+VvfyP6D4KbHAc6mCo/3oiNMQglQyB92/4ShKSygwJc4v+/hY5GiWmuQVNR+AmgCGuNv4JpZC4DDgleGnA8/CiviZB97fqEzt+mai0puEHWIYYTkIOX0QkVrMbWKBbwBasbN0eHrcA3pH6mHPnyuGlkFvnYPPisaOfpNmWYo+MFAP45S5hlgxmNStFtGGf9M420I2Zx94fCtExyP0JVSGGGlkokv67PcDeeQX39RokcAX+oGrnc7DH07CUmsay5ILXZg16YvMBX9siAWtPv9hwi2m9+ecZq0gEgoC7ymIT3Itmr8swMg1IoCanuaKEdnSsMPt4EMwl7Yo9SN59nmfjHysDF6rVOfdoL0WdrCxqSqJxWL4p2wGnhlZIEyu6FOLFXg+4cfIIY8QKm9QyglB4OhV/wE+GSwMvClii69eqKTEbBN1bzDDN7sE2Bsbneys5kf+8ukoamwt1GG3EjIHgMFpOgj3wF/uCd2GTJX2kTND4/gBSoDf4ANtWhohB3RzuMbMg7yg4O2pmz2lnXqNF+qcJeqOkv+3sSERHCLawNVCjm09gCh4qKsyGnI/JNKThH4hE3W+uMMRI/wEIioyRNkcRC7b+IKay0DhA7gg5UYlYspcnhwfvcuL0WK9t3U+txQtZhfUtkJoPAQROeVyjYkMtleUY0Z2m9hrA7MO+S2/+vQDBlQw2ufaEltUMOB8IUJyUYza2UI0h9pVSxJNWi45oJ04fMAHLu3LoQeKBP1lx597wvqFuLQztBxRjls8UURsSgSsCF1sbCPSNOVdRmiCon5dQc7yGgDIxMqrwb7LMHMk9SyOadZRkduqyG4zwwaorGkJXWPbIvRBRjRUUT2MM7QNuEPBl7O8BsxqMaPaiTtP+M90gFYFAA28G2DfXb1BesqMN+r44IgPSZQRdFK3RdJPLc57WJ7SeyA+lMyjgljOn+wbNt9H0J9qyuH9UL90He+T+Yuz+gho3A1O8POwtxJYnMs1lMZHh27ewSxrCTT8XsozESKI97Vb/bWxgmPvOEIEaZbPoOQutIZsQiZTv/MHb2nNLUzX3Hr0oAiV2zIxWAxBo0yukcBuObSXqkQCOShk5U3H7wMns6KOUBBFtkO8uM0+neo/dSSm0cAGFztMEjVxAwsqmyG7fR3e1Q0qj7YEBxigSdg/anhh4X/d5PFJqllw/Xh+UayFgFoPyW1A7TQgJvIt4fkAlJAQDG8xjTp+x9+RJMmcxPWq1TEfP+IIOEAF2SijSl2TKwoO7kYF/BADd7j6sac0fcxd3u1jGXVonCwL6ADpWDFeM2RHI9uySV5LcRAQm3/gINTgHfFSlhAZQPWwoAUvXjaqXr0tS8s2TlJE+eHwDWPio2IASCc3QMqrEy52tUkoQbiuVedBt/IZhqEilFvG+gb2cB0/ua8oQ9we6gi/IDsn4Bc/0rDABvo/WfBRTmzmABYG3eE6Nkj2dQXfpIeuPvhOwqDHY/4FbtC3Al6dg4cv4Woomum6FM0+Qp9DfjlzYZKGQaZJZpgSa03jfmRWWRpuC4cUyFXkpMnt9JJ1pD+KIFxfv0Q5e0uAwAgVpv3HWeqvat2g7ZzUrjM4Vf4qf4FgNnv+C37H/WQUPgAiQxgf4wD+vM/+BE1+/8AH9eTMB553lxqTza7mp0oqky9psyKfbMQzoBs8rOUsDcTWetPb7Ff+yg/rz2Kv9vB/XnO0//AD+vH7uUg55/CM6lYCfQ4HvNsROm9WZ72leBbJH0ebJ9hMOQYgYGD2xJQ1aaz4dvnQ0DgGMIcQOahMpnX1LZLNZLWhv8/AfSuifhzuw7DYzTOMMMFndvwH+aLXMdj8MT+OjVvYMqBWFMlSJHqxN2DGw7lGhZP+D9q/NN1kCGcACQkqTldU4BLLr/H4D+RPNSptawHUazl0bX3SCtDW7baojk0ERVkQkwRywfiS0RhxOiSIBpLLCRDMgkv5iODReepOODpQnyhxoDEqvfpF/H2K+gsXqaBFaWT9z7TxCXudycgjMiX0FLmz7T55aZhVvs5oosjTwy7Lm3jyqEKtByxYAWsF5FMQhxaQZOtz2OmZ7NH/ACvl/noP68b/4fEpP+mB/XiHbpARf4CHU4Stve8kvGt1wXFO9secLqK71ucGMpLHVdacMP4GEpP9iB/XnsVf7eD+vAPF/6AH9eQeWNvXhZ5LcJkkxSp06IKpDk+fX796Zj8JKCIPKQ7R1vhME9h9QU7A/CRyGrw1cB7p8lxAgH5zKO6HmnQS0b7M/x+GyL9telSrrJ8R/wBAK7JMb46N6KCzNHCs++EER1qzgI+FPYJ2IEAAGuif0JSjXQij6JNVHEpyRwojhaeizZaNtb5jEdf/kgQxUgwrokuOX5QRMP8BCxCIOYj5ZywL1I+P+DBPBkCe2AP/4DABEiCewyUPUFQrAdSuwN8lpYn/Sw1kxgyxHWGMhSs87Pjv4GpMDGF8YNlxI6IdYj9nakL+nV2RRFjVOqRLrubA3ArmjovlyugaTocjdQTNBlGOLF9AjphX3Emv4KCT7edxS7uEuAwGgcDyVNm2tfu8qhde4jqISnqveX/gFpdXz+ZST4D31nA/xcK0yY07/hIO7h/xYNcUf7MDu4QA/0cPxkTS6YIncib/OXzt6ncNWi+4AAS3uBgQOAAYKAgMEBP84rXqFx1YWefx2AP87pY2v8tgD/W427d/HYA89lpZhf8dgD/at6jf/D5O9PQFYDrhDJOp76PMXBj6SDzBo1BgRN7a8KD27WifosexdhElFnurUMqcop3sEYaEnFiEkIbXhnE3pM0Acy+LLNFh34mRhXmJVmFdg58FrxR8gvqIcAxORFOQs2KL9R2YzfYyRh1PHGa7pbdOxTmpzZx7rkkuA8nszc14DH1DD4My//D5T8RDXesDMs0oqFZICCEViVMt7HVW+RNaeCK/RVWTz+WJkiZKEaZsZW9FnPnwN63uWjQgPjkz2cR8ZMZJElTp9/ZmjcwQWbrTMppatDBvVsSbdtJYM/odQNMUjYnlB7+KcCE+O1nAF9EBrYpn0GZam4HAiwn1WWisTBUBIzk9+Uf1zZ/BHu77GrlExJ8YBfwCbEf4ukfiJPa7BEHVtEbw74Eymr0e41qlP8Mm5zyfzG/WxL0mZP8C9+vscEYvunrbFp+RJaOLVlNlov5QYF05lK1cm4aK3+0WIXHj4vkkE1vbmXu6dVaP6+As4J10E5A/4OFgxlaL/04LBjf40BT6ALnbfRB1YRdV9Q96P4ACDpXa3a9pHNM/eEGtaQTRldbFKOyO/49nZQP+Huz0EID/PQjYaBUH/ugU+gBoP8rBi7qoP66SgzlQDVF4+1FYEemjeW/F/5YDA6MwJZthBjUEsWZpkIbdgqEfrFpqM/gr93aGBUmKRKHOKXAgesiYzX+1LnJX+GGG6CyNlL1XjCP4NKBTjRwfKoGh9pN8xWnpuCi6AhLQL/Hwzzf/mwMc/X4muPKaDAT/TwKpRIxJaWfwVUqj+FTfw/LqqL703OBlaQ9jVtKBCu+O3Ko9WlZn8ABANPdtb6XR5jHyH6E8IAHD1xOF0qEb209Sv8F01BCtd5MT56P1IcHAzKVHlnxU7fgImHgLVbN3WDb522p3grw49RGL5T4NdUnN/gIPU9S6FIkaGD4qE7nEaJkCa/MGdoPpovdKaIJrlx4jOPCg2D6w4IBt1jgimWOwRtX6k3vswwXMQL2JV3Jlu1L7J5n4FreS7/Sqa7jggfBcAK8spU42yzkRlZfZ6/WsAIbZP8gARx29u3IqFJ5rmTVXL7CktaLkdgpiezoDUzzHOZi9+H/gIFqT30c28J+ukIDEEaLaEvhOJU3YVoasv28wPhiMfyLXqRoJRiiEhTY2lHMEUNIv+tyifjVxv8BBm904FvAgq9dzS9p4SKenbCyFT/LIHhWD4C1vPSlJbeAPSPwgUHctp7MinWo9VEBicf6LNJpTXguqWhKQHu82rnSFJuO4ltxGSw7eB2N9ASarn5pxl3gsy2H7U10pwyRePAs2vvCM8hSrzqC191ZbgRxrWKRiqwSB7Irhj30kowl8nIjf/AN0Pj13IeP8BgAnz/DHQ+SQf2sOdTAYDM/3jlsJrchssuAW4pLLAzn8QFewF4fxKAiQ4DAENxn5CB/Cyld4fwCcrf8QgIoG5wguclVt/DZdr/BYCN/r4Rwu/gPncPMF3DxeNn85kyBmxGV1hihLz1ixsp3S+UxUErgmWWc0k/4UDGD+A8op+/iIE2BtO+D4LWjIDA/xcMhiaBL/AsW//AT/hf4eGTn+oEzfD+aIt8B/kiLf+nG6dKA/dAsyUgekZKSTboAAJbMBgUJBAACAwYHAQj+cf8I4fFnq/hwAfflwgP4dAH9rc/h0A/32j+PAB9/NL7+HQB94cP49AH85g/gdAf/L+H++4E4EJvWUslcsQxbkpuGbWhZ3meqCM0oc0D3Wy0NPJyouVqkJ/VLYzfYK266VXH0tEF52AdgW/w2QuMSTj6IGc9MsUqS49ds9pdDTbu41PEOzB7ugMuihLoh+G5XcxI/VsvKUi513r2gjltW+wAJgmr9gKLv+rXDshJ9frbYuVfbFD+H0hwEYV42qdRWwtXPHRHTyMmeAyy3I2EdThNrMLLF2YcCiM3WG5ETpdsoRAqOyIosxbzKkc5Bpd1mXbA42rLip79VPgIZc/VYr1JhIF3nkrWvGLSMvNNNKWfQjBabJdcD5Wu8JeOcJPHWIdIjol5ukZ+N1x93MQnkLd+/YkzHFHeaKwJ2/gQwhtEPCGujs3lQuwfSkFUywt6MTIxI+Gs0hAKBo16DAS8D/xcCwhf4eCaFP8YCrfD/CN29NzNbAbg+QPz3/hATn59kD9gNBkif4AyOq7nrkfek2TJZ/V6mf1aL/p+Qmh23tIEwexck31fFEJLIQ5lCAIK9PZBj3X3Fe7P4pGCDaxyorJD4DA8xz9aLpitvOSAKlBNuGmnucZ+J0/gTeiCf4jARVCJG+sTWoDAyEtm7g6b9sCuuRPPzjquSynzh3F/gezc/gkba/1wDRGCuB/CSDb/IFIGv8Cg3AK7Q/+LBeX0CaP8vBjkP8JhAH+lgxyB/hMKB/h8I5/hnxSTW2FMqo+/10GOQgP+KBqmzvAADYQAAnf8IDFvv8Jgm+a1zAAU83NwOZ1kdh8BX0oIaEvMz+MCSQyVjcMcPOGYAqD+HwO3/BAY7eKUEx3tmAwH1aEOlF6tMtTDow0hF/CYN1/qQMdv/xIKbVN6L5f4QDfkfYRLWXc/243oLRwdGb2ILzOPMj3zaVL816d1ZcfYG3FziYC1U8DwTdheXIYXehIOOZTB/J1mD/wcDWs/wUStfyvGYQ8w20YAYH8J2Lf+JAnydfkP8/EWQX+Ognyf+A/7dILHK9gERz2hj75l3hOKBBxfvaH0c8SuAWNeYSr7PiErD7vkkiaMlyPwEprnBnu/KFDNiEiO/igtcFjliYKnr/hLEx3T+YwXU0nfTnAMD+FAfrv/gTma/iEDJsR9P1ipk84G95ky1eByT4cwx0+gbVDEfR6kK/Q+JaNq5TmS93IHREG1YPT+FBbf/BQmxH/ERgN8ckt+hWdAf4sFTMK+DBKLuf4Ft9gIR8/8XBXvkJ/oQNHC4P4re95DopbiB0hH0EjXe6iXh4i2JUSR/wwKeg6vAl0ayEa9+t6XVr8fCmkOzg+5/BqEzb5jUCraPx7aWeaA8Hxh9Hpf25/58DSIv4eEDsE/h4nWBXj+Hte7+HqQgc6P8FCuW3+KAYDcG/wQKEnUBYQgJenG5/AYAJxwYF6fX+LBC7kGxP9IC4sv8ccXiB/w/WbYg2eqv+PAoidJmP8+BFZ3+fBUJfnCP4+c5CbPj3yR/z0COL/4eBgH/8RBZRuLHfxW8Cf5eFr1v8fBtgT/AfIDN9DqkxM7TA9e1EZ8WigACsZNAAfjc76tfB7JwYBJ36ebkMalJeYMtuOCwRqEeHtdAN9KVbPzbdQsaZNNneQnuj8frYTuh8tkVYhm1w5dHckcCfTpbu6rJihEz064l0N3rJz95l9OCvqH+AhhSDk8gaB5VxCM3RwhbfDrx6E1+P1ERnHakzdN5xktXPa52dMszkW18bOvCD/ZkyeDEDWt1zm0eLSfCxnBX9coimrltosW8++KdaHo3CyVVgNnWigOudMkLf86CpeYJ/MIX+BEH8/Bf5P/nAN3J/yUCpB4H/DErFlVW1WpTKtfwzKCFwavUI1/DT+ItE6s06QAfxWA/949gm0mNWrlFg8H+w+iE5OiumPj/FYY//F/G0fwH4gjnmtYrxS3Svl/cXqs9fkxTfNRB5tqTj1GKU+gA9+iI6zZZSHPONC6mZrkXF3prE/TBovHP7ZmE4gOtazPYo5XsOxmHGKAJovih/XUPh/IlK4Ip901lpj+jez5HMkVC4NmsaO26xxLUXDE04QWXtvWkB0aX9sx/gxwXf6QSI6YO+6CrT2WN8IFY2W66njMv0pG/vOS/XJRZB1GmQxL6lLX46fx2Gf/wL3oJ+VptFZA2wsptd9uJ6jkEj0znYXipccPO5t6AUXJufUakrIFQ4ORuNjms1x+x5BE+p0XxDdX+DgQ0L+Yi6E8plEgAP8WA5uH+ChfV4DWqj+BqsDMdvRgwPvgv8XDYfrJ/oIHNw/yADFuk/+HAc3D/MBEcE/RJX71J9wptVX/F7vgK/bCUuL0qMWVgJ/wCYRNdoIJweYIASopUSQtsvxeLzuVMyZFpiD9eNkt5a7v8ASYpCdJcIYTyBtiI1NwBHf4CE3qB3dqxg2sVhKcuDPjdgylL0XDT95M+OOYh3jzarG5dTIxDb5xMDfWbjSypXR5xTEgDGkjewgcLdiTlKKEMyg6gvAVIU3NAuyoRQl6LkoerLjFy8bbVn+tFhn/ARAevzJ8yOAWiq7U0yVjzL/C75XSHsW2IIl5rbw/Pt6mEK1oocS/Pigk7bmocbxGf3srN9w9wuujYP7elXCb43B/gIkyzrS2RJV//OAfUxGgYASoq1/IeEeoBEg6kZrUehr58rMDtWw1wIesX+0/g8hedmpN90ssJHEHeIvdba9ZzP7cwP/GAcQQHeVybbSCh5X8O+fOCVP+LhQ3/2U7JQWZrCZ6aX4D5wygN6LDWgulefZmvqSJrE8sQYs4Y40rY7U01e70+3WwFBBSL0qJdsAG2/gaS4P4hNxkrR+1Nw/gE53wD+/wfl8lSSAnhWQAv0aupsqSThgP4HtjP5iPPFb1pEoAwP4UEH/8GAgLv8RMseBVNq5NIJ/gwEBdVxPOD0B/akamJDrDNAGvSsBRG0POFkYL+8DAZUHPYA+cLPNcSuJSRpMDsz46xxcv7fmaNgw/ilQMAAJHF5dnv4WBN/D+BIPT+IQEUCqqSAod7D/gaY2W74fNgwm8y2u895ATYB6r2clUXY+hvf3TubSNb+AKFg2DZ8Bf0z2+8PxZuz3YUmJv4BKjsT1UCRrnQL5Bv8EA0WyPQn8SQWwNSfyFBbAbht4+5ISWF2xLWwMoaMTLHaHo/RrJIX8UmcJ94JAEuLP8enPgJwf5AD/iwH/OxD1AB8H//gHSf/x8RLgfxa+7SCfQ/4UCZaxoqb/4YaMzPTQ/d4Y8i/g70zrdi8/goz8AtSc/hgdYNWtq6vMmgv4NBf5tSzGH/woF/1z2Wd/wx5pnB0MSNcMitj/DcOk2Xst/DHnmiUdCbHTRuiJ/Bt/HqJyFL/wSPMK1Iaf4wE1+f8hD1/wBQbHatSp/nYGpWNhrm/xgDclTrgrtscSnf+NgtmkOuU/PxkGazf4oHjEEsH8s7+Cm8UeEOeg/y0PGIN5RNy38EkhNnx5x/DAG/3VhSGyGcl/4cBdDU6Si/4KiSqctfx7+B8uKLsiQygTtPF0oYGu4RtM46EnE3+zxCAQlFA0WqW/BErgIP0Mw58l40IZozqkdEesSMj3QUAp8nT8vBigTtMfKDy/wEE/h7SyaBciGrgWi31EYTj9n6607r8hgwf77xGpDPNxxjMhj/Ie92fDnHU4EnIozXEPVIoXDJZE9B5xGtY1HvNUQKjukQ4hIylrKASO6GS36WYuBhKNlKpdokIt0s05x3oF178UQcoOssaqu8ILDgxzrdvySe5U3P0YaBeZtfDNN+38PmNSEjUfHbx17VIWWQ87dZqSs84XGB6DC3tS7j5rgUO7xSSi4V+UhANev6gDfbSvKJdplZOg/C0lUYGKWOJEFJpAfTtWhjSBqx03CO08pUa9csmzgPk3ASw8YvuXLR0dZLpGCnYIG52mhKq0TlM0tSeZKAWNmR3cM0zBGFJvLLBh5aQzKNIWRcpxb7K52w9KyfjpAx9mR/ABeMxeEB5RMtRlhphu0wo1ZW6ybXMHMsSIP3FWQ7xzzxxUnxXGoudJ3tSnaV53df4sICxJL/BQUuT/AJPuU9D/JP8BgAmlBgI6V/wsukf4gClymi6eMUmAcuUQNAFzvEs/Nfj/8BDVyRd/DguRnF3gKl2mHL12iQL/C+0HOBy35Dt86sbA15PW+DtjnjwDKwrgfXsGfr1Q7w2PsDNhTVo0HlWCXvNf4OFbDP5iYc/8GCWNn8Bufi+EiYXTEjfIk1o4XpZk0EhCuTl2SEBIHk7KsgGzRagIyp9jSR+r8A/SuS+VhQEcvDUEeOOvV/FC/b9c2RvLjlAYDXutc/MNTuNbEkONKk45rInigltf8UkmhMl7/MDWEBgLRM23kQrNgkUKBEdXSUH2Dw7dq8/za59jOR/CoN1/AfGG8/xEx+qaIQzksBv8DFNTFV+Kdn/wBj9Q4d1/JFmuVATKMYgzXhw6HhuGaDZXBVCH7eVP8KPjzibQgFhVphBSb3/ioEgkIwsmfHUWP8SAkEn8JudU4a3mQPpBaK2oE3S6gm4NOhlhDj92mHhj3boboRiyeBn8CV4/8MtTf+FAlv7+ZWptmbytMofoO2TAoeBSk3kXiIufxkJuSSppISTFtcBif8mvvfApYIqzTPPriMh1/ZUWTrVZVx8/gbphIc5Sq/4DABLP+Bhqbxojq1FFE/hsJndP4KARf9ZBC5CUNdQBgIt9EgCkFG9uFqPwKzYttn+AiUpPFP9z/BwemD/MQEJ/gwOzh/gDdQWq1N2vop0yoOE/fb4yvUCCwIMbJ4bLcGS3w9fFwyqbzyr+FA5f+BWtv+IAXSHAx/eUwZrAYD+FQPD/BQejOfxCAigTDkzYehDP+Cg02D2buPUd01+4pztOsiQjXd8B3DcZmA4NBHdPU3ALzP99oIPKiXeNXwUWUbyhJKJQCiQD8SskiXwLw/iUImhwEeVaBHJC3+BjjBy2+dM93fklECAqUZIFBwo/QqroJEllA6nPtWpGqsrNQWuFYgy47ikKmjtw4RA93qr3hgQFl/wcLGTfzEDkKbswo4BgFfvmxRPe43TMeL38tKmzAZkOuP3f4OEzYv4JDGNxu/leI2AYHD8n+BU+12S6VHIgY38ipqXh+UegMDeNp3j0fiJpB6kewWLUDU3Q3wnDGVZTqojYUD3DTpGbx9IWZ4Db+KA8LCnc5LI/ZgMD+FAXT/BQsbL/EQUkApFaZ343/AwSY8zbNNDcAn4s+97P6AVD3NSSTPfNfLvHO8Jj/AJCj9GBxcoz/DIK27fzGHU/4KC15f4E/gH0i8OiJVifZliaSz7q8xyNgPOnFYtwR81gN2Vv/bxpEpfb7dIODcmgEPfpLlaCCOwMqZtA8D+KBR5voA62cxmAwK0Wi3jK0871MZaYC24e79ODuc9fP8HCquf8xAd8EZHLX/CwWh/goQHv/iIPBxOTv1eAnIDA/xMS4m9sFWN8IBuaw+f4IBRnP8TDLSsKEIVQAuR/goQH1iNsNHRiLCrLuMKfr4qbmS6AVIFn/CAKhZRH/bAw8b/KglJL/Cn1l/mQSnF/h3nr7WNP/gVgrgFiIpKJt7n+AOzCxAePrCOfEeg4Ucc1RfRho6tPfsKAMZ3+AO0jh/x0GoX1MITNTqW/wGACG0GBLyBKRYuU5ogRPG76vXUpSZ+kHFqXNNj/4+DULwABhR9Eqwi//h7wQAb+H0W1mrBRIR97QMCq/JbdXWlc/z+dIXEjuvSagDSOFQwRc/4MXpqXfxf4BTyhrM/FGYigxt/TijYCgMEv9BAXcelsEVGytLda+1/ioNQv/hQPn/wUGRr/xEPIWIZT7B91EB/ApjLC12KNJK1igN8SrWPpZu8QKk6xe7MRZJzbHKdX2hlTeFNs6QTh8U4/foimtwFx91I6EILGQH/A3aI/xCJMIxhJaa7ZQGA7rUMOQbw9ClNJbYJlY8jF9/Iv6D/NQsQOBtVUf42DTH7r+A5R74//8GmP/w9nfDf+CiItC/xYIPESv+SBB4j+A/UyJ7bME3fYnNfffgwXPMPhSIArctUTC1oSvExURj1g84Ow/xsFr67ABCGVq+fwIACBPzlQLX17EClUR2+kxQdcTxtYtOWjxqZgkb+HP4D+AP/fDMUnSYj5mlU7wsSwc+wzXrG/18nxlLDubL3DTiAhXNwUTbAWS70ZOYAwD2IxCtFzhjNf5LEfjv/Bwt8Z/BIY1z62Do32f/AQmsF/Cgaz3/xKO2wzvx8MeXzX8DfckZexCnkX8A2MZpm3Pj0jcQCxzELJgy2TAKqsJgTEL7gPPaEv+OAsOjYAf4JBmP//BYdH+dAsOghT3/l7t//HgWHRyT+tgy/gDCH+r//8Fh0f6+Cw6P4iBY///BYdH+WAsOj7DxZwhBNT+OSYUqi91PyBfoWM7Ux+sF629mtygbH1sEs0K04XEqACdiQ3RJMSwp9Yu2ZCmZsii/mALIVisek0/zwUFh0e9zwAYBQAXwAcH8Bgz//8AsOgDCGOQ//8Fh0fP/gsOj//wWHR/7wLDowZXr+jKCa/22CqmgvuVV7GPp0OdXx6n3U49UEpIX5tmX/EyQvBKu+e5CDw7pSykMR0liYlVjxWQnLaB/hjURVd/yAFh0fw6f11X8Fn9bQRSmVKH725tGoz6qe7Vs8vaeVFStQJX8PH9b9XJ3rrRcr/kAEL/P4gPi1+54TUSv8zM6qjuBhnZ/gAS8ei6OWwO9BUEgCICd2hCpl3+DLwRyQDpGLy5ZFRh8QGoaFvjnMIIsYfOQD+tgcXeNlbrP/xgHIN6sGQwXkwfFnOkyIB+EQ1x8wXyN5/xSoMBLKSbMX/Ay73IONdUfwALOiRELIYlF/IoTq2DtpYZ4kivA847sloDP9mXJ5//DJmh/Mi4e4+Oj4/MDOfcoM7RO32iV/ABPRhyWWAPV2IJBvqlxMbEQd0FwY2TGlkjdMtP8KGvf8CrIpKX4K9/AYAIzfwysiuC1gP/zABSIHwGACDn1kWbfZAS4/h0Ra2bgSQ2HnfP8Drvf8EICN/rwUPLKJchvR3ZMBBIIo9ZEvO+0xmTa9X/BwYs184OGyRgj/GAYs0hN8aEpmB/iQMWa/hMBW5nxw9eVwMKgVDHFG8PEkIydGyUz6WBa1wGgGPUZFlPI/wQGLNaUguH7slOzXP0LzybtbuR6NSjjaf4MEI9gJiWgD+AwAT/BASASB+Uoyl1SzRsVQagY7OJRLLh3/4CBZOIcBhTDjl6AAP4Gnspggox+RXJ/eViGAyB8MJ5h/SRN+P26MCQJk7CvRKdb3ooMT5TJAy6g1luvteLMNUxCr0cdaf4G2sP4gAcaGxqXcup98/hYHF/wUKWXfxCEFgMvh9iDJv/8DUMTOeyNipDvcQlsfz9zEBsij/Dp3cAJymsQ4OfwJpstk8Hsfw6piVcIVIPOq/wGACL0GBMMXKA5v+X8oa3SBdXPk4YTkNBBVOwqq/xEpiDkyQ7L5/DymIh/w/MZOFn2I2qCOY6A9pMvaihua/O8b1SbDVDJys5+khOs0J/hgR63ExP7BQggbKJ78bcPFSCK/8BBI9Osau/8hKYjjGyNL/ggMQVfr+E875bfGi/tPgFL4BFP+1P+tlRowOjNP4HRPv4Cg3z/Vwf1nEehCgYH+CA0kr2oIIY64L6w0YzSMRnggKHOhEdxjtmdsnJPdd9Dg8ByNUUczoCZTBi9ZiVqu4a4AhkSWMvxr2P8HCjYI2O/7SFGwaL/XwOAoAf54EuL///Ao2D/qAJ4nwlL+CuAgRlVOv/4AKNgkBrO71M0katY/wAEJBB/gIkpslRa5a1FPfIoMkGjZ9oJO7SJ1HTjOn4gsEuFDXI4UgZ1Y13o/gAhvKyBZhnIF2unlfb+/9LRfu1xJ+DjijknEyykIQkOZZS8mzV9tIEfnEgWFTON+ZK0U7CFZnL8cLGuAxny4Lk3TDRsCNDP+trF1nmtSDP7FOKZ4EVwzEaNI3Ue2bfcoO197Cg5mI7SlxB+Ae2TDNzPwoMBG/wEEhyzi3Cz2jf/JizP1wjyW6rKYGHONps1gfuehnHX8P56RGp6oAOPcCD4EcAgWoXcrIu8C+pvMVM6Kiujcmcz0BvTXyiDsjACjLIzEoLxqDRF3ma9obHrBmHODN1gCaeQNs4/ngX1w7kW0sHw2rH5+D8Hctj9wTbhG/IsCEMsFaXhK6klPPHDJYYZkH8AEw+CbBBw3aCqVaKMg234AanqEKc0HLkTLoQ26wppMwbfgAbRtJv2RHnySuBCOKIhh9I7SGLXKq5qGdN625UsIPw16/cZRx9rL4XFDH5w9REsR8BhcnRj9uYG6CmzhLWvUui4m84RsZCgNYApDPgXgfxKOoQ4CYo3+AhQ06uQYD+FRZr/BQU4R/EICKBBd6EgiwCkBgDnB/ACdQbWTVXYbzKi7aZpnzLd1pPtn8D/JHPMRK7/ArPpE63lz6iccynltw9dsFvFAh/MKfwMvSJ38BDV/cwVDgf6oD1sATMujBP/wER2P4H8OPRUT/Wz0Ufzo+LfoE5DKZ/wAYpj6SDqSaYIDEhRv8VnykzUxbqQYeDRCkeR9egqgFS1DKecf8bn6lnji+Pkt+gAYFNvySbgcNEz0B9Gfr6FZmdDqbyFP8EP+k//+wUcw/i9r9/yAHVDORw7H8Cxa1G0zVt0PH+Ag3pe4Ual6rfuHuW4drYjbSMFHt0b2uBgJdoDjCs790sp9Z+43aQc89ao2ijPjnkh8htghJshIc5Ssj+BKJr/EwzE//CVQl/gIVBFvTPxZTqSp0Vv1si+NFyMwW61P8WB1GRJuIiQlpUbQf4KGLWf4UBIbw/g43j/hgTMQCGQX3XcgMDQNmmaerErZDXwUCDCobRjxdwnuz//ww0+ATyXWf4CE0uq4eR0ARiMSvesj8fisVTb+O05cJN3os/xwPsY+S3/AYLSwfwIACQYDDjH8LILTiBqTbiPa0QvKZS8qGr4V/ADRhi8RuRxZTWMcaGFAeETJUdYeopAQPfy6qA/w6U2NVnb0f+PgzoM4JZcnwuxH4x6xN+LHzau9Qi3LGHl+p1gBJG/a5Y1vhovm8nKovGpUlOZ6ekcQtSxOse7YtK+glKsWcUGxWqsMf+bFaEp824+M02sTMqDo/SI5+30Gszxv48VoQhrNMBAek5SOk5zqtY62H7ZtewVV9t+wj4aw6EAj2M/IMBrCiUEdhoDH+D4L2I+hAEw6wbDPZ+QyJ/hK5NAAACw0uubde+xyrLpoqrB4Bo1ne/hhzbX8m3W152g/4Jgr6eAShNguH4dKfIDIa/JSsA2yLMD3tdZ1exoOrC3rP93k/ywdpaIRGjwETPaHrtcFp+tyXCHv+KSXM2u2GBcXSAwO1iAc4rNmdBGE6q/MWe9yI/MxDeP4oRCvqBvnLAloDAvFu+kUo0TQt+D+LhA4DiWFEwKCz/gerR/mIly/wYEtM/wH1yVbhEnrMP+ZUWdUZHxoiG6BO8pKyKLwZsltgtxGTAqY6Nv8KAVU/FCeCUjIwPciBAYEO5koheWoOzPMiGXuK1BM2291emT81FMZiBCtZ/CxTH/AeMQn/wUUxf4sFJX4avuzxQR0nPgbrjr/M/yWcU80ex7L7gbxkacKd3j8Ui/dmy7GusZ0j/XSqJK1ndyRa5ZpRZNM3+8fwaglExIiu/ihMwe2VUDOqrf4WBNP4oHETLXHnvn8P4Gtz7PjpVRf55fR4QYA/HTCSel6HknrCU6GS82TRkcisWM9HSSlS4wyEHi1oMiUENFJ9NLcQoOxP/Bwl1x/MRgJagvWJfwN67W1AbrLlnAyJH0YYBHxqEBG0uUhBEKkr1LNjCyahpPpct8h8TIlMCKas5c/WoM9+1juH6Ywl38UArbRURm3WDsBgSjgZ42YihTH1MltF5NKsbLpSPagfwKY3tv8xmen+ChACb+FAXQ/wUJZCfxEbfwE9Iop2V/wN5h3Sej8Sh509sVMfCVKnGQsvXsUJP1jagfv8AQwZbp35PI3DZhfwoB4fwH66t38ERziCVKszrt4/wGASfI4cTrLymtSj7KYzOSadPXgW/wEl28r/exdvTDAunHXGwRdN9I9ItrAlfwoCki/8eOezGzsVgVXI3LAvfMe9yuBPbUDuw/NvOlXd8gRbA/cIBEweW0GpV5RKShCL9gIsVFUEM09KWpAA3qH8RODAXIECHRNfSP/Lzzt/n4Wvb/gx32ZA9VHTSfGZcQuJ/hogmtdUJnrUg/14F/+/wY87d+wPdhlc1BEWjCF2HszjqZYY8aQfzWA//wi+KJGX/Fb5tQJU+FSw/H5iVKXnqfM7WRtl2R6x/hIFG+vgb3XZ/Hb9ejqwPLmT9L1RHSzqHVnw8W5sOe7QL3XMh72My5EZOKPYJ5aSvB+2R3TczxYH/BwkcIUHPdkf8BgAhX/DFhzmHa/qEaBAYDJPeUjUnS5so67qoht7nM4IJfdbf+Dgfsb+CQEX/WAu8afAAMNfwNzH4Tg0Ijav4+isbsyni5FtCgsgg6ttq4ouqwo1RSgfLUITcXMsglG4mUNv4es5MpWBLcKNbrxn+DBisT+IQO9+ryrwnyT/hYDv/wUD9B/xAAiQ4GQwoEWHIzAYE+qptBKiPrwGGuNpmvqvyq8aZXwn8D6bP8FAmv8u2t672LQ+Ssay53klAqCdft565MJ7mP+DBdrURhALIqMX+AwATOgwJ8u//CAcET/CYHl/vgXa1By38/18GJ3j/u76Nqmv8+XexlGDHZwzTO6Wb+zjBs8Ewxjh5zH8nPHeX0U+5yKz3CcClNQMjM6zyddV6AoibYckilWqiNeJpsRzUpaA9G1i48LT9TZSWZe9N2NWmCe3Lys2llu3M4GqZr5yyWJhP6F3LrfNHXW7XHb7gx6EtmsQlr96GUtDH6ZXRBKzwjOhfrH+ipw5d8XfnPhxW0enmlEcqn/WiTWOGz0/CrbTPh7qER5yJD12ffwBUd/hV7J4aSaMF2wOndGyRMauJnToCP6mvgDRNTKhJpWUc8WhOBk5Gy5MSbYPYELiMkT5MYtiNN1K3D097AERPFuloJmUavIQjKbiUbQ0hrsLw3zgirUNcGLrGDKXwq36b7stqUs5QBYXk6EWljytxZTnYqWQLDzbhRoSDitv8AVW8NW67eXUaunf4CIlQKp0C48SQZJU8Uwsicko/0/b8lexTUhxYH2rclcfu0JrvXXkx4eHTx1x5XWemSngFW0dGLh2hsZGU6FEuiOMeMJ8/gEMeNnJSrqkVM32Gs3+60XA4Z4bTlpp7Q6rF4ypuCelAdWluoC4fx1fI60ZMaO7AlQDBZYPIp9C0xzQv1iugJCxwKR9MqYIX0kTcQTKSafDpEYQyQzrfu9pxKPcMI8E2rEKx1Wj0RNjLpm/OAC7Wv8CdHGMmfpdOikO7+3+AiIODeM9Df4GLPj94q8i/rcCz3JT3retEkf4UE8v4F4V/4hFOAJ8w8Bjo07+BjMTysdkHW1MQUTrutmG0y/o+LoNMgU7rYTVj9y9oLubEUZjP8YB/t4FCFk/ab/++g/msAt/8iFKzf8MOJAyzD0zf56D/bwP+ggQP/Gwf7e7t3Me0H8O04QDDZ/8SAgfVg0I0mn8A7eKCcqPVmSDbX/jQIiQd/FBHDhoiQIyavAYAsAsirhuc1Va/9cyLmfBuvsQRV+n+DhrHv+CSDf+W8kGtiH+BfZzNMUC1BLJwijVKsza+NZxnQOaR8Go/LLloSOdwk0eMzsAJJNjeSv1MYBT6pAraCBa5NGT49L+EiYzP10JsLfwJoiaNrHTWI734sD3xe9TzenK/fTARB8WnYARcKNnKumWn9yQ/MNHzREpNmyKUAXbii+90c0o5rgXhzxKDNQ4CLIHEXAz7AYH8KAeH8TkxoFNDStTnEb/DYPCOn8FlYf8rxdKmevZ5gfwL+/avhGGWxLv/VIBe8owylcoglTGzXMrQS+jESTsAjwxWz2n8KBRn8CpDX8RFPyyzyzP/lV/A3/VsiQ3wmJSyw4kzdQKN+APDSLGOjGBc4l1P4CGv/ajwGmgL1bUd6I07y/uJUUNoh3/W+m4pqTe5d8/BwzpV/MgXwCsMl/Av3zunlLsGnYAUBAz9TlRq65+DB2mVk5cUokVXI6QE55n2LprREudxyJXcNNGAr/gIMjvP8BCaiyDA+YjefxQH0XoEsAA7uwGAN/wAOHJgvcRycQEv9e7Kl9XhZI9wHEf8D+zv8xFyn+CgQrz+BP+ngbuOiS1t+0yPf68PrhGlPZc6LoQ3tPXz6kYkyuMt3eTE/hQAqvD+DhJL+GIAD0E9HGn8SAwP4ZB4Hr+YzIU3/gAh+MlY/gYDw5LvxAdbNRlAg29G5ojb/ARACG21N+gCejvkwIqy8o0XtcN5dvhPkZ6DLxXzp0rLwRnY/odsXzM38DQJuVIPOp/wGACLf8MiFwzQuRlgSn8LAeH+ChN6X+IQEUBKAEQoYpav4GpOujF1ZPJ2FADpZzYNuBaHrAe3S3ZZghIHyAi1Vi8RlyTujesC+DE2dmrfX5rdSpOzjeKAh/7L+B5pMZoP5pmk63/ZAS1QAf3/NJ/zjNJ/4sBN22UFp8+CeAiokRmF/n+aToYk+PI060ZX705sWXfraBTpMEZfAgeBmthdL9zkbYgKkp/1Phyy3PjM6lOCEBzz1/0+5nRJt4wuSz8E+wJjIkCHgamZiEn1s9SrZBWm41GuqI9YO4KoLXNa4PbQCYJ3YQMYrGK0ma9inI3xvJyxSguqzS5RhZmwvvx6rCGELribnsj4LefD6QvKgolg4e/nRN/hT2IPLNf2qybAGbQDk9RxoQrUTrCeyFk3L9Trzk8DhAEqjY+SK8N4mKh2ZXQ1T8w/h+aTH3DWwXOEotjiCR2tuGEzTHHSz9SR/FzaDlxW5Nwa36nMLl/9Z960wAzRln7cUqBBwj6Ia8+zJvHrcerYR1vGpzJRNhwU0hRsjpqbMHGfcIymvQW4Dw3y6J19a66K6a+oxCFgB29R2vPB3f3ylIj7hhIpdzk2UBKUNp6WHNn/ARK8mRQ5m/Kgy4HkRdtBi+NLGX2mb64TiNOPgvRsc/G8kR4B72hI/TV/EBY9OadA5UBIcbsx8DYpJRJn1/znuISbCB47nOAEU4OcgUU2T8NVynRpuXokj9psmL/A+CsWf4CEPdXz+AwASE/ln/8zqJtN4A/gWvo7B4hNo29zeRFjyw1krpffQDS/CISDmEwe5U4pcnWRqPmk16WTkYmEJeV3UPsD/tleL717Zt/gbFs/gJAqf4XaAtTQbrmUwYDAu7lMlPbwlY+cMAKDacAC7M9HFPDAvD+JQESHAQDViic2+oD+FQTX+BXR/+IQEUDWLo2Lvt6YDAr42gYNHphS9Nb54mYLTg3CBx7Rf+BzFz+YgF7azc+8AD/HgvOw8QgI6FjZ/AYAI5wYDcZP/EBedgqNSy+ZWJ3bRAoz9i0Uww9baX08T+KwB//bgvOwACOsYoh/DgA/qmib/nwXnYPf8dCeng0Ef7AFvEP8TAxc/yBFzBgtxTwgFnqppBgUwIfzQEg/4+BjdP7/CQf6TCQf8hDDy85/3s82tsoct3hhMBVP9PNBK/3V2CWurs3/HnbIJA2OfwLsDeBtzj211RsixIv8ArfGxpWyRruUsIoDUgAqi7K3Ze8JRBOf/Cg1L/goZyj/iEcxBJ1BzcWWdoDAscbHFk/IueTFW9rPD4ZfDyxwwsi/ggHVWP//CH2/+khD7c/gZE09H4TjJfnxTEtNfqmWCHYXUyjtvLzaO4HbYbV3RfLZKxJGiIt1Vs4aeKSFRv1HltH98uv6eV/iglMVXJfOtTAYDAOEBPowsWIiS5EnilmmzkBQFEYBz+B40/+YiMAi4he3P4Gm0feMyeAreP4BEJ6G57TVzsS7PqD/B/PlhMGeymdwSokNzOip/hQCq/gWMJ/iImxbQ1/MTACgMDfD/S4tulnpkoEGjZZ5kZrxnHxiD+aibFnQfeGAMD/EwIytfwLiagcIBM0w4fwGACPMFf0X+ThNtQTo/II4G1Q2Tc5TwKMcOcMbJw8Sr/HwLNV/AtCP9vxzBmMqjT/SEaOKRE9GXoWHUe9QCdBrxNB69JtCRcvoWdihBraKJ/5nywCb+R9a78L7LgN/xQEy7qNbkzszQGAg8RzdwqCf3ydDSUs4o04rYuvvDt/FJdkSG4xn0Sr/CwN1/E4mMB4c14oL1t38DSu38AB8ZH1gtPYzL2IgA+X5neDJb3CELXJtWUahwWDPww+xK2NDCMNsaaIXLRPE+PUvbdGR7UurPbf4Dg8AT+YQiYPWOaUCQD/GQoVpd/wHB4lX/uBQrQchP8FBEed/jYbGp+v7/GdP9IDXySb4fwJk4e4HhDh22hV0RgXzik2GXPvCsok2ViDkK6TgcXPfJkLnB6aq4HVO0lYtSnp7SYTKd5Sj/Mhmt7/FAzpvVeR4eopAYF7uCdRUBwUAp8H/V0IXPJAa2BSqf8VCByijT0GrwGB/CgdF/goPGsP4iPwS/xVXJTdIDA9Nc1CMPdClht9YZ/rACQsXr39oT/wcNzO/zEMH/4KBrYv4c1VqD+9iVpCNsKcTJYkrtbmJceNNSNYylISPX/PAos0NpON/Alqr7g6/rIMHNpGiSYB8Ql5L6SDBXSxPUJTv/kFZiPo+3v+Ahij+Y/hQIU/wRDO2f8QpMQEAmFamf4CBVj/4FucvcKAgE/Fdb86/8BEP+h8sdX3YeA8rfTNzB/DP1S/+rv6m4kqoaNxOCOScoIgFxQXSEkh/wy76A8b38UjsCUM4YiTQGBGhY8GQPligzFKLORMVnZGuxp94j/NSgG4zDThfwLhMe4ddw/2qCb+JpbSWNLbMpMaX7iGqKYRv9584ABsmc77iVz/CgFV/E4m8AXR8LWeqZsBga2PEW7DoeeJWNfS5gZOONDuSKGX/AP1+fwUHbfyvRbqYAucABgU9/p3mJgM7tY+t26txwIyGWAw+V/wR072P9/miP+kBqbV0VP4EqIfcS3LDcpWP3QAtOdMImfa/4Ys64TOFQ6XduOOoSRTy/HAg9HcR+uHJAFi+bC0DTymRafVeTLg/gbBhLnuxsP4DABLj+GDRErWCQ1ikO/hYJZ/gXG9/iABEhwM1rU1TuPSA/xYEjzx0Fg09/kAYDJZP9CBixBB/kQfX3Af5GSB/8SDAZJSDBAMBACCAoABgH+qgxYj+HEseHXGC3/hwAfY6cY/hwA/aAjzP4dAHwkK/hwB/a3DY/hwCfbgeL/hwC/g6Jz/jsAf/gM/GWpcSdw6ggHbIaRWt+xu3ISz7hgfzwGMyUZMib2yYTdbt4xMs+E0wdBZ3tL0iAizvgOoRsp2erla6l9KhV3oHOptYwt6HtdX05MmiCfRTuffCtZHPPUNJ6ADgq/wEGQNT+5OrPuJNyAIul3DPGsVcpAeMUpOQz9OGkrw0KW/RBlpLob/8Pj1AIkizq5W0QUZukZQ3LDkmS40u+uNqr0uB00DsrNtAWSa+PjaYF9Nz7UspLgD+D/4CD+CzVssq0oAYJcU1aQ15VFMwST1n6CzzPGoP47gPmOoCtYGJsQHuwNptx7p0FPU5DrJooj+cb43k7jRow95OrJk+H0QpRivMxCcLBW3eAhmqRuwn/8FEjlOFCewAZLxPR3ybxQXdrAIx4kkn9sEV/UIB343/ggh1G/hP0UhgSSyMOgAID/CAyQre4tTRb2tbdgCP1Sjz3nu0sGow1H147nHPbrPMRkNLgdVl9tVCxjGJRFpRUrVrtmW85Vt6qhg/8D7ov8EiTE/rIVI4oUub/gYedGp2BWCaEDaxpUgJUjTuEkXeiBftAu3ptqnm89KBS646F1xtaN3l/gBGEqIeR7/iJbYdnzTEDfxj/igVCKE0I8jd0gMD+FAO/+KBUIsfX8eTvGAwLopCPr/LFo/W38IDTBhE5ax68i/gZEk/gsBG/lue4aLfzgUovHmZT1Xt5L+gIowaROEmJK/eGWCz6USM0RMMPAwC24Q+1K7P7hrRNf2vKN5ado8W8fE+hOAN/gjVem/7/URr/SAvco8m/+CBiEf3IdUFyOoIV9xf2BALb/Oy9MinAbGkMtdbi2AWu9nPA0KCaYU99k5LevSYe/h0USRW/4AFFTYGQ9U/gbwyMX7Ld/4DABH/+GXiJY4jK6877+GwoAdv4KARv9ZDlSMGTuABgEsl3FB5OjYDXIMCHUHkGVv8AC07j+fH/A50//MQEJ/goGRz/h15heYQEcwmo/gMAErODAcUt/lp5hIQX+K3mH/AQ+4KF8f5OSCFuR6KodaZiYkOsBty/xWAP/w/G8/4+Fbfv8sBjhQOl1bTmi3X/DOTQoPy32ZBF/xWCP/4iGOI6H0/h5wfAv5geYf8TA/BP8BLzCArPtCAcnDgEGJJ8B/NASD/j4H6W/v8JB/pMJB/wUEsYlKCaaNB/nR28xOlV8p5i5je16H/g4cr4JIQCYXOBv4DABI+C3LH/Jwy9B/lghUy/gR+t6b7+BUPElGLuNCL8ybOLjohx0yf09hvexHaavfumCe3C91Im02F63ThmYo/0aIF2qJpFcfXnT51jrrkAu/+JRo2G4EY1z2DW/hYhDf8FC18xiLCeH/AYAJ/jQP+suDaZekuDgP4VGTf4FSNj+IQEUB06WiSCjK/gZLYKbF0vXLrhzlDjL4HbaC6DYfJBUtVAO9G/T3tdjM3VVBz9DG9VMMGFvIkRq3kF/LNDQP+ocpv4oA9v3vIkCzvoDA/hQHo+Q2T/wYPwxASqEbL/AYAJzQYWZcf+KBh8CAuQ1l/y8MPc/xhwTCPTff58Eh7kPc29ja2V0/zcMPdBn8GQzH+JiDzb+KwB/+H4nj/RAKyAiODPfx70TFBB/noYe5CyQAAAGSVRU+Wf4e6zovPj2wuCf4fQ12TujwuG/isFr/zAGe3/xAC1hR9/HsMAA8J/h4qLOAeyTwonLAD50H2Ic3U/oNyQ/iQX/PgMnHgT8lgkf8PgT/8YBl/8NhZ4Dd6y88dQ9/DPpjicpiwoT7P5WDoav8XE2bX+tB2o0D/10ETegaB/P13t/kAlwu/w8Eb4H+JhW3j+PPhr+DhRP+Mg0P/IBT3X/CouOZfwqIpBmAf1WAAfw/RUf4gHva/4fFR/8fFEV38ZB4f9Nhb9J/JkJUMAH+LvUiA/yQQLZ/00Eh/4eIXi/4fHi/4xJdwAAf6NDdQEg/ysQtPfxhZgfw5YwAg6eyls3/56DG0AAS/g9I+wHg/iseX/hM6l/isAf/y8HgxNPb7C/8OnzwHm/wPi4v+NigDP+GxgEw/gWRALXK3D1Cd2m6Vtjd+CgWXJc71jxe3gLhOWa5woL7fVg/E/8BC2urHe7QgDZhuJ5ALiRKbEc0l47F/g4LjE/gk8N/ltfVtcTA1knmjY1lox9aGDLhDBA+r+2eZIL/wcFIqfzErskQ2xlgBgYwUsHYKWI4ajIHJ9mblWQU7Zfu7/4OKRbgLx/gNAa/gRZmdoMC9yu/yQUi3fxtzAhH8eiQP8P9MIJ/kojmH/zgTLchJB/kGKDcP4aPOv8wEgrtX99wAKjSbkyqQaD5cfOvvV2b2w0lsnKHqlJcOpcx+ZUWVgn2/vHa/jOkP4ULZ+NWqJqqAz8Oh23uKfy9d5/wwEBhvDoOkTcav+Pho2pBcvoM2Ff5D2QuQ26vGmLQT+0KfJbn7uRgXg0PC3FIGLVY1Uvp3ws0qdM/X5STJH0Lm1k24SXPk60gUVE5OHoGOAG/5ZGkvyEAZ/xlXIZTmrW2EZmkG1Sh+LJQnZVwnt84/oibxpTcGP6xTE3OZJLpVckx55F5TI1PbOM5oXm/4eBnxP+eAZ8/xIFYcfwmDPlSFv5HwTC3p7q+grh4Z1r9TvlTXnUO9jKqj+E73NYCqGnsPUhbyfqMdbXohKi0RfONrQqoZsrnJsTSx2Qiv3mH/zWDP/Dj9iwyJqXLWoYwdTLVNVP6GyYdjQhZMGLgtk8KZlZ9cfHvNhX3iz+IYoqCfAEMrVLuZif/w8DPi388hn+tbgxdSfw+dXII1ia3kcTQJY8LGk5PMr0eVcCINbyC5CTF/gIJQs2qOloFI1HPxO3EI9LvGooePZlppJSH54CL5Q2gccWlRLMFrFW8yHP5qDP4Exc4sgq6YeZHDyZpD4RlFaIC474Iqyqui/N83SbQe18rvsGfCL7g5qsuVrIMHbjs47KT2f4eDPxL+eAz/iy43HTMFU84fBn2zTdTHaIUAYUqTnr4V+0WsfBImwwci5uAO17BeQFOwc2FOWnilc+KLQQm2gcMQFalp83z9KT8XBBRHwEvArPtq/5rBnwPZMqOe6U8dX980NPjKZR5dxHnQmBDWUQ40lSQMguCi262ZS0wgbbMFhpohBIt7cAyUjo2/h4GfBX+eRn/b7EDdwP4gE3/PVwVDgfg/MLWGTmz3TfolPFnGNFqdGQ2XdgiDfUHQmWDM4o6cs7bIX2CG5ZqXkokZm0tntQ/dOFbvYB5IeB40/5rBn/8TEe7mutzmip0xZql8CfVBXIYLPalJ7t1fqIO4kQZ4ixE0KyX9MCb/BuFB2ixGIfw+Jvwrtf9ZVVAxiXYY6Um3bRKnxNWJgWHgjyH0tgAKdbvkhHqlUdyycXm/mYaiQfGf5FRaZZb9mzcgteifyW/3PnBO/zWQPiNJPgx4EfOTr01ifApTuCZXoT/qlqPD9GyYt405zANz6rIKUyR8tTNt/H3SaQ/piVTSG2V/DwJvi/88k35hqvdgyfw+XxoNeUcD9L+NjSf4CGNKl0U7TOm1zlJfMn58dRZT3C8yaJbCThQSk+L1+hW48KDoQ6ZIoKd1ljJbu/OeIaoyvsArXo7at/loTfJACq4z/+PjMMB1Fp2lBnADG/FLSzPmBubPYfaITYKuMEiVyHpaTsnljiDrx9z7VzP4A52lWMq2rb6xQOhcCMzBTZOda8jaARpFxCnuek3X2bsRpsIvRC/EGEgindhExYUAoDnS1o6xvEPdpfKLous+RGFr4w30d4xGBNJ34q+clqJ/CPhywlgUjsqdV3ETynhFJqKBQirzK8OP1RXGpzFpthB8QYrPLGCPMh8PaJ/gDpUltufXq8Q9UJeWM6ygjCKuAme9zwBvDwaglPhPoKpT1mtbzQwikzdMUuwugdytCSrDcUYWyiHHx0zqne5k8fOr+HHFsUiJLc3a8yQebvESqvQAexjCEbnL+BzkkDDf4LOSfqc/v85J/qA5J4WHgkCPaxeSEujz+ADyJxv1BR8mce6mVbky4TpbVdaHlQFH+mDkngklz6OXS/h8yrQUIcV627CZ6IZ89ReZlA9iV3Br3ZKCYKJ2QWYL0DOv37nuEYifNzryF7ycamAn2DhxmRKgaZ2dCjK8JA06jhEpk/5/OSW0Ak44mbtPzatiJ7eIKsHW/RoF2DUdxQ4S9UwfnU7vOP6YOSd8/FJkZMP4gOSQKv2M8cm88rDKZpCAA2HwvTO51uXjAfnuk1C1THIaEV1fmyMLMt3efYCyKq7VQ2GPrno8BFKC1QGmv+gTMoG9XBz+fjknmvbcAmpxy1NVLVJ4BspafsgJdxEGw/tzwYTzaxXCLeb+mSWJYSKZeXe4/h8Gf2ln3ue1DL1LQrzjy/QQTpSpw/YQfcw1zJhjIKoNycFQDmCkpCim9Bjx6lPm13RFm86gQqbdAvES8gGVHCO3d2f/n85JEQLyDSR2/cz2IZVPgLG64ujCObT69U57loew4z8TKQd/pg5J4YqBjOxsTP4fDPyAcAEizBCbcWlo1RuTAWzexbjMWkrw9/ta+G4yGaB6t3J3ChA9ULlmyUrj49tLify6v3fYKCcTHiVCUIqaI8FB/5/DkmJ9gs1g13qtpOSQ4JyEoINANBtt0hYMxgqJU8s9qc3Cz+mDkngiTofJuD+IBN8fnClIFLC0LISHjwogCbWDBG9UOPjo448TRj3+psUMu93UUe2PR0HT0EHJwk4z3chKpFsANQTBGiSuYgj7IBWCIb/n45JZn1TFjC6V9h29AIst3MStSRG6aHGIDPkCWJFqIoW2yl/pgTf3sEgo9mt/D+Hxn/FIjw47Qix6ZRlIwmLzoxZpsFpSFHdn40+PC+2JwuPz0HsYl2eg/YYg2P8elQpZHqhGhBOXSUeBnqPoEYtgPX+n+fzkmpzQuZ4437bp9u5SfPMgGQ8dohN/es/PTsSkHW0rRy6D+mDkncfwHTs3mo/h8gfjFLwhDWktGm/GBUYSlpTGICF4LZUqEqNFhewJsWgls7Ho16gwokD1h8IQvTe+iUHmyQ+1yhzaTBMomU/4AOfBFKHTQb/p45J7jzGik9BarSjLOc8BaT2xkrlFPijRQup1JwP6WN6Z0XpKTsD5+VO+RgX9jXh59WA7NDEo1UgXZEE+nJO+YDSqpIAWQMilFUbtDg1iQvUHbhiWrwSARf8BA051BzJPIUw4DPGTvrtKDjD1IoO5EqGoGnsA0sILi4Xqr5qmbimv2UlLsiEsFB5Yaz9OAU2ZDZjK/BEdpl5RoPg27PcyEg4OW0gAigD3q9pvUo3Jip499k3gJkioDffuwb2rtyxVOcCoi4VRRHLcYG1qbOju2d6P8Ad2/qIFpB3hPLVE3XZ3CLNGCDHhvF/w5TUYB33+hWSxRYfQ+8n2ND32k1mSUb+fwQHJKl/Bjkm2f3+5J/1A5JzgH58oS7Que73bVUx7AdJ+hW7hQjgfDlqyuQOqbjquy/0yck22PFQlh2fxAPk4uU2MrwSwFJbl3/AH1DJWNshxdfScZ8XQzELQWgkN+slXJEZXCIxlpjD8RrSOYWt38ACPlbYIFeALEAUz+y9cu0umjZZY5/PxyT2yojF/wAKJDg+XYAjcumZEMlt7SWHFuxFOt1tQD9pJI7ipX9MuSYyt+ex/EJySRqW3Wrlo7GMOgs4kKvLkDY4iSLP8uRiZC+bPzYetyZoTQHVG52DWxdPzTOtLFzWGEUqAmDtQxuesCruCazVmq9/P5yTgNXWWdzWOxe6gxZ6tIQ9BjxCaI7uh5rNI49c1ExyhtJ/TJyTf1Me+0Z/D+aSgl66hfI6gUA8lTgAKXYwafh8PMggXNGgIC6z79ourIAag/JUSYKgKBuQtrCsYYlJ82AUuzN0HtKOHcboLpvK3+Ahgmob/n45JWz6tUmEZJKkal6kh4wMkK0ATnZawejrNFXwaFN34Uel/pk5J2Jve94r+IHjPSL3/CmFfbMvActW9IadszqkOMc3cs3V/CA78veDg21dZLNpNccrolxVsMHmRnvhWNkJRQjEatUnEJc+ogdCCZ/+fzkJQyj5HH51OnLBB12onPOHHTXUhWzBXs8VxvqKGs1P0Dg/pk5Jqnpo9/4hhc22lgGX/AQJnMP9G1D7rSYlyr43O+b/trgn+Ag4jZY2lycyPnMMA41lF5X9QdesSEBGit0CpbTD8Byw+5Igy/sN/78DF6Vv5/OSY0OXeiFBLdQxCAD8F7QW6S+YmqE3U63bbbiNYPHLMiTf6YOSd+l2nX1/IgwIUHSEbqcD7vUUeIRAbk6AIjgmGDgLGbSpYrY/wEUaR3BUvkRSMxKcyEXnTEd5zPujwFQfVS81fwDznguP9QkNk++ST2x1yn8/uSZJ0UgYKeg5H1nXCENBQBSYMGenOiUePkfwdBF9KLW9z/9MHJPADpkzy+Tp/D6zTHf4CIcMyEJ7p5H89FIKGbpyj4RfCecWAeOGVuBYT7q6KK2hLh1lj2mK6yk6+q66CEDdjhTRRsZ+3fGqp3rSRgC7dV/6H3JMBCRdh5w63/wEIikhx3f+rvCp/1wkFkm7ojMKUdQLWzj5g1QTQY0wx5RZr0wbE967CUh6IyLdbxn+6G5cWxGxvEUhDMkhSu8PZrk/wJo2zyOrg7x62Rb2jYDyLmCpH9A9yfDhf4CMQi4C6KQxpQfSbEP20xFcsaopGw/GCIkBEb6OTpg2kzVlGyPeeA1w9qfFcNk7J3GyI11k6RSufj19QVnKzyjIi+BGxK0St8uathNw+xXfMbAsV525AB2QE5Mwzj/FgEPSYcyCIhBb7FLevvfnuq00oiQGQy4l1L88BDm51j94PfL05J/jgdy1/wUCTYmK61T2a6a+VoplahZGhb4MGugvSxQSzmLX0TjPYIkhG+7hIzpnLx7iEYE/Nb0YC88Ta27eYn8YdEZroOEm0Uo+/Gk69boHmajeI48atH8Udr+6dVvW3nkB/Ctgl/goM+8/yECEZFYuPtBxGQRAYFLjuSbqpKabybykeOpa5xyqODfMAXyFOX8CeTFbAgFfrCB/AgAJBgSAffwsefhSJ/IR5+Df58CMyWD/PQ8FRAtB/noqIpCOD+Q26mwggdV/Mf9eA1Cs1yy2X0G4FDYqwoOYDBOCJwPVpNHvf+HmGkY/nlupipGKnWD+IG6mKVsABKTt6AEqO9o9h0dfkk6B6Az2sTuVFmUDzsu+i4sVrqjdDp7xIijRig4XmQKhbPNk2EahgxtMd81RG7NlWC/n9oJQxaDhNVENHdbHkLs1BFfdrdBoKHwcWSPtZUD+EVFIyr/pmNzFS+DWZjU/h91JTeWUy19PkSvHY9qmXsgpCcSXRjTbuaJ1Dc9U5wdLllNRuvIoB7rLbatkl9au1jSJCwlFGkExW4vFQza6CMInuNG/5+LvxWSYdQtJ9hhhKsQ6yDEP4A/QT0B/GgaVBcRaOmuxsknaKE+/h5upG/4eSH8vRNe8tP4ge6YLZx1In4XdfGhjYjWqJUZvlAag4QwkyLdWNp0uTv0mDH/rj2aNDLNE9PCvYk+uRKhV7v30L7/ARbQ71FNZM8d1Kphz/P9CnunDewWoDL7V9EvE/uUmYP6yu9JMxoavJlcAwd6r8T/j/TAM/vqNeBnW3efw/ARJWLnZGYQzk5nYZVRED5JGYsob8Lwkhq+qaN9eSBjXEvp4l0tWXWmOJxdJG6r2MWaPWXjMukzTP5aFpoXEuTMNhX8/l37KYv6Dep2mCdNgvMiGpY/fDGd1/ABJngPwDzi7Tz7Fz0Hm0/w890jP88xFNXtk/Mp1n8PxFKWSyENuAF3sO9DqHkwnc9wv1AyyT+2SM3PdCzF3EG6vg3vIfXUGbDFiLSUMnZwEoWE5iytSl6u7YqHFiY4jVmOP8stBIAcn8hF34L1MepNY2Pf1F2q1Ltj+JvCDlf2kidLjwbSRPYMXCrkUzf0OvjPQG/rRP7KoopSoT40SgCiB8X6QS3E/01DqcDIkzn9qtZjXPAzoPx+GVUlSlaVsU+rejT1umB7gilC+ofz2x7o8Y4k8f4CB50PLpCOxQT6QHP3xa5PqH8IzdMBFfBQcRQg8zODrDNwwxLL2TKzjSjUHJcvtqi4RvTBEc9PBT7lvP8gDJpfuaqJeUHMvWViXuqKwTSW7tBSwIN0P1CAmU4KA9qo93E2OfwzvGvn8F7hH+ugf/5JTLIq8LwsEypYkzRWzTPvHQ/J+b/Bgwar/g4GLIiv8ZDAIyV5HwCYPhAfwKY4HEgvkXnh4MWk2Az1iotcxHk1hQRudh4sBdk2/KnjUFWsJWaHWWzTeJQG4iqbI6EIUy2+tHdf+YAR4u1fSIxrnyWJB0KyJizHtOubaIRzf4wMtbBEIGR7CUA0JvLxaoRO51zVooZwv8HDz/f8y8hDvO4H8KTMb4fwdyD/4wC0LU2+l7q6rf8GAtdB9hDGL6vz66go5PpFUBjYu35z1lIvfve79DEcLBXh2WcEjhlvamfMoQPoW+48tTmklJUdjjv/lIE+UIPF2/wESyEin8Ngm3+yATyUPZS8/wYJ81+6FbhNqvo4UxSIGrlbr2r8ENxB4RVRIPs0F5pZ/gIQ+KimM+ny6RY8ShC6rgDzhOclH/ARXeA0S7J37l/FB0vpK1CUlzFAYEHFqEj3XyUHCub09VDf/qzcA8sWH81Ad884IkQAfwqFV/4MBKfP4hBuu9mYefCs/OBg6FG1tA6saHj+/FBvPCmzi5RkK1dPW6N09HBYDk5Ew/ij3P4WYaFcS7CoLJL1DScG9YUSnCJy3/Bw5S2sB/ztAcpbeQELLeXAgmBsBjH6eI8HJpqHpop4uMkW8BI/HLzP1LExuH/ZA5S3//4TpU/4gJ0qRl+ofwUuIBVMFr/8D6MFop2YIC+Y1jGeQbv8AQ/wrBsucyK/DNGIt3r5Mtzp/ACZ0ilfeWnJp07zonpxztMzTaxPQUswBeyZX6mK6B645VLdmFFYsNbWi7VBlauXsdOVRoxcclRxgf4CLln0zM4jgc+AV92iZCwc4wgAOqEoEp1jyO2iquwUujVi2+hoyI6zDpfmvH6tSb/cvuhSLHD93G9IOVFy5/xW6U6BBLv04aiLsXWdCgrAfl/b6PM6BXqqrnbuvvPCr4lclTcAyJ632a0oDBCmoluP4fVvgZxEgi1yE7VoCUYLTsjo/8ARIfumJ9iQd0zBuxd5u88axba4qdZanvOHHTtgZUt0qOlBdXj/EArth8Y7ju4RM6doLXFrswpEapwJPZ6cFufYVVnWNLEkNWvhr7XvB2iFR4wmmytCFBjX6e9vE7dpo9DBQ3vgNox/GICeVAdbxDu2QJM4coRUjqyiz5zhzTAjpLxPIBnuJa9Qx1SQ3G9Qc4QzNCNtJwZtbnFcSAcd77/ETrmMjZQBveLIr4mcD4BO58nV0wt4H+JiPo35AXaDBSjIIQ1/gBTdr2qDDRgglP8TFwaUOAoz8FlUeJ3ICiOCQKKr/jwqb7lf9ZCNcH8SFY4b/LoD+7fw6EUAOV/Ds1iENZ/golZJoGMa+wUowtEp3vh4RRNwvgSlPAr+BhywzISp4/4DABOv+GByz8Up2BBM7+Fhyz+BRqf+IAESHAcGZOQ3JPD+BiM5KRa5FPO6U/wEEyLPf1w8fADwbu/8qSy4T6djsS8Q+nFiNvfUbSGaUZ/Hda+G73RfR16thLLMWZz8DEZxr/AIR4v/wGACM/+NBbL2I1lO6pYDArU5C4upfAzSXQes0mADQC/iTEJw/wcReM/wSB7/ywRnHLHtCn8AxhWmtb7EdrBKbJ/d77zLGEIg6YdQulrwWFNIsYgcBmRchTxLAap60CXxJmBMo6hwAgG5oKimQQBd/8SgeEN9uEoG2ulfwsB3/4KINLv4hBNQRn3L1OjbSA/gUl/UZzeIWNi6aoj5qBUcXmbBC7pD2a/gAVo9AQ641Qxd07UWiVAf8BBDZXrxbxPWbzjXxv+AiBmgPKdJ6m+QT/gYpwNnHH8/+AwATX/hoGh/gAW037Z+94DAbapGS57A6Vpuhq48LLfELwoMX1v/wcRDffwSB7/ywDQuepPEQBgZwpx4WNxvw+nUOqcu7L0s/btDYmf4SJLaAJke1X+AwARegsn1/MnByHP+UgVO8BRUK6EqM3wP4HZ1FUtm4/G1p3GXF+TcT3QWEOr8LheWPeDaNeuC9BlnMpqscP6zMMbln+AGh5zhh8G4BsPacKc2d24QLv/iUF3hvtJZJB4soDAFHAZJlCEW/AMWYHE1UyCi21Q1Hx/wcRMHfwSAi/ywGMQBXU+gAfwqD9/4KIFu/4hCQQKHOBAHzAqA/hUG8+Qr/BQKmGQ0IBKZeLfwGACD8GG0nT/iwFTDCv8wAqYf+Pg5D/+EwHf/08VLPfxaUghMB/lYith/zERXAgAD/GoI//EBSQhMUSTWWx3wB/m4W90AtiVd9fU/w4CycJWW2IZCSmqRT6hdzkIBlmJ3Al/r/w9WZkgvKSsf/roNtVBqCBtqpSufHOWJI8YtcQ+ROlynfwTxlqQH89I771J/TE5v/WwTuqAAB/loPFB/hMU1/hgzVHwhtjtsBh/4+JjP8/Am1/+fi3xX/xITga/6yEw//8/EY5v8Phwf+UhJkv+Jw4P/Pw0PgAZJ/h4jHN/ioVn/hQCj/h8TH/ywJOnj4A95GmNsP+vAjHNBP89EY5oP44wPzEv8lgof8WgkdVHkgc/eX/PCMcxmgGdyY4DQAACAf4TGqAH/AhGKvMfg5ixrY3Lv1qWl/I/vLpWQV5tb/CoPbQmpzPcoT/loobo/hQI2nuUJ/VQkf5JSF/4fFn/4fFv/9MCM6A94wS0z6hJ/DB6LrO3TkzzIf54IF3f8hAoz406/x8lqfx4mKw38Cy2qylFCtrr4XA+u+O8xZq+ezTF+Vaq6O34b4Y4yqDDPnnLT/Chc3/goIf9jZx39j+Ay+Xg/hlfJ0Frcz5JpgMA/Bf4BNYRyGNDvTnWif3qqSZUIne85+QNOgg5M/gM/x6v/BCC0nqlLxKqjWR2Gl2Kpuuf4CFXe7c6pvKbIVodlp1fh4Q4mdVkYnt91b/ZRAjD7q3+3iBGEF5v+gCC0gT0/4CFxt27FJyQqSUX199we9jO0bW7EQocuy0/bwLlkrKCBssEIci9OEGBwLjePeG7+++ravksR2Q4My3qQRcjZ6e/z8ITE7PYo4Xt7uksao/6pWNQZt93yJIfI8nnX3mP+zHEt1g2ObgBIIENe1GEsW70V+PqbiQ9CjD3eOFk6ezaIDmAYr5f5ICNvfVTultEqmVQZVBNBJFCZ0qfD3RvP8EEGNZt/Ae+Mv3/giDGuJzQjy+px9UarIqhYVrL27Qx9AgAHrmCQgkTSMwN+K+pp7fdX/2UQY1+6t/t4gxr6Xv/4BBjXH4sK+LaOZJhK8AWEmHqjatPChM4Qzbv/gIiVVKwxTsvT0rwUZIPqeS23ebqSwtgSHXYgFnrcDucDWUvuCco12mhG/zzCfByDEphbx/Ryrs49PkvjwiuSc7YS0x9eAdvoXFti5Hnb73u4/J918jL52UecVnz2ey8GXoCa37EeAgXdps7sKJtHeh/H4hI9uTJYebCa+tXfYJNNPKf3KtcZgPFGxiB0vg0HGg/or2t+KISQovdFrw32adUFULO0lZWYcfEmFAIqiQpll2fIt/H4D+q9wowoTg4m30vZQ6u3l27UWE3I/MKRhCZH8AW/s1keDmlCT0EhkRFq+EH1fRPt+mXI+OAS1QNw8PU38AXDZMeNozPsHbO/xIJSPqupuB6yz5dDYMGuIke3abKvd5+gnyFCUFzhEDGwgGIBWr/AYAJvQYWwhFKfwmAVwFyEwnTXLimA/y444hJB/voPB1ASD/KxLXRf6kB3dP4QWQ/4rTgx/TJvuRvFAO/id5V/jJPf/j4DfAckLFfohKoVoSq9E++evOfTkMZCL4x7zGY3CzHHyplMJmCD5OUp/9LDBzH8Jhe/+NBwwX72ZLWiv//EjBv8VjCh/n4kYN/h8Nj/isOD/iYL//z8NpIUhmWujX/HwhOgfOsCRi0UXi1tzeko/0cL9Cfw+AP/yWKRj99kZxH3M3N866JGLQKE/0IL9yFQDnAGB/igt1H/iYYz4ADU0ADZ+AA4MAA4l09PPWyY3gr6J/QQuEgaGOSFfAgFAioIJQLIfAu1bQWThBV78fg/3wzRfqIvU5a+AZXVF7LuPsMS+n0IRIl7Ml/8FC/cnw+6H8CiX3wAYH8DgpP+JiYcwCAOoDADh+WCfCP8PEw1VcF6O0AgIIBAC4BAIQ4mV2//EB4AIAfDB8AQCAkoGAlIMBADgBgJWBgJcBAC4DAYCYAYBFfwJqnYEApgKAEH+QAzabQq9fPFsrBAwFmBQBAAwGH/yIBm1Ev/ggM3Sh/8XBm1ADggoBAf5CDNqAHe/gUBGBQBAP8HBm1FB/AZZN/gIRb4/gVqeARwBJQDqAABAATIBTAECAX+A0fMBZv8FBm2JyAnn+CgzbGOAZf/BAZtiA3ADgz+ChEdXVAd7+BQBr/ARIw0F/4KDNwwsCFP5Pl4cD4X/IXYLnCJRjwMGaKCobUrB91aZbWJSKB9rjK4GOgKAVec3Lu70bkSik39SUH3vK0vNu6mhD/Lg3Nj35q9ATZagKyiuvSn7TWkfPqumVhLTrxCiGE9n2ewTU8DrOPgXpRC/IEpeF20SDSUsFgZvDOMQ7Pkr1tI5DQJoeaSJXVQ6fTsYFC/Y43sv0i9Z/JIg6syaACHuJ6hlJAVv4JegP4dSnyG+t5j3FEqgMABNq+U/wEWXDjr42DEuXYVBe/d5KB62AXh/EoCJ/AkB7ZzlfAYEMibsYdlmzyV2PHKl1LIMWPAn5CP8D18v8EgRf+thHbMGN/8FBDejCMbHU/YpS+oj4B6iNy7gr4Z2ui9h17hrvP75gPjgmTqb8lxmZuxBXQO0NnEAB7VkdC/WNux/g4jRR/gkE2/18G/m/wH0bMkzlNoCuvTTfamtmCrdMLXbNKyVsF0K+WJn/cZuuhY4uZ43pD7e/oGAzRxF7bD11o5bU1vZv8VAzbH2mbpuwGAAmNsgFVtusG25xv9bNBR9A31zy5/FIRNxMZJlf4EGOD/4UBMX/BQ0oH/EIEUBTs6MTfkffw2ETO38Fgvf+rhxmBxu5Qcf4Gtxmg+QWQa/gCBpXlSoX8WTPBzgycD8nBSaSIzVJVPvwqQmWqHi+VIFb4n+9/UyP2wbJrGKIhq5FrdAvkAxj/BRec5/hAXNmA0GH/xUP6oAqLeR9yUgNyBcv+uB/RQ/gYBm/n2DS/g1DfD9ecgfHtqEt/uIeMB5X2NWYxJ4H/isAf/2wDYPApzrtS6D0H8ctgwXykq4QsP+fB/VDAwXWDpc5r9qVUXID7v4MdFv4SChf4rAH/4saGgUj7r01zAC/icFz/gNnmae4a/gRoqccDki/pgzjpY6tUI6pk/THWbgDrGo7/CgPVV/Lst01mpbS8HaYf4MEzp/ds2llyEgzn0f6eFj7so1rxBuhPrSdlNOmMzUd+sEIcnxDpxIOTvMQkfUJeAjyP4ADmx8/pqb5Dlz+KRVgvNXZQNgKAwHGH49xwjWNNY1KpiLDwOag83Xibf4pHf+3bxqv6/hYc4f4Fwc/4iFWMN2gMueq4DAP9WHjWhCG7EwSqzGrVsyERbHHbq/wcSMX/zENqf4KIqzP8TBhrfw1wH+CAhmaDATIgP8nBh0fOepbn3/Bp/gIqb8oCdimMmWxfOwgnT/JAYHR/wJsGODnoHZH00VjZLEs2AplEBBeL1NTLO4teMDOhr+ssUuiXX8KAlf+CiJbT+ISTboww+EwjCA/hULu+RBE/4HoqldhKf4DABDKDDu/F/habsECX+Wpuz+CAxH/PwD3YfwLHUfxuMNh/8GQIH+Jh3pf+KwB/+apuxoB5fO/C36YD+N5uwDLZXW5DVGgP4+m7A3sEQVgrf4+VkAD/SwE+G/w/QofySpofxEDBkYlUBeOff48B0XAr/LA+xH/n4HLTf5MhrCL5INUC8tX+DO/L+Eh2f+DA3F64o3hyFMhTCtTuVmtYAUhcE6Dv4rJXf8xDwhnuD/w7D2CZP/nwTuHf8kGSsX8dBj6B/Pri6BMH+egfCQEIP4+B//4+2YgFEMQegpX+OCnX+Rd4b+H1JAk/yMIhA/yPuzfxkB//xRbCn84g74Aid8Ocf5YGVFf4fEo/6fA3/4fBv/47A3791EM7SLCkKrk3QCeQLoPF4pSQNP5d6AgIk2DYIPv8+Cu+E/tgc5B/iqxs/kEgjI8nk8eVu8f4+KB/8fDz2GR/M4Zz/wuIv/wGjw/6sJh/iRGZvyAHp9ypFvNd/YtdWXDiORDYGE8Ir+AJ1B7bwZ05cXTAtjPwCn+S3lBknL9P1O+zwyonLuZghFLzyL62f4ZTYf4kKIwFg/l7Cw/kx5QL/UGy2XKVgO+L8EQpYUvrG65fdt/isAf/krCQ/iY0n/kF3wD8L3iWppgo1SEVBHMEhLPszrMalL81KgOOu/hgCJ/gUB+aQF/hMA0/hOF3/hPUr/hPCX/xMHiqA4IBAP+cA7FAHcDeAK9zwP8UDezpz/AvQK6A/yMl9j4Ij81gnG14l3AE1iFA/9wKo4rckM3dhytBSXlgaMjEbEl7TxYFfaBlIE3qlCYfmfxSlpGo5FkY/xQGA8Ae7TumBkgFsQoRLhYyk7GIeCEn/FLSlyYZ5JZ0BgPRLAksFHCXRL/V/+28rNGOf7kT1/wP0lfwSiw/62FmLSEU/gXzdd9+Dszz+AQZYo91Zkbw1uBGRh4rKpuJ5oljDQ0IZnsJdqp7cUwGNshxL6youIhBfluP+ACCMX/+EQKr/Xg+SN/iomslb4MEq7M/gWq9AbAU/xcHooon+hApwLA/ysHfUf4+EX1P9SB6MiB/w8vhfw87wB/8PDzY15/4QGjpvd7YisrAA9BXdG+g+JtSRbctBFi797h0fctde3GQAQe/8BBII5ZH8D+SLrHboUNtDToQS7V5VLkJ38Dfhx2yehJ/AYAIWf40JaJGk25qiT4DA8zRkJUvBo8P3fK2daExo7vnmfU4C7/4lARSG+exghiW6gMD+FArv+Be7X+IQIoEEeVTNa+E/hsOs/ILARv5Ybkjr9VfWAwP4TmIQL5C3H+A6RmPhAJjmZL+AMAEm4MHR6b+FqRkLRP5hpGQrg/oukZ/x8TnIf0dSMsRUyXKdBXgAAH9M0jIRyfwHSM0fbolMd7ungp1NWrDIZGQfoSM0wW72jZtq1jIRcqtys/zzSMqSxOr7d/j4Rpv87/pGf5MpGf4fDY/4qDg/4nC//5QpGf4yEJ/5YpGT8kxjkoAcA/2gTej/yUKRsTU3Biq1dIB/rwm9HBETTNyGlP9dEI+gD/FjIh/FYXH/CYJH5A7VBjMhLqjbW96zNxTMEwbR3qt/FcPd/j4m8D/i29BUkOIvEQj/0sGqffxl88z8fYUP8f/TIAKilGVbn8lg8f+WicKdSwownT2X+nvpmAf4D+mZhqAAIB/hMloAf8CGjs38JhbH8K/KATiYmqb0LCj/+MjAvH+UgkT/NQwJP/AdKE/tS0Tyh1c3pRIqJByedvAXhkSgMn0UsigfBOaAgf6q2x/wofwf4KJmnf4iNASrEkzI682A/gWOqg8twAUL6hFWIQ+CeryJOlioZpcwXt71jadqz2dPshQS/gDuUQmN2EstB2oy2eAf/ChTS1z4MJUoVj/aQKZOBSbp09NYr4BxSIO3Vdf1pCxb3qywgBqg/4CNFWhTOBmjDs4n/YBDsH/ng2HI//8Ma+f6gBTIdtzv4KBERgSAP4f1MQbv5zgqDW3v8j6ydIBcSsVCWcgS3qNt3cJvbpWmSqlzzV7bu7K9juYR1ce/wEa/BrVQPDt7XEZ3uj9/AH0pKJnw05l3bT1j6sEO0Ah7X7PR0oKEAfPeLUdWlNsucnq/9SbHUuD49ZuOdxiTG0UTkAd+UdzPD/RMWwNyZd2mmTRfKNmBEy00I5LzQgQn/juEalLlQAnFZPDeXorFSMcxi2Vph9a4y/8dbyVxd7KiNvYupXt8QmmnwSrbX6L9AzgEWV+iZzppwNqN/D65AkdaDJXeHMSXLtIJlYcEDfRMS+qSC5j9QzRgUDIHBB0I5LuSc0bjjGy4rseLlWexPunMPXFdhcOUBvZZcntjZP2mdHoS0Yw0LNeXXyyWTekHw7V7Cz1ErjKGEIIvrJQO5M7dL1UR9RQZs2Z5yNjERKJedX6f6YdEPFw3yEi3lFIVcRxS//gIhMV4+FYVi6JNzdkuCst0f2zmOlDyKrBJhCmxrbXQ3/AQGsjWArcqrs5ZjnszPyrAPblznsiy3coPornkpsJnzZgCo7DFfj1c5fmqRc/ukZFsavJwUU/xgjKX0wFHWdHNpPCC5iJq/xetkGr9A+f4MFELf4hARBROfeO2gIDArGPE+Amq6PrVf4BxDdEtb1KMe0+SIv4HUlf4KRk/5XuooTQ8G8AMAvOfsQEX2VQI6I6nT8jE8/IcUNhf8HDvzADf4BCGv+AgUQxm/gQAEgwKq9pSL4CS1xUa11Fy7IxY+Gk0yo+lBSqAuQEEoCY4Pv5VYev8VCbdX8JgWv8PRX1D/Cwo68SDdhSNyfw2IrsXNDg+Vcc/8SjqsO589kTemFJYHNRboamvBJb2DJjNaBfPp08C9RU5j5ne6fcX5Iq5W7lMrNv+AhH9VYO8ddYAc2rP4HxQztk9Cj+AwkAw/2AGjsT8B4ocWNty2MWLRRmYb6M4MX7qNujRJEnTpnIH6AFckF6LvsZn/CgFV/gokRaO+OpbP4DABHT/GQiL6ipSCiMcOwH8C418civhhiz5YOLNpPk42J5AvwR1HMTOnW8w0CSfv7G4/dhflwdvMleE9VCjVElhUpUkEhBhU4P8DhqP8D4DXDH5AfdVK9f4MBTSZIHUNMMbFynYtjvjnADUGVSws4RxxEbDn1UTct07bSgiWs63sqGeMXxDBCJQVWmyj3z8BFh1/bF/4OGvEjvjqf4EA3H7/XwWGB/Cgf5/Ayut/AQBB/wxZNrLhKqyj0wGAPS3CZrDwwgo9C2dGAme6Nmg/CTp/A61P/EANCAq9LkOIgRfwNiVS2onFGpvMGPK8biUGUbVTb8BOQsh2MZ5qFP6kYIfwA152x7PLbrxq6DN6P4fF4+ZDKDMHeJ1Lf/AuVMCfxGBYlbamWWMGQGB/CgI+f4KGulP4hARQJuezBR+sH/BgMI9NdlSq8g1dTFlbCQgqEnEhLfwkXwCJWlvoyRti+tvyJc1plcFtPr2osEO4Oa0U22pA6SaYFrfwNjuAX8GY7ij/CuO4BlGwLoFY4fwv47n8DAM39/47n8cwTJDXRlZnOH8cnJIOUxUnvAH8hyO5HV3QEuxCfYn+Gicon+EQoX+KwB/+LJmYDDK9aivfN/4px3PcHae4a/wQEPTDOAmu8IleIKCXnQfs7L/gIeA4Ik3zkYq5f4FFGCX/NBAeH/mAKyFPQkA/hUCE+Q2R/wEDm/X+EiA8MLMYH/eQVZCehIB/7oKshIQX/XBAhGeQAkfx6j7xFx//4Ksh/igs5/iAFr//8FWQ/6ECrIf4bCw8F6EfwCBh+/hk9RByTVlVJ3//8FWQ//+CrIf4PFE//zwVZD/1gKsh/iUPD//8FWQ//+CrIf/ABVkKF/mAdyfIEF/zkC+QgPZKFzWKf9WCrIf4TNP/4rAH/8xBVkP8P7woCXv+rgqyHnwA50RSmQU4a+EXv9K+ylCmUmcHv/CBRRxfzEmjf4KFFcP4Epzp6nd7zY7Vbl8ofJqjTCm2OREvABVycUuhZkwS/LHsJxChfCoNmp0SHq6cP71h0lHSOgYSzwD+CUyb+H0rpVhgAtPOaAwBj+ALfmfmSe5Jc9YXkjCEQG5NH4TC/gAu/+JQEWGkez5xG5ggMDUczPysdg74Oa+yU5PJ4DOOT7+/n/BxTB5/BICL/LdpqQeX8Kg3Xh8wKtBfxAnlAh42VULzhsBgf44JTquP/aRKdVjf/+DmVD//RKdUfVP+/hfP0mmCcQBDHbXF4CDBjQpdE++LQVdJC0oZUiI+xyW3hZ8z6I7x01W5G4MQKSPa92Eu0iBNrMgA44R6VEDlxwrrHZJ02oIpmA9YnUEu5QdWLBZ8VsMrSSeQtzG58zGIkBcq1pTPnqEEoUvJ44U7rm0kbqpT+rj1MfvfIblm73D+lzR3tO3UKfTznGp/qja4bNRenq7SlFEfEd1gfvRG99Sj12r3fo0iJXHtOFBX7vZPG/dRUiljfiPIKeWpFAWjJRe8WP4f59AZlWL5UTAOgAe04JUxxNRNBHGoK+R+KczicasG24CbpshZOigyr6aEX9/TmrH4FtYEQBKAOhxL5vCmBTgThIC8pp+0ryktqc3Fj1yRtGkF9ijGnLPb3NwU3jciQ5YbDGqj8b4CFAA2vaTWf1H0026eODpnbW0C+zyEenDwW5j7Djk1knrYzWkgIQe/8nmtv+o4Pan5/a7ArNsOcUptrZUXnv8BEukX6m/4CJidMG95qxK+v894usOeQAubyiDCv0ZlfrtGGlKqD+BciWiK7hnAJFFP1PY++ogz00jjq0TFCK4cDk6CTHbFP0ksp3yAr9MEcliZSjeZthoFHcDYf6SQW/wYSBahLvCE937TUIY1XiJMGAlz3/k4akUGv8BCbduabzb+2HUyxoisExp3BGYHPwscS6eEN52Z20P8CY0/teBpO7Tfw20J+SoXk7T1DC47NfDbBAN1CAT4Pu0GAsbj/k4etl/zASNqCq8x/4QKLQr3pTQUm0Tf8yVhmkcjl6mzFkU27hZq3HMIC1Z/uolnMY74eqsRmzrYr2rffnMWYwMtA5cBp5+/igcQ+pCU4qUqv4bHEP5kIRv8FFJlIZ6izfdify29mo9Qz/wW6JAmDgr381EZf+DBMaX+EwfetggG7/gYH3s7f/FBT+XAXEJ0AC8/4aLQNv4SCQf88CRkwqvMae8EFZR5RXVdSsA9XzwG7rONmIv9cHYcC1wAKTP8/hfH8E5rtSBoguTHd3ZO8A69WBZdGWEAiTQcRrH5LiQI4APV6qE/01ELNg1dA4b0ORm810jW+/tfwE43/FBKRHnOVv/pr/CxKR/E5GGBoEIyRhxyEB/CoPx/E4CKBTt7v7MqGQGBI7sWk6mtQjMaxTOCEaFoiyAMepp/gwhHECcmEA0EJnv4DABKeDApKf/zJELpoD/DpgX3Cd1/j4gMGcpC5ZW5t+L6Uh1ZtIhD3aM955VKU8GbNoHOJP4Bo6OB+ynZ3/wEbXmipIMlM4qH0wFecnGWctqbFNRGYm0QrjdyyqVv+PjJtQb/x0IeoUP/OwS9FHyxOMEZs3Yyycp6Pyfnsz4T8f/88EeZwDYmUh8wLTg1pPSkc5beEjTEbX8CWa7pIIKHFKwePz2QvioBdDPe7oJRxo97JUWOFcBbR2K5vaT0HBVGlnfxSLQT4zc1PpfwNCqWObbMDmN/wDK8mNsJrO4Hp0mbkxJyDeFlZdkiHNrdWopq6Jqft0ez1ge2fSwZ3qaQBFtYJY31n81HBjLuhBuAGBgjMReMyqW/GA6T5aRMhVHpcZyJn+Dig/T+GI6Sbnm1/P8aBwc97/Afa5v/8UDg5/4f55P4xSnwSw88GHMtjT2+9rEYnZiDKEDo1Xa3feaANBFadFRoMzGKGSfq5Bjim14xd/jgS26AoRZF248/gMAERf8qDx2A64QjHXNA4T9GbeTMO0G0aneYifw4eG5Not8EhP7xp69BykAhSoULEEUW6QbbSZqWOoRMzTmMm4ev/gIv35wOaIKuJR+f4Z8p2+zTIopqd68F67P4pQPQ4m2mSQK4DAUNrrQ3FeabYn6uakTGOENSc3O2H/igb0zwrhG5NegMD+FA2X8igb0AqbEMWligP4VDJv8FEbsn8RAIpB36rqDb/+PijgJd/+pFHBI//voo4JR4L/XhRwSP/6AHsDv8cE0kKB//OIBwv9GEA4QAhW1X/AuMzdYiMZq/OVAvw5sGjL5/k1sruasSZRbk4RAv7FcrlTMMHR8Y7FM+7DzsKyRjZqbp1gkezS+ifwN9PIKq5fsIgqrmC/40BU2GpsLVyG/gadEvL5TxOniisXsD0m7Uwu/G0K1iVhOdpUDvO43qvltawdwzzJUe6fqP2r/hWAmlvNB2C7gTksv4pAaw2EysbJWYDAG/aoobqwICa5HyjZUNFK2aU9rDf/wcVAyfwSAjfywsFilEk6YAYHlfOR/Q85Xse8F7mWgsSNsvTHR/hAOst/gKAhP5X/BF0zw41AwG0TtuhffAP+AKE4xNBjKm61nYCxNK3y/ggNLUf/2B3PHucOdik377UkbO1UkSDRbfffssHv8ei6paC/wLUSYSkpF+OZrbCBpxR+U0LU0f7uykQSICfQEQet0cFWR4RlXdD0tDR2BM3W2WnVqL3EQs95IcVz8CVoQF/EYW+189X/AAHQJT38LBb/+Ciog7+IQyUDdKKIWVJjID+FQqD/BRRr9/EIbWBIE6lirqOwGAPSLedM4VO0Y/DIqsHIQs58Ajkcf/g4bbfGZj/aQ22/k/7AG23z+PHS//4A22//Fjl3/lYW4/g4wsRzHtgP84C2/90NYiK01UAf0+AAAwmDeVx/gIeyUsxdcq1gtjl6BJtlzKptAoRTaoiRDTOpBwlYWnv3LAWsHBxzuDMVlnsETqnu9oAgTjVJ4GGGGyH4ZderMQ8ZA4rcDE0WWhlrcQF+ANykkeIvYo+FPznt6Iw3JULxEMVqKB9vE3Xf8BHXxRH2kzcl8gP1XDBuGfVhzcYlVB7tQtUjedUYaeYAqVqH1O1j+BYhOWNVYqcJ8SVd6fSdPEFzHDNeFfPnS/vGHMbYsjbdnBJv4qIwouu7mxznDPBSx/H8Pgb6a2iakNblD3zYhvjkkFiurXtpydB7S+yEnvFbk6pz255q0+Xl9EoKfYqwK9oRIPZrUomnkWkWy51MOJxy9Va1dZglkavAJd99repzDJH9YNkpFjwAG5HPsLyk/lEP+R3bjoslUZW3kiQqqM2AwSOqt4wT4ENe7Fgo5NZuqWUY5Z9TYgkz4F2MUouUh1rCxBKIcR9wBqf0kAZh74UAn5AW52JxmUyjb+mjuv1gykC3t7VmAhTVAo9ECQRlGnXB11OIP38C49+MFmJD8FyA8cogx2bJLANxct9qFB7RHD2/g7cTvKzWIZaSMgvM3igDi5l0lRF1mdSqKxDSej/A2vig8Gn/gIUxPQeDUB/4Y1KwYBel6E36AwH8qPF3U84INs2/RGqR2k7zSP5TG/gTEaU//CQ26N/jImMWxz/EBZ8lgMAThC/Ix5EpsCs39TXzCGdYqBUennsgP8EBOon8B7FzVL8OOazkws88qLZnmqDjvh4agxsOBcDFZ4BqYQH8ErzGV8is0oBXAOtSgLL1gYv+AjcRNgYUGA3QjNY00v7k1EfT/yvJa0/gFPOK86YR2W0v8BBeEZpNt2djUuvD/xQB4/Kvh+Lm5QGAu6ape+EMKetbWeSroRtjiLoXp7//BBNl/BAJv/L208Qql9uvUBSzvTfO8VcZu/T6Jqain/A+pb/MQEIp06C+fw8Z/IPzexn8kbBj19rnIPdbhSuhvrndQP7G1/54JAWwMuzEz+AXPXzo0G5xftju1hbUHw+uwVqFWxSw40dBIyVVT7HDgmak/Ecrv4gK8FtLGLvbUb9H5uQXAVdVZAvD+JQ9CHARswVnQQAj+Fgt/+BibL+IAEUDmQbfnXjHgP4VCoP4F3CP+IQIsCKdC0g+LZgMCQp8xQleAF/gBzFf/e3xmxpQAzeZ93++uCA57/KA2Kx/AeuDovZID0GxsGJ6R+mL1jCTYlw6sdw2tVFjMqbEUZEw+s9P1ntnWUE0ujePDpA648JfKQuhukvfwQKEagV/AYoQJfw2ILdvaNf4wJJW5b/JASSt/wHrbaqDCpEndQwci4TI9J0zK3/Gg9i7b1u4PmZTRHlWrKgBFYy8AWoRqnLOFSjgw3gPRRhIYL2NgXh/EoBTQ4C7IZQvv3nAYEctXabEA8sLtcpfa0yvtkrrj7eWc8DbV38RAIrXnZq6ZxkBgYm9HBkZz2urT4lISYr2+0BzN3kOfwQcZfwQCZ/y1HXJ6P3/46Ks7sL+9kX7hIluwqMZ8HMSphc3PoahQ9h76/89EibZfAqf8C94Wy/BcDpv0dv76/ejVuxjY3PnCsqVvlBOKIthsCESYhli68q++YexVyKQZ2U0VZYdeSwvoZRb/FIVANT6oos41/A3Lru5zjICcSZOKbJQoQRJS3rxbiI14AUhDkktOxbJZSFFWgeFMjmf40SUYO6IJ71LF7jxcF/fD/xSA1sm2aAfXz/w2FjDp/BQf1/LCHyrK9g//wN0zcOTA1xSmkiRbQKf0hsR8kLvp46S00suJ9Z/gIsroCvkJ6HiENWVz3cv+AIc7iHaYqJupbbwmp5eKX5f+KBKT0w5NaDVr+Fg7bw/gYg1/iABoUb91LTrAf4G7F+Q2c58LWsBN6DUB2aAu9r+s/wEQAJBTiK5KY8f3hHy/i2+KFVH2BjSz+D0kNE7XjUk7bo2QXk36fxiDQsA9wUFVsTcwlAqShhueAF5MERgUBfIKJfwCTpmR/Es6ZyfyFOmZ38e9QwGQf5ABfAAH/PAvzogfyPTpmB/pYlgp/nKdM/hjOoFzrJBZ25Wf8cD+devukP+MAGeJKdu9QmOfEpb+HLJDWDv8YELv8DGqybVHib/Dh0iDxIyPn8/n+ljsUSXUDKeLe8wrmO8rb7zhMzJ8dXmsEPUoIDAEE+xGEqUiZ7h8WYhYfpPXKSxvRHTedKWfuWZEyoJuJxmEE20bPJn3ybIBa1hHD1LdZfS/Mu3NOYIvGjycJte3SOquLketgPIS1a9bYx6aMiYByHKHn7uzatI8BD2IdyTvsqCKXi3dZrFFMPl/gIp60FYe32X0N9oeCDPMwmg0siBjVTF1EEX9pyBAFT4Irt5ZhbvSPG/84St+zbxXX3NvmWrAqGafY79/yANJlSABXtVU847utxJOLcLKRE3ywnGo6X39iTmSPsq0sKw6br7DRm+fM5eHyMeSFNpkAOfAqjBcUMZologpnSmL8s5MVLxVlw9VCGWnOQRTyxyFnoxVujpCtqhfBXNdfZlN4fHA5rMl8MJsuurBIr7qXPDX+PpICoqk3YyduqPwkLZWERtUT7viAbAVnhBRnVW8dVYGR45EHmrbhUAGXpY79YOyAXacEepKiLMfOTvaD+y/WcR2yvu0r7vFULSY59l2KN/AsEf0aupLmyixfol1xBkaH+Ah4hc0dZxdA3P35vnjb0Nd0cwdy4L/FREjh/goO2yAIvrIRK3JPc/gQAEgwSU/xcbImAeJBVqtl/78BWA3+DF4dy0/fcTW8973cEecsBCSQdN/27V/4+yUf4fsHH/HAnj+HeWUwtq9f9H8PYUrZ/D9iDTKatOHUU/KSnZ4+xxlNB3qFno/uyupOeaIhcR5YgLn+GBo/ivOoBJsOLSggtJAWwAtESuLomVBn/QwqKI7xsrS3WftfX3/HW3wTCCHmNCbg7SVmOg0TN5lwkD0rHT/PYM+G8FqyXNqLsP8D2SK6fw/bwuM6EgwrH6Eb/AFbNrWcHIXsdVnRkKi+zg70/LmQWQjy/Kf+GCkiXrUHdtQ3n1rLDUeaiGs/AdO+OMFf+PwZ8WYHGNkf8HETiv+XgTu3glr8LLOxtJdGw288IbNQhae+jHf+MS65/7ejUgrMAqMiNpKAsrlxO9eYYSj/BxVlF/BRaF/K+Xc24to4/4MCnB+ywEUp1EiWK/gCM8wuXBOnRz0z0Xwxl/IeNBajVyo9iOpxC9Vk0WjzFgMx1wcOUo8NLN8/eSvE/geKwR93/ARiGl/AYAJG/wy7ZDor8G1ijf8LAeH+CiHQ/+IQEUClZOQrjgZ/gYzl8Bi6JYDZrGiBOfT5J+JgpGevqCU6dTuukjWeDMcXJ78xH9weoBWRBHIXi6dtoRpgOLM9GJjf4pAsRi56uM/04D/GhSVD338B1LD7/xgVOz/1ILuLAX+UDDFD/GxJKz/Alv36GkrhWZVncPLy4yb0ANt9SQTWPteKyw/FaWlqtMa5rZU6bm7gk2gVB+aL8+Al6ICm0zHAZnfxSDYv6o22FmFQGAFLlt+WzB0aWfZWCQ6I+HXnPOyUN/g4qiZ/gkH4/lg6F0asS/8CEEgjWc2ZDzy3XloKw0sDXteKa4S7wd/4QFAbBN/mnTH0f9gCg2AAAf/+FBsP6Z0xwIbDP5/c75Us/gAhtJOFmMOqb7QpgALjTcsOtv2hthWr+DyAVw7N7m8SJIU5jJ4+w6WG6m2Or2uQ4RnYM5KYAuyfrSFZbOzi66oR0tnTru503pkTezpGCNph2Xpd/ceG3fNKWOWHyQO8dQQIBgAlPi7QHuQJetfQD4nhP18smDY2rr1EUDP3ZFRfI5lh4h+lm5rj3BcO19KrAmTzWHfsOhPZH/4B40qupS0kp+5fM461fOExH1kd3N5+VHCjTV5Hq2R0zQNCiQ/VfBTSf4fmxwZRkiVIIHWL0BOlWlX2CoEjbPs9QikLdu7rHWXQdmTwFFnTE8Ccmh4OK9XgaDWuolNlngKAX6gd9Eg0kiH6+V20l8wZcvpi8KWTngnQMRLyi6pj7cu/JYO+8ipKsloBHhFUFdU6NYAsIdZW6dIwvJTOCzg9ffYTP3UjAauUoVIUHMJ/qOW96LZ7yEi2yW5aS/lGfHvveyVsRA4DtYVZSv4NloJoGqFo6UPwvkflCSoHQHeUuhx5/nT0WTGpaoOrqQS5z/FgMaB/golYdkP8jErDoKQM080mH+ZiFYd/x8DBh/w+AP/5YHBSf4GApv44+ewUegtgKKQn/SwHyef6kB1EP9NErDvv8Xb9wu95igoEMhhYHC6fRglEGyey5NFf6xnpTxktTJjif4wIp+ASETmaKBP+ZEUejfteO+BQf/lYv4X/xgRT8MnF5jnxY6BCbq0dyRSluiSy9+4QfwNo9JDun0P8BgAlT/DEAX4l1sP9zd/CxFr/goq+Uz4gARIcBRFw7qoaU/wYCmL/bPSIif+e1wkdFNo/ESgYGHusXTaQRIusxL3ayYM0q8j3+OArkAQ/2gTxOPQDPPs2ZZ3ZUkqDdGz9pHv8xIUGNqba6VHyB2mr+9tmnnl/sCBbGP//FX3X/Fh6JWS4VD/CgVhSE03Jf/w/KmduZT9SNoPdv7UOL+mz5L5H9foiZ/Gwp2q/PdrIPyIbxTUY2GkOT40KxuNBn81aUskjapgkQsFhpTISW5BUhp1ebzxtKRv+SRotzQ6/4CCAMh6IahS/WvzvEuoqFe09Bb5DTka1I8xTdWGr/PyIvxIsxjk4K4CXOJuk0nv7VNub5ODuNdQWkCLGPhS+aBx1lseKQyB+LH8avweRoQh6OwSpPphsojsfX56ne4LzB60qazLzpb5moy6Dng7Avd1n3iyGJuP8IBSGeI/pxGegxNYJjyw2rWH2W+Bt7AryEJb9BofPUOo9+ISs1+V1bCb25/4CPAoXpCuhPYljebQsY2Y+lJHTKTIDJPLB7yU8WEo9IIiVFeGQRkmzjTVnlpVOcP5YECWjHtAzk7TOpUdBuPWYm9vZMylWwsVODAHaQvRPgUowuuYioEkvf0W6QHZdkmeMD08SYuTRGXC87+fvQb/ARMM42BB1nY+tm4v3rQGeTMJ6lLwYEk0uRAzoKU62zkkDJlP5HaPyKGhJeT1Tpf/AvpJ/gIpBH5qlQZmiZbjEoo+izUoCDdL7LZz0nxGrFPuFvZxfte5mLS9LFlnhHZ1/WGa0sj9qRawRrYC8P4lFQIcBgWKn2jtcgMBOq0TZ7EOKs92APjtghc/Pi0NVrX/BxZiL/BIsP/rILT30FcSAP8EFG4s8AYrUG9Fdztylni/cbTvdvq4hL/QS8Cu19HkGEzRNWC4XGAzVi6cKgzY1GIdBwo30hzBvUXGf4MKtldn/DQ/eHrLkFqdf7vjEQlB5hfXUjnmO5khzDBjXR/Z3kQGB/CYAOf4QJXH/8ID7w/+tA/+EFZ4P+FQIT5/gQDNF/8LArggnHH+LAqbkCxf9gB/PoKzwf+/Chjfg/10UMb/5aD+fH/HwkNt/CLrDmhP8ens809f56KrtQD+EPo7+Ewvv+PMOJBM0JEkIQPoFHwjCxqWWlf/m8d4I7AgyqYg6++NyZC9n1bL/HAnGQx4HfZdIcPRLLchAm7Oo36MDD9YJ87/xEFAy5+X8BwbUBgf4CL1Uwum77Pubm+oND2VJHfzwXSclT+KAETwPeIuV6r+FhQn/BRWah/EJSWBpC9QROh/gwRDXvgJH2GZd7VXiIS06olsa9zQodZb7JPxMS/KtA7xtOHzV1nYS65CQnAoeSwTxRzfKwgmyoAHE7+KRbtiwmf8wEOB6K/8CxW/wGuH000DysTdeDY7oYrNQlMJhYqWfFRL5v6JCW+ymsVGgfKM9Z76H/G2KAyld7iBIK4npg31/AKJ3hv8FAsX8tSnDGVfOAbP5ithg0M3XRHIeG/xN5wI3ZI7f/g4nMI/goZm/ldRedTJ/Of4GBmxBfaH4If7Jki55JL7ZiQaDF2am5a36+Cq0jMMXfQkGWVb+FAKr/BROAFJDun1v8BhS1r/DNCPEF4GBojN/AvtX8CkoeVSLJ10UaHWXUX+AikbHo0c2RsUB3AfNQH/kgTK9lEWw+PkQAi3NqnCC6uVkM89tcTiFCQFy/gkL9/h0BrhoGCIuAesIAwGVVxgInDGvRLJFlTejG49FGD/XnAvD+JQESHAXUYcMAuiiAwP4UCJv4D7Ww7+IwPZVbHxd/wEeNo/wMFJDFD+bfLiSV3H9EVBuSbx0i5eOaRhtfDqGj5PhSrmt0nfwAaSFNJkUI8z3d184eGOI8QvPVQDUYO/wOs2fwSB7/68B4bFK4DUiIkG0UDp+q3Xcaa2nbrtg2PjtggJ7hAI2siOgwTw/wEDMFf5yBCIa3+HUJEUM/4cX/s1lAMBdhZTEC2OD12S8opLQ6/3U8ilLBIZpGLaK6eg4r32HU/vOqTQMzmm3Utu2pmy50VN2fD1mqtd/E1yFEveyra7P8oFmFT8DjFm9ifapZtbjrBfVPXJnreY44njRvLxz2EHZu+8tFvOh6a+fFLGbSGwiW8FuQgPFvLkSnJ/xSEvic5u1PvI0BgJX3HbztO61+14j2DQj2Gx9hgGX8C2FRb/EQCKBmBt0AD7+EBgfwoEc/wQf4fw8BFggfcWDELBkB/jojITVCEAmeYlv4DABJ2DAhq2/zsRkJ/v+eCMHOCpFr/fRGQk+aARaxH5AheeWdGSQ2H1Ems4Cn8AB8f5QzzTDWO+6txAZdOrVH+/CMhP/HBVnA09vrP/w8Cvremz1b4+7IR2nMriGgphiFtgBpctTnF87Zjr6kvJs/tAv8fgv4mqCZ7YCXkke1q7SlBgSFW9IGb8TkPDPodbLtqAEkDXowPpxhR2F/gIdtwOUkMXwqO2ESUqOdYh68YUgWWrIp96Lrv/hPZpyWS+mAkNyJouuCA0IpHhBUdRfodtyibx1zBF6rOccnrP9XEZCf8RBa4J/oIjIT/g0G/ae32zAABAAT/VhGQJgn+rCMhME/1YRkJ/xmArgP8QBF/8agV4FYsLu2/wBOM3lQA7HfulbI5ZSTYNacE5JrzVT0ORalBlASqP+OCQPL+CDkupKweHJHfi/zALeNmgQgtYTknzj6nKZI7pACjWHZ0+MQ0P93RhIB3VXOxE+sbjh6+se2v4pGgSqPtqhLOqAwALzqY73gf4AEM5vKNIvALUgkKwTD5Iz47YIBtYEAm9eXIMD25D/OhMLA3/w8P4F3+PiYWAjCPDuHBr0iDzfgJ5DCqEDSGQqea8OY1XXjFojqd5CG0Bg8+5bzlUQDOHIe5eUKl5CSVWXb6gAII33GDdB2Sg9D/JBEY0V1JA5OkUCQLMAHNXSUbESXYyrkF/goYB5AoQCWIZl/gMAES6DAS+plP8TDsNn+jhmxX/HAqxUuf4YFWKgz/HAqxUgKhakMOJsdu6K4DhOLZUwlQKU0HFcrimxwNOu6uJic3876FWKrkf/h8GTH4OOt9VXdr/yMh3JzMBmnH/wfLc9wTgHakqITB1ncljHgBMZSSGGTf/ARh57t4imzFrSSipeEKJo8zxDfY731wvoM/817Mp76dfHy3oCXos5pnJZIUUsWrt1Pn+ehSOOa/P4FT+CRggTSiioxliXMbzf4AH0RimUBzeX/M+h2zNuzN3DgVZbTnp/8aF7SobsAcI15n7jP8BgAmhBLR5KX9vgkwgeVTFyqfE1X6EdjqjytmFDj+MuDM96IfQe9dhKYniEASQJKEhiGxjF2ISKmUod8v20FBpU+Vvoda4BE5X7cxM9cqhdtFgzO8B21/ACbAZK/f/FCV1CgbUETWHQGBHeDf9MirCQrCjIw2UF7LjkzBx+/+Awkyc/iEswDPlxnLJ2DoDAFGAllWjHr8ed0dawuAYX/gAT66VIDuz/BwqUl/BRpJ/K8ciYBfV/jAEhaAv4ErvAMnHiEAlbvI/4DABCyDAuHD/xYWsyAWJ/oQtZkoP48r0wk3SD+HgB8h/2sEElf5kBImqf/Xha1J/r4JOz/h8Dfir+XgN8h/0ARrE/44Lf3Pv//wmlP/pIS4y8f4FU1ydZJHfrs+0NN1jOvEbT5XM60S+ndT7pXZ+4DMpv67peENYFyD2tJ6PQXtSEwkYN3DUVM5Zn7+KUJYv0TCU7i3+FhRl/wUKuD/xEhuG4z0iQBl/wMn2lZfZ4gzk+hcjIWT+LAtUTiE9MEcXCCNbCIRSWUwHi9jIL9mmSKarjCU3QwXoE+EE/MB8hIZf8DelqeG5Zp/AYAIefwyikCPif7qhrQBgA+DfZ0+D6DuE3V68L7f+FWgyfqL/g4G4b/glQ0/lrLCC6Uyv8aDRm9//AdhPGf9YGjN/4/sAP8yC/6gJ/k44mgf81DRi47Yf5IJTNP4ET/y9IGhy3Xijb0bRqjarldyXbR7v2ePYq2uAzrEppLyDyNj/Ewj0b/g4mFkAlUQ9POAwAT9guz2/ygNOk/5eJ0UIktMcD+FAIT5C1H/BBfLMf4SJ0UAlSgP8WE6KAX/mAnRQ/jAUbiS0x/z8Toof56DEH8KQP9cFI7v8Pgb8Kemn8eKCACZnn+uidFD/8RTHDPwgFJ/xCFR/6AKY4TXrU1Kn1+WaeZlTDRNDWpKfvXDIPSXA/opvpFM4uZ4FiDrHF6xsb4zBDqv/gBXAdZ3MssCtFeLcihMvNKecQe03PceHP9KFjc3+YidbD+Lj/WCUfPDQhQHIBIb/DoL+URv8Mg64ehfweWzf5iOAOf8fEuu8+Zv/3CmOEI/k4L//j1i0iS0x//8Uxw//+KY4f4rQaPyMBL/+Hwt+Gakv/uxTHD/EQd//L+d3R/5GMJIjguOTkMzg3/xIUxv/wIAMnJqc/4cKY3jg5ODg4MTL/DBTG6zMCaHDExMrK0JbgntLecs50bGxgZMZobGJiarSWZmjOqT+JimN0zMjI5/zkI8Q4mBgBA4U6uwlynH6zpKB7mc1HPgTEh/g4sg7/gk6l/lo5eqnIj/hiGbfIQCNrE2gwTxYD+OoZuu/h5mqSf4fhmzkeHSDFwBOFXSLDitWxpy8d1AL01Y6yImzJSh3E2edwG+/0t2am9B6dtXhbtFI5ob0KEdQiOqO/4BaefG23R0tBu7/JRHmL/AciAZ2saYbHfuQJrCyt61/5Lq/mC/HvDE81C3hA5RG1zLrkJXHn99J57D/T6YuUYxfikvwDbZ3LfxSgpFtpkJ+n6/wsgpf4KJL4L+IUJgKvcybzwaC/x8XZIPP/Ui7JAi/34C9DV/8vK9YRf6CJKlf4DtsDfzQnLcfip+cgc7qYMypn3eNt6LgkIzJkbUZP3O3YUOQj/GhbaNzfwbbYET0zcwo26aTuF66RPZEee9us18cQwS2qBnomEBgfwmAOf4mIpgHOC/8gSKMX8JgeX++C20YAyID/XxFMA97TB0Uwo7l43ohsXWLJ3R44YbnuqwxwSSw/9zilUztZSDQCGdbAJLDziOJTsPbFKdEjCgW+jZjSbMh7j4A4/XbigQygcu5kFef3cAKfZBgHkYWrFWqurAgOyTLT6kk0eGOuomdfR7qcpdOL4EPGNTlGVw8cHB0O80uwSwtRhl1e6/4Zy/RBnCNZNiQQOtxDK7QyUOEBFaRm/Moyk5WLlcPS23sQotjNrES33idjQGYmTnzQ0S5ykBqN/gvEAAmQW6yPHfb3QKKk5jEiz4CMdSyDooPI1UxnzQe2k5a1tnqzY1G4plIS+tv+YrAteVWwNpdYiiyGhqzh0Mo14QAkxivi3lBu9X4uJYu7EZwwpCsGRf4CLYk1DKISInUp+HhVHEjqSn3cvhxrdCBO5HeSwBaHzK+Hg05MV+gL9mwEcb9Poe2kJkzjg0HyWUNguLhlICDUMt74zhwdTOkwtBKghRXF8ZSiDwNTPhVt4fZSh1ca8G42K5yoVGBjXz6GHq4jUxgUua85KbFShPOh3lPj4TN1A8fo4A9GfKiKUFySl65Unj4zOAWYV5xyiIKbG0sPCxtyDmem399rWkG+XUibhA7NCAYlf1LTg/EBvepDjHsxt3rByvWJAlh51dCCiHgFCrfbtpube036SgGdDDA1RhHpWGimmFZXq65F5EAeDfr9l6A/Koz1L0oOr9Oc32CMIRDs0hEHPBw/4Al3fKyl37FMZHL9xRR2twE/0ASKNfwKCI/+IiKYD/WhbaNHm+AE4BGCaU5wlXZ4biJXRQq5GSGKFj/AoEJAQifbOJz/AYAIgwU3G/4uPuOf47XvRSrNNXpZMTw/4aJ43b+CgIT/WQ0IU8zgT/C1b+2QQE+wgE8VbuwXtCfxOxgfwmAkfw7/gFpp/hAkYo+Dp/AWYHpuvDhFfU4hM1aWLevthKRbzh1sGiXk69zuqPk8NaJ00DyYOB/RVuND37plJmyOEVT+B2bX+CASv+GSWJ+vd0JzYD8hUH37/4lARYZ4iIydHOmAwN+kH8uHe93OLema0mbNuDt9Y7Z8P8HExzv8EgIv8tM/SRv5/8KmEH+CioBj+IQQPArJucNMI5YDAW1XzEgV8coXIlKEEjqwumiSYuIX+BQOMUf4Ix+K+C5df+J2sX/LhOsj/AyST/AeKAfQoRTPDxffQaipu/9I6Vdi4a/JgE6XrO36PWDyKPM9WcfwoC6/OCiYD3+IQgAMF/aVvsU/gMB7XJrXgr9c9Z2rsWOd2SctADvs/v4TIIDA4QCeRey/wgcfU/40MhFf4YdMiZK/ghtzUJp7yBKp0S56loVeHpuE/8BBzn2EJUTSO4jrAAfVEEBn80O0JEZVSoFMj5xV870Wz56JFssOsw//NIkAdhIDw3F4YDMoNcZ9BPOlKNshGZ96QXxiwA1f/xQGG5OAswIVpUBgYtpfsHT3+7WZh9c2xUNoklDybn+Dhdw/+IgoILurm250BgMDPdZxrD3QOG/aXtIh4rbQ540+tGf8IFcXwDf3+/rf6SFUfpP+BaetG1ytzvH2eu1BAOhnpUOEBQC2HM182sz7PU3PQvdi05N+/hQKf/icUod7BcsQU4gMACsvYmxDTKGFaVZ5YV+VAFi1jciH+KRbMuBEZ9OQL+EYSj/wUWHAfwSJRf4wLDgD1Pr7emrIDA/hOaR+LT/BCAH3sJczQYR8Tso5/JFAHNrhZybn/RlUOJUkFB9vTcOQBzPMyKRDNxCVC2S/4YL7Iv4Tm4v8ZBu3X+Cj4Qb/PQQEOAAF/BKEklMG7P5CoXwFgeE1uj9SUDjzSRpUCyyY0y0hWvcLW9jw+g/w4X6ue7A9AkIPeE695qx/FuM7/FAOAZsf/gIsYvVwOAwLqoL2mFxcG29GECSVLVSafP8apg84Hm/P5kG6TV7pMD+HKHcEhAO/GzX+AwAQ+/gOh3YSDDfoTfQsnYqvzbBbcICOJVWmnhwL/AQeS1V6jaaUHAkbDazBb3YtHiIpwnQaZ+OzT5/ui7RpvGeAWX9Q6BN38C4xanNreCIL9zS62tQNS8wznS1TwZciLwrcUvOKEmeU1hZltr9K0i+EAfpVEhE63kFirV0vvAQz/FJESmspBGUkEwGBDAZ9LEJUmxafFRvPujzO1LTb+h/8UkZ5ZpwkASEBAYH+OBn+0F/goEb/ykSrx9RGBXHoHpZeDg6XeRTMyCpWxsRdf46DIgh3QZbxUK/4AE2Zpum188UJaBPIBIEmKcSpOHfbaRhnrtF756IT+Bd4dRwjyNKi34I8+om3D9g4X3OAq5S1BgEJHHcf/ARVlPyAjARWD0erUubpizSc4wPXL2NKK4H8k4ff8EiOiN/AYjynfw0NH122LqP8NhWbHfzGWql/1Q6YBgStSy6nemuk2ltI5Tc9vxjM531zN/gOK9QlfG/2kCJSgTUPTo4l9fCZveAvFj2Q3+4vnkMNsDbUBWyWMywJIL0oH+wCLTsBAf//B5c3/FCLTuZ2f8LDImwJtnIkfw/mta4zBAqCVImtAbAiygzf00fQ/dmB8oovTmMkwWz7G9sxL3J0lTz2vg1ktpzMWVesgHdlonupdDx+vTBqe0BCzUb98+3ytzVBmF6IeSTKwyWucXqHyhy7D2I433FlQh/gbbowJwtfgnzTDHa6Xd/NVRH+fk5OLNcw/qB849+mw/kkvb53ZuvWCGRFm4VK/4wkpGsDTLHo+gYLAvETlYB86mC+VoSnRlC70X8AbQSokP4HAkMCqtD/A0wtEO0Ed4KmW9LKQST/D+h0nCD6FJhc6Su9ej4ZS5MdWWOV672CqzXPn0cq8xxmkhoNZfE6b9plsqKMdiJX4jTZ43tJIluCeniRrTzCWMbbnC6zFu5tbcBPCmSYLtrYBOQDfmjmzgm4jss1EXVZUWq4C/7MVlfvXT5ZKFD8rgoppZyXEURhqAc7S1gEzjA4UlVOMbhjEZbBogH5Ic14qDmPIqqvaSoP+k19m85nDuNF2U40OoJxld86hKGXHJaDJ2bYhkRYfTS4BpAviEYGFSNaf+Ci2kX4VUr/AQXrqKEH7swJaC82lhh8wTNNMUD5zScftdNyPwMFkQZPmSLQVTbE7YSecO6I6sPe8ipm4KiBeH8SirMOAo351furtwGBLwGE/dI7n0h+b3wYnzJasA/Xo5f/FICKI+DSIqAt/CwQqfwH62t/8RC8vAcMSxnfv+PBgPKW/v8wv/kmwGJ/4Bf65Z4CXD1SQpHvSfXoZcjj242h7QspKt0Lbx1/cEDFB41NUJb0SS/snuheKbfICV835TFI2y1/+KR7p+fLrQcP+Bv61dGq3X88SXOYGw2N1oCd05G0P/ARMW8K9Iuzbzq17CQdwDrfo34giVk7Gf4Gdbl6p5e/BBXn6pm/xSFCLfhJYTxXcBgJwZ3brw06viXT1gcmLAKpino0K3/gwHI5v4iCy2jvtr540YDA3QOkVyC2FbMGho2PabT5qCG5/TB/wcVos/wSB7/y1YCiusE/wL/2sMuUWEOD/D1dn/qqOfTFl2wsCqyrvK7r2ULDqO10bhxWhLDSAeAqHxCy2HMDCpIdCnonVE3X8UA0OOFStU03n+PBjcjC/vY+mWKJDxbXsgKqjDdO9c09J/2CMHf+PWeYzs1X+BeZJlVEC07YGOjlr2XXEmlS9hL3Y8taz4JV/HEIa5ODWUzC/NjEreAMxk/WsCR04k0M8mbQXhaB/ikWRJugPQeIZgMDkIHNMRiveHap/wEIy0MOCUnz8CTH39f4MMZ4f4hCGnn9O4AaBX+Gwwpy/goBF/ljUobcZNB4DA5KhSF+nOsSfR5C874Y7Ygg/309a/ggYW8/7/SRv9ICs2bl6/wLkOtCMKHT4S6S3idlp4kpWZwzPK+LyWywHF8GFNDFNhJ+uOJSr8IdBb03pJnKL8cLRI9Kj6xRoP8DJIyt6yWD/AYAJk/wyI4jceFFK8/wsKyf4KJ6iP4gARIcChvj5l14x/gbIcbHXzb6B1KwMsyjyuHoVG/F8pfiPdjxfaqK/PVtXAKJmxvVNU+Lc/gsCPFzPUbTHpefPr6LwALw/iUCwhwEICR8pkkggMAP6GvcMfxR+vpp4oYGhb0Gvwo/SP4pARRLLYe4IQeAwP4V8Jv8GCzI7Ovv/8BgAkRBXWn/LnhL/Ag/84bOHfwB0JXXBWsoOfNtTZp41656Iyw5+WO4knayBe4CMSQNU63iHZCNePj0hSKN0XKVFkZWW4AJ/A7WV/BIPR/LBy6K891oH8DfYrpjzqVBcEJmB4lhraYDr58/vcwKUjikOH3f4CLubOBoBfDZPJx5Zz/87SpsMri9NtWPt/juHysVv4G6XFevFi3+AwAQ1/hkHhEHxlgR7ugMAIOBGdyCojNUVj7egjOkpBfmTUQ0C7/4lARYbLAhn4B2OAwP4UBNH/BQ2D9/EICJplDj5nFLAYFGpiCES1/1PrM83y1oWT0KRkNMaX8EFhLt/f65z/pAVaddRv+BSwl4p9qWzUmf+y+r8Cdb0+qIjq0n60XYPj2u3zrKBhsj1rSmnRYh9YBZAWIQskMIS1T7drwfhU/ghjJHf/2FwOqcqlc+KmyZVzk7h4rVgU+/2Q4mt/8eRZMll/X8CsZL386Ys7A8KJ8lIBdhFK0otzqETE97mWHXNBANspmF8tg7hbFmIchw8hE7ACdQ2swDcj29Aqr/FIfeIqMhFygnAYHc+ohvaY4ltOwLGAmoM7UbQb0KW/8UB9+OZOnRt1UBgTSpvDe4FQvbmiAoKOi56Oo0oVGLfwP8l/wSAi/6yKuYnBrZgD/HRKI/Nv97NtKyJn8XzLC3MfnpRUED93WtKedNg/x5K04msA88CnnL+d7YZGrXy/G0bwIpiKhTTW4w8dwG4Ndwdi1DMiFeV6Px/CgQp/AvX1/EI0eCKxghdBffQH8Ct6sGUZ3oA4vUpGusf4CN2JRL1Yfquyxb192A8LQpt2r0W3/RKFANjlLH8fbazNmBf2eRajXBURJA8/4oCxcYXa6vCIIDAz16RdvgqsWns/olXgPjsnKenbeE/igBEypKi2SfWgMDPxgXxAOH8BlDGR/4IfnD58cB52OE76T0Y0MR/JSkOFk81cTl4ULeZJffrapNJKN/gMGu/2UXxAfEF/t4viACAT/6AHoPSCyVdKXe3uWpQ10Id8AL0VAKitKgBKy4/bafKMgvMhapyBlazDjcaplnHjpAv+AjfxFsjboRWgP4IskZwtWzqGYqO3/Pw/tRB+SMbzSbZzEPRjps9tbwW/lhvyHUUm5HLqNTUc5Iz7aV7L1+h+g9bMMQ3et1PY8OoNCgMHC8YDWuXBJjIZQO05/kgysW/xwQWzqsIBMzPyfwGAAnXBhHNSf52ILZ//+EFA/+wCC2d2tsoqbpe4vboF8OvA7GVwOLehWC52Tzki1lWKR0lGSWw/h4Ff6/59BX7GFmLcAe5knzUAizZuHmMbdWCVohQx+r3kiE71YdZqN5/B89/EFs5NWdMVT6YQovqWLw1VzY7a/voCJvqUGTX6wAXTH9rOPwfPllcsxVFPED/sGTlUeVWqpa9GvsFDut1rEsJ/dNfQI3x14XmD86k/QiJmaGV7t9YjbCpfAyppkFxiQUX/Ri24GVAhbLe08fmQKoMnarrJejT/ultp7C0Sm2Zycy24kKIjdP2IXQj/snZqySIvFc6X0oJb4DVl9hDBEfKN7KNZugZA5/TjjPke7UyW20j4ZvFnVGRDNx97E4RVIZ3TbOznI1ot+pZmL39vDn2GJbDYTHdE8bPhRzCn2QSi5JX4Yo6gSSUWrDUC3EwrjXIh73xyY+1gD/gJDHHY10OGhQc9GM3daKITTI48dVe2IkcVePX8HOG6cEg4+nbbYNguWXvM1Rf8BEgcZJydUD/AOZvLolp7EGd5flbCa7aj2uDHD/KTbQjgTQnIb6LbI3kJCONIWJg/4CGQYEUMcbE/wEQ84LnJiGcpH6wTIiF1QSjSAkzsSLhrhL0IO4GnA3baqQndlqQs7NdrmuwtMO8zmwWG1US12QH1MLPGUSUT9fG6Kk3gRMZP3e386m5w2vd7qeBqCy1tHsLjLtWZOMD0WX2LSmIgIGYwe9uewIxKkcdzAJWUiwckmfV2JN+nooK4ezLbDKAs7JNkg0RcYghIEGMUDHB7NmaBR+uvXKpQTQaWLpBjwl7L3wM475s4aoPiRguSaQF0NLySDmvafdO1CntV9oMDR+NlV7l34wpPssW/moRVdaJ8/MsTnLOlMQKa04YK/+Le+MEx1iLT+ItTUVWZ+PamLUfwAOUUhmpCYcAAxKraroYeyPfwAOblCilFKzG4gArms2MCP+Ai++OZ1IlkY1/f1UygnEEZlHxea+5vWW4j5vvx8XmlyoCDsMl8T96942yUF2S4wRuu1vdGz7CTQf5BIUYtH6F7UCxwnUfcN8/TEss9gAt17HuC8MgubnhF2EwGT0OkLfkYTqHHXzvoa2wPZ4jqXOsU6eLUMvz1sc6cFmVSMgnXxFcDskWZ1flhqzNXbOzyn0CAVZfK5WnvdcFADkRt74SY2cSh6y4WuiNXduSmGE/wEKBl24dLMl5OO5mop3bsw18bttMrpQhSL/E9YCthCDqbHCy1fa1A4hhR5Gvveowzc5kQsOiw1QdRAKeaqWDUhLKKVLPzzcaE2jdaCRKQfAG82ZmyLLgF8lSJbJMBLqtF6YDfCMMnaoSf/gIlAQbImCdnm/Fe0ovT6CUONazp+BpjFO42TDubJnGA26JsYICMs9LUQd4fBLbFBUcoyVMetKHKDva+vF8lSnfaAuWs6cSXkyzgRGNTqmrBckvmLsWRYeCtmQ1rPjNH7J3HEdbiIR0tWyOYVt0jFNA/aCjm0RrzYrS6+eu+mySMh155IYvMz0418JPou5/JIL8A+QssefI8RrLkp/gAjwBQ4gayGZoHmLdhpW9vhFgZHUEUgjITlHXrqh/BRoNxgMPQdDuNEmfCgfGCbbrzIPdbxrkoWjKbre8aZqJVf2pClZyuXcHRYKv0UJRbyWZfce0/gCfzlQ38IIzttkVcN3pgo8MlpgI4MxtCG3KsrGfnSpZaGgMoAP/ARNAZnhFxe2F3aZ7zpkiu0IESBXQvunNvRR210HDAXdYLBxZ+QAgCcf3DH/iObfGC0qDvsAHBumU1zgPMs1HG98Wt4BV23RCheDda/CajuMjCN/gIh2N+SMVQWuAfYhYlfatFCmaGsuSCca8a++teZdGMlFEgXZbPyfloekTV8KKGK86h+E3jk0hgwJxgInNrgcdAaBSufSReheUD5bJcXb0VRqRtRdMXJkhs+NLZOHqQtjg4hDri/WcjKMeibjIVof8BD4MMYNaeVUYemu72WpKz5s36jp1gQj7nZPn74nHA0FPerF14XSyLUXa1eTCakUT6zcOXoGZsUwyFLHnJFjhKRtlb/wDgOfwLYBQ+/wzTg0XBQ4CZPLOlv7pau0fdbxL95SzYCz602akO7RpZ2kMbLrKCD4kjT7V3CSoz0D54f4EP7Fv4FYYv4JMv/5abY1F9IvAnIBPCdrjgcD7YYi1AKaiIRt0w73/BAfrmIfwzmXP/TiCyLSClA8W0M4w8d1GU6VydjsX5klp0a6bVj5G3+ADAz79Trj/gYy5/2IYTv/EF/t4wnf64r/4BhO/+4UTavKlx5zCcIFURaixTfyLOB3OXg8mfpFDy+QhBo0mhKVqFb5eUHJvbFQXDeX7yV1iUoXuuVO/4CEW5fG0My9B6v/PxA17ZubsDiUm05ty9zS5gMXD4ocwuWkf66e3c8wFQksK7JE14mLU2uCEnN12X9F1P8lVR/R7Cqo9HvcuiPuHHgSEJd/H5ryImZi3JEbscpO5wN8t2Emc58p8rqRJdPw+QQUvcakC2sEDlUyZl5f6vRNukbiGWyYAOZDaD8FdXFlP6QIz6CHor/H4D+dBKA2Iqr6ntactJ9TF7SnrCxXCimVfscg4rHQNwmU90niw3cF3y0s58jGOj4/bjvqEoNBvr9Q+F9iSefLWTNHV/kozkQ/gPy2i0Mu2f8BEuA/+wM4lNovL+WLtD3i2SlyBUoxQ6/wmrW3kB8n0rs2tAXDI19gtu17c76twxIsot5/BLIUefwGkJon8Mtdkx+dIi5ZhAYDFKZ3eBLsynVu4p2yOk325c3356fxSAik35o8H9G/+MCHqbI/wgUq+GnBBrt/ubaZnh/5jScB4EZr0DLHwybsTXy3Uf+Ehca34j2tHINfY9h0cLoT/AQKtcKhNzicGIOfh8kk2Y2Q3Zd7Zqih1/DIyG5/wUBXfyxUPT47Q5oAfwLY0SToTnWv4nlFo9ind3zVqct8T23GrM3L6TOTAdGEZ1qsZRWcrVMzlStlRb36SXBAONTr29j1gXf/BKCsQzsTyn+AgbBZxgMBHCSEY4p/cMauV2cJ0vYdYjQXk2j+Bouf+IQZ0EnYAbhdD7gMD+FB3n/BRkYP/EIPGAWYSmqtRZ/+DAZSmYkqKO1HQVYQfIQwd/ynO+7XEyTHFdeVppJgK88TESeNeNDS3S/8BEgVNtHySpiqV0BTmp8wAlA1KHNfxQB7LbOEqKPeQGB7dwyMI4oNgCuqPg5V8Q0Zdbjy1N/CIPN/rYmFha+b+BZTqcopIvgS+A6MsCL+EQKhlUunNhoFSb+9mUiCVvPRHxWSLXWLYPcKW5RSUb4nTNgSc4F/EvAm/ga4JX6e7EP4DABBj+GXnKWXCHyb8j/gZYAoLOUUAlYU8zIMmA3EleuMC1Pyy5VhEqO2KwTrH0pZcfm+gfvJ3WDlw/I0Y8Bf34vLJ5pAS5Au/+JQGuEuv2STQZvf4bBW/4LCDf5Yh6W60uC+AwKeNRRp4CazYASaE8z84QVEQqbEuPhrOf9bD9sZas7b+r0a+Zf4KNbdD+EwID5A5A6/gUCBB9Gu/xYP0WAf/IB+iz+EwUr+HwQC/xodisItbFcUH/rosFOAg7+E//vwAAHaAAGe/0gP0Rg6DJ4To2B52KJMZbZzitguNpkt0um729HISTElOKn8CgMqCeguzbH8clHf8SAndB/FYR3/CCnb6SAz/pYRdI88zEdCH+xh+y3/CQ/ZZ/AepbSclsGQs3UMG+q0cbeZGgJGWynWv1UYib3pbTKtf+fmnzz8Gm6VftMYJbuwvcBuJgPshk/tUfwNUG/xELB8use7vjH/40LKhsD+A9zRN/4KDD6P+ZB1QAB/yQaXaeElEinWzBsXVH4jX4Y1aXK5vh5xfE1BCKNCAUcFqUGDfHrKIZyGbD7H6fXIiLNCnpVtNlCCuwWwFw4tgLmlYF/C4TdYMNu9WSy5oJh4r8oZkPQAXMYO42AB5tEAAAAEbZqlSSM99AXUvhwRPrah38ANWAm1XX1gIeN/wVJtBQIqZCN9fwQ+DMyp8joKclH+EC3NN+KfwqBJhPnoCi/jbk2RYlb+G2jG5QraH/wEC+CoH4wNugo+53ZX+AhHnljBcnk80R4FzIIKKwduToqUQLv/iRCHobpdKXd/wEEJNfwsRJfwL2UfxCPzgS8CTQQq/4MDDD34q5vgadLd/wEYikll3GQZ1gObX2xgu2eg0Q0OEqc5mfhr97Br1Y1OvDN9YlBmpaSYBiO9DLUM+1YF8gf8Byy+z/wRO858GBD2N/iworHA8T/rxRWP/CYHl/vYEWsDMjlcf/Xg11eI95yEQt/HMOQLszX3Kb7Nwk5xEaXuehc9v24zh1TT3UWCQJ5OwfmEpOTpXl/APjw8naN68oVYPj88TujLZwbdxrl0Wm9qJj1m2Zxr+URXGJgd2x002RXKMSrW8xHKfowAyuEvQckacEYIzCTMd8JGDwpWg7z2XlUEzAOh+woIF/tJHwSw/C8fnF/GCZQ9VcM0xz7p8dIzGBS3+3OyjLuPtYHrbI0r7WVhJuoW+qz9hw6+33Y6Jpw+KZI3Hmndq0RPR5oak9OJrVbCDATLR7y/TOFMBUy9A/o7vMrzZ+YLiD2spMeGBvXwo2AR9Ahwed26Pwi11oPd/QeNjbUoq6AClm6lOWD8zlLdu/4CMTLVkVPNyIDLTZymEqjC7QnnQTI8CuZjY8dIVlHRbxp/n8k3tkbqx9+72VQdibMDVgrgQXCYS0hcy5AdqUeUEQnlsHikDYBNbL5F9zYN2hmxNJPjJZiaGbjxfYsptaWXerAisaSuN0USWAkAJMLwec2NbwZVRB4cJHfByR/gGkifph0ztpMnjYeUc5Yl0/IX1vnudym7nsz5/63GSP0r8FhL4WndCV6lWYrpuAAnW4VtbzWSCNBVEZwr8ncSnV1xnohxFOIrSFBc6d9GEvH0XCIC7V2teZCnIjHSxhP4rBBkQ5IWwBGzGKFRnfUwjOFDHig32B8dj/CUZ4uTrTYu9GVcFUco6Ao7LOugsg1OsG/L8zmlADjwy2C9rJpS81+/4FwGq9y1E00B/hdULPS1+EUUsS6q8JfbL6ol3Kyl99J84OKSTib9OjGWnA4Xc4+JMUX6l23nTSU3+Brf/+IRN5zA5g1V0qAwCn5/Ey66+tvqVbYQXMVQceE6VzK/4H48lrD/mjjyewE5d32jFEQUgAV0h1hm0jwytWSFNWgg103amum/CUcnqj/YhMwXAP9/8eX/FA2LFCrv8KFQ6YZHuL3/D/AkKCG1F0f8ASru4DqNwnQL5Xlo4UbaSwHm2Izu4pj/IjbWPJlDmPFoBZ0dWfn4ZwJ0mb8igah9RP1vWhRQyE7CN/5InXjKRDghyEYflyb1APArtCkd8AL6lnhv5176RykFnjfVLu9RLxLTQzBf+AkGRaO7DjR4aEUo1Zf9EpsjDbJ5mGfInMhJE8olqAzAn93KMhXRUl8tWK90SvmG5iqVmhm1KdjvelYFRELTX7eZ9qT+ZGj8nO/pUtXIGF9TtQD+c9cRUdI2Lz4fA3x4bSA7f7+brWIo/3K6CICim+0ChuFns5fwgntsPLkrS1IXFv27ltMxGQNtRz/lzduVTcd6Vzm0TaqXnlsWpbd4hHp6wsGcmVvo+OYbPdSCQnkzxskzAX5msQRv1j9V9FLkIP/gDa0K5HCGrM45op+fihNkbGbVA4eBSLvKIxnFjojXb3DKnujBaGQrWLS5b3/wEfXdZLeh1R3dxiMcmBb318B8yWecDRZT/ZFprsLU9CMBdEuYnK/eOce2linPN9tQpXPxELeH8C/01nsWwdLulX39eX0RMs4U5CJsF5s/f+6Mnl1ZJc18sVBVsgGw3+WBDbv9uIPf6WuxEYYBMN/b/g403t/gkTV/lj4HtGryOADA/hQmH/wQPYxtHzBE7idwYCPc/+TjFGn/GBrif/jAs5ygbPA/hNU6/wgQDwv+0ChG7+BN923Mdqg9vJ3rk73ppbkbVFYqNlhME88OhZgELunPybnbBODt/Bb5TsUqmBr4dRZK4sCLf37P4B+Ehgc8ug/gMAEqP8ZDRIi3MmiVSwGB/CgP3/Ae3W4fMRAInFw4PseycB/CoN38hIh/ghB+K/gkG8BT71/xcdArvJ/+AF7n6YGz/hwAP/1IC9zsFm0pHGqnDynmXvP0vTrcOZaIcBNoJ0+OtO7XS/ZcsIt3T26fcLMfSOehVXYwBvmEQoylvgxYV3ior7B6ByN4Pf6WL3yf4xMg/8uCTmUu6n+GiYOX1LpX+HQX9/Of4WMnf9mAvc/6w/+WBe5/8ggLrOH/ycF//x6cR9MDZ/+WBe5/8iAwM/++AXuf+H0ED/zIQSkf7YBe5/4wRQ/4fC3+lDcf+8Avc/8Qh3/8vmsbJPf+HC97QxNS4zMTQ2MP8THtG38BgDRibGJl/hwvexDE1MzAyNDU2/w0C9rGZscMxwaGbKctCacuTg6nTMYGObmZgaGjwZJyacOBqk/4oBe1jM3N33/Ehe9T8Xuc5JkCEb6OzPxuXE3zl3gJn7aCS1/dSn6f0IX6goiQ6jw9GgKTCuO2eycj7kpXCKuva5uVH8D/D38Ek7H+tgbS+2g/OF4GTJ/giBkJIhz7ivn/LIBeOj7lwS09nLh5bmGTdlYyK+p4D+CVi/Atx6UA2BDJrCzOxbV8Bgcs/suggp4wzxkKeTC7UY/XLI6jpqyNqK5+z2icygdR98bHb2wT0C8POJSsSHATaM6ENfp4DACgmIuCVJjSPPXz3dSUMcVbn3kWX+QQwgemEAoRjqYMDMz//FRyqECTyvrGqAAXIIAG+DCi1/rwm+E/x8QemIn+XkQzmYmfYuzxa3C/8BGCpMM7IEYE2I/JGEMf5yNeHAL0sxh0/h9WGkv49/zQJEo7i/sr+GZhGbYyMczAD/SwOlz/twNJ43wI80tT/XQaTwE6PzpYNJ4/hMM5/isAf/zIGk8f46FOb+4Frf5XTlv8gH/AN/Ala3iCGbgR55gaEW+pZmB+0OCh+XdR1Rk1Gh4/wB4qx8ltKM1iip/DIs+5/wUe8fyv1NiIWfhYQAf4sO42HB/wQHS9l2H/BKJjBSg7/j47jZ/jWkjW85zamxo/gWyOMj9uwTfWQRZNEMEcaxh1GfbmeB3bYm13/BtDpEPfg28rX0XVJ1hUHfnlEsYrmji++s+Mg69/4pF5Sy9UUhfF4AwP4UHGP4FlWv4hARQKZcC2uzy7+BnBbMTfPbIgL2tGIueSJfp72OT3B/wEg64NRUfdkUasDS1fLANBX+OAndkTv9pGTJnl/sIyZMAAH//xkyZ/Oa8GDOlyLlaWk/w4OyKyzVn/AA0qFM4svsOChHQgrjCxwD+mYpVAIntnOHJsKtJBZon+AeB63KKTDCT0GuyLKu7xrQuuMJhuW6HNalwWFVBVzqo2LpUD8juk60rco/nJmqoHIIIrVyjggWzfDjfI9SnlARNvKPdKD2cp9dxdOi34eLSkmp1U5SfdT64PGNfoWalEymD1bOMFJZiJ0iRsgImf8BEUO+8fXmLiSegvecZmh2FG1zJsVONPy2d4MCJ+pGKz4UzfwDR/0c9+hwogXv4xX7WWnHuSlMoGP8PsYagxjM+xE58x03SSk51OI4EZrYiP9mkxiRnDW0riQ2tMax2tos/u9hfRY1klkdIC1m4mDVt5a86qm8Td/uPdF5fI4Zn81mbVaFS1ycjVfzX8AeAfqHU41T1ZxlELCCAD3f7nZ1uz3tLAoJLu6SX6BECJ30SnzLCdIZLoAOEusFjNteT8nGFTDTXJBOUkSQULnyQoeUSES0DNJRleRfYTlG7JHBsNFuNdQJtx6YDS8iAaiFqOcgQEMvjjz+VMXzasan4IhIU/wL+d6GaOhvQDH50jlgq0ml1ha9t1GFy1irsLJitizQxwNL1NCPgM1jUUACsxFx9M0FvFpXo7KPcn8DOXDA55dj/Aaxjc/wxUVRln9ju0bv8DQqOnX/wDmZVH7hH4oKZuawhNonjfh0I3DX1dP64H7mY6tB/lA6XrvK7Fby7C7CoAdGVDAiQpVDMqf4OMCQ/5hHHw2H0kzi/gY1q159e/wET4nySBScHH+xXhH9AW709VRCaKD3cYV8aW8RTCF97/gJGg45bJiXIjGrqpRSlPLtOD2py9PP4JGl/4dBFYcBEFvzc7pMgMD+EQDw8P4HNr/4eARQJEpeewzq3+Bq/Lfri3niEBm+bpTzTWu4Y1hyzQNcd15jQF5Q8LFwvISTZE0OifyzShwnR9NQuQC8eucil+q2F/ikCxebMIOTrcgMD+Ezbz+cmOb+AO6G4wmcVrYABM4ZD7US3EHV+4VtgSej/IhKnjWQWM4yfSTGdY9ERTqrIF8PNwBIJHovFJSXqvp/wcadBfwSFdH+sgqT2LStT+Brobn3gHQoWyT2hdLo+KrUke9MolOG40QxeVLruHyF7F0+ZvxECHncwns8lzkiRrGeFuM3mhTQu/ikJObHcgdEO7gMD+FAPD/BRptx/EIUoB4FaYmqsSP+PCtCo2//YZPpId97W3BfPU34NcnAVtIfCYbWNq5/E5I86OTt/wLdd9qkCMtXGF3YP9czxT5XBq9OSXcm2+TS7YjHoZVR6SU+49D56i9n566Eh0iGW0y2zst0GmB4fxEiNRQQyHvjBv8DTafgfZxPn5aOmfriI9FgcNUKoBwLkXB9cvBX3gGDac/s0k/noPhtRILrwGG2gpmDt1uYVZSjYL/g41TB/mIMy/wYWwh/wHmsegnm0zhZIDtKMcD3ZmDb34MKB9/AGIayXX+Ajg1sEUneq8icLB/QHhyaG10pc9BfOkqjDchsdd+baCP+BsxJpOIEA/gMAECP8aGy/6TRIhTGkBgfwoB4f4KMF+/4hARQV6iyzV45P+FxRX/BhUDysWEmfwGACKsFqGX8wiin0qCOSARWaxw7KBtWNaQaQseFYK/gyFG0K5jYPAZau495/AC4WxIOz7bkIi9fTQxhn/PofKPaHOf8UgmxKMt0A52f+DAlfP+AMp1Pkua2wbJ9G/hH9dtRx0gW1EEO9nRFG/MVmXzp61gRpF7mAPSyYrONdoRgg2TNgJ4xCoYM3/BxtNv/BINH/LWRADLHZgfyJjoUYC+P8EFHdgVWP/+bChHgV9GVR9dc2ybiGHB0TWDikYP3pv4VoiAFbNWDo3YeNo/wAwF1H5iXR7X8OPlHsfRsU/fobVGNCP4IHO1d3qf4DABLL+BAIX/JhzTd/HAyd/IFjIq7LBDHJ+r7c9gfrWhDw9D7yooTKZAp0KZ9EOF7Yptgq67mdw7Yu0rzn7aBFqsWDkb/ANa9/A2zg1DHwz/8BgAlP/DQRMFLYwNTv8C5mHyDE/eID+O5fMqQbRIj6UIw/yAt5LNgQYjcEL6KJCzrPUCfwyETPP8FANn+vAOPUKpWMcb1Y14hUt3RQgw4hypE1/AUC+QQ3/wQEwLfwCQirGJlcBBgSxY/4qRutDVjvYqdQQu5BAT/Bx6f3/nSEIaQut9zSWbE+pXgykSnnlZjscFTQ/lvHFoibrxa9vJ8F4v9rCjEf+QDeN7+EgTn/GQMjYuTiJDYAB/nYjJi85CO1/8/IB7H+QDiu3/PgqCh/DRH0DkKMLQIX/LAM//gkOVv/5MB3zVTg/YUIP8+G8GID/n4vYpAAD/XRPHGAYB/jgfOQHv5pUBfv/YQWO//nZDu4/v9QF/jkJp/xUQ4MGyhnn+EgzYEQURIfQP5/UBTQCwOW5jlg+C+hab7xGzC+QtIWADpRc3JrMCwv2URAfEOkL/NUoM0dKCt0+BR8OEOjrs8ykFWB0Qyd3IpUE/yI8lXLdO1cPMinJOrfgOhobhBtlRtM2PnVR9NcaHr8oPa00o0jUxURkVnxjDlrrJTAv1tYh4WmYk4NrDc2soCGmBOv9V6ZC4wh6EeGk8HXdK4O/HxFPFhOeTwwQ5nvAnDAFjhMM8rnn5z7s3q1vuEYBR7Wkjojmvd+NVqhf8BGidM/FxEx/D9mSx1g1LSjfjRnFMmJ9C6OTOeqwiklNZHKFTuFjq5BBpQCtsqCzGgk2K6UkFot+hemqY8XJYDl4p4LZa0bAgABH8O0MxynKTSUUf+BSgGT63tRI6wQNtSuLvaZdjoXtfgpyaQ8O+ucqmwWQkXuC7WQ7jpErqL1Qy1COTFPX4o23t6oWk6B3r7yOPi46rrW7nsZN3VlrNSIHR9Q1kff4hivKAL6+U2wODwcsLjoYLXrjiZx8TcG2vOb8ZLqSLUoch0fGX+AjPyL+FjBT/gQSpWsT+JcFNk/kLBTAo2zqtZFY2BSfhV8KzHZQC/rn5HffqI+YXJ0XXMY399bP4+wU/8iD8LX+eDJomg87/wU/43oJP8WFLQSY//wVeyjKk3F3+WCIaKR2bJeavyCZQFVFIjt/iASpgjg628lMOJ1hPnH9Q8huymK8NGSSORV4YMstELpGBTYMcL8FczVfmq11hZSARSudu+lxnuYCGvWnBGhhUt6Qq+G5tGhAGadRkMMF84+sDjA0X7SGHyKJHwaxHd7uc4DeRtdstym/EHYO9fSm1N85loOqGmcaGbjc3gKBMXZia16rVM9aJ3wOLS2O1NO6XeoPhgRNqSQGic0fZqB7Siv4UNaf7QkGdOb5XEqAeU2xEvTVXDAVKiQHha/wEL3LE7FNUC7/D9oIDMP34dvFZd7ZYqbZdGosNgVAGBRu7mxiSapdx7wBuIEWoZdQKf052qPwgwKDWfsEO9NCXzxnFl8G5dVnEMhYHcZKpYMxEbDvJZgtjcjlTg2WTUB25YyrD4d1QVrKyDBBMkNO6Jz+6/DsWziRWUnyNtSue0tGWSqfLANOksbKHQbIeNrCmj8jFtxdRWJS74gouxEctacO02HhF49Ik4Q4WQtClDDI1FHZ0Am9du0q6xNAnPjAJpAzQfgu1d0ne/hoyUo/wJq0fITM6v3bpSnCPJGQRK0VX8AHSiJNel/u1bjJ5uje7E6MpvAjKbppmsoI1ej/4CDANaXRqvQfvThztHxJ/gbpH/iA65hjGZ8UOwkOAwP4UQuf8FG6kn8RHf35llT8Amn8C9I/yH7w/yqZHBSUYWvLkHTXwoZxdMS++CQvdULxakQ7uECXbP+Ofa4t8lCQv3sPr9/bPuCuh05afxke1fwGfrCZe9PgzPWgPDfY6XzOAX6+L95MpwDq4LKiw+7bbsDj1SQD8rVtWYNuEq6O4SZL/NBAii0aAv8DriX8Egjv8toyIJa/wKDji6UerHCWiw2jh2nYfIQT5ogMkPbg7sX62NKrz/orONkTiD9O39dJV+molO92Z8bk+Wu54cRIF3/xKCLQ2KNzecz6HAYDWCYbPitpP41IRtkjPCSpeOSvQm/wSxXf4MGu44MAUsJ/lNiu/wQFsjzWUAfwKi5DciT+Sa0nO72BBueAaCYIL8lzPEbuGc4OAr7Atn2EE3QX+MjitHB/gPGWuP//AYCT/kwMBJA2nt+R7L58QafEbEfi5OQkE8PDi6IxU5A796TkIQaFzdgvv7Zn3sNdt2Db81aaPGdm6lM6tX5IgRL+CQ3Ox/gMQp/hpu+kwDW8y89gMD+GQ014/goBG/14DwNv+LBDUw6/wUIamfwKEdfwKEfgYod/4oENTIdi1iQgabAAP8SCGqf+PifIv/MgrQH/nwQ1T+f8ZF2t38DAYx/i4lI6oOdSWpHlxqf/JBaER8kkVFnjiHmlY/ktQwtAJCKREiO/bnzYQrJNtATUmqhuu2H8KCpXh/A6lp/DwSEAjpuNaf6rP8Cu3fyUSyu+W9WNAFWpJsCdTmxzs4ahowtW3UMOjAvkBprRBHYKtUemNT1XSI60cW3ytB41Iex2FL/xSFXCuoEcxWewGBx7YnVb0vkhrftN3rhP5oqXiX43+Ciaj1X/gh/Jv/8ZA88mAFARofsH8DAWJY3SxLhergCBx7f4CKpa2gRS1Abc7Ki0sTHPUkof9Cdzittdv8YFMvgCwjnfE9T/vwLv3f/4ZT2YTJ3IvN/h26QD0JH/z0Uy+AN/QYIH/jQub0jYcn3nCf/x8FLjfxICB/wILHGHMwIiO6JiTReFsPhNxm+TLoFX5EbinwaAJMdtrb+gZerRIOiSSiWVS2jog+62wArVsEJWCv+DA9QUD5oRqoMer/gDABLODASJw/idmhC2DB7m0Bw9VwTurmwSVSx6Y77v4Rqh4gtqYMq2QMtAAAwKLuWsxz7nL8N5ghZuzhdKDbx+F8wLv/iUNnhv8BGljTUzFRrAYBjjAIjSu0jLLIfVGEM3p+BW2I26f8D3wv8xDCv+DCJX3+A2eU0plW5PGZAZWSytwMpFVQzjzJJqdHEcmsKMv/ARM8KdKAiJjP8KAVX+AouZ+/iIRXnT7Mu52n/gVwM+TfQyGVIms8WTCZFHLR0tSxiSm3nWZj20cdcIPXmJQ+rQV5ddd/enm34tmZFsmg5LCiGkoO/4KLl/4eBW+6FwNINiIDA/NMAqv8BEA5u2GLNmjR2jIxiifhajVoF8hnFBc4RA2v8EF79bBh+hJygnLNokI9m8q8LESQZTx9+TYFpKtkBchlrPydat2PrQozYczmR05ZoGUPvKz2Of4CN+rkNaP/rlmsfVDmCg/wcdCvIerr6//gxCBKDHU4A4BhDHWDOYEBYECBkFJhAC6RYEAP/8EHnxwCwUA2BizBAKX+CDz6pf/wciDkACf/AgBzwKj/A4BFAi0CKMIAy/wIAq/4OQIlIBRYQvXYQCeXzlH4GBYFIBUoGBYJ/+AwAVhATz+BQAZmAVJhAZ4BUmEL0df/wYdEufwYBO/4IOmFjRXX/8FH5sbU/DPx/gw+ZQ+CH6kAUYQBNV1v+AwChhAEthAEY2YARhCk9WW2EAnVZbABbYQB0f/4fUIf8FH5yf+KDuBD/EB+dL/EACZ/io7g8YFgQAH+FQGEWFGEBJv8EHWXsFq67V/kBIPx/wgeGmQCosIDAAKjAKLAQSEiAqLCAsMjIfxABOSDCA5KstsIDHP8QAY0EwMACosIFMqy2ofwMA0qLCA4f8QgMK//AgDD84KQVQGEB1YODg19hAq1WW2EClFdhAeaDg19gAf4GAEA5/gQD1UnSv5/4dA9f4UBNoQWYASCAVJgJAH/CiQnwBCN/gpGXi/gcBw/kQBfYQIw/wcjMYMDCBU38BgaAGaq0/BAW8gKL//ihHgB/gsETgf4WAxIYW/wQgOqwsgpSLAQEGwIkDARa14ySgwAK+oikGLMIGUq8HBiosKsIGOq56L8EfHsD+ra/DnYieiotBi0koLbAgKSh/gxGGiW2BggUHEBVhA3r+BAVTPR5GB/h0JKR/4GBPoEBg5BSYET+BQDlX2EDhoWEhmEEZlZbkFCAgBVhA7RXUF89EYD+BQAiFP4VCwTsRWxVhA8lXYQPB/wQhMSyKgoMIFTf4IPwLQQYf4EB5JmWsxX+MgeSG/g0GEPRVhBCxXYQQn/wQjA8MICL38Cgv2te0Tr/FIZv/g5A0sX19fg1FgIIUBhoha8f4CAUVAUT2BUj1fYCCDAT49YCAB/wQfiFyH8CgDH8DAwM/ho7+5/hMMC/wchRQL/7/AYBuwQECA/4UQNT4KCEBcRWEE71dhBO9hBJ9WW/4IAYGBg/h8AoUWEBwP4gAI2D+CIAiH4IBf/8eHD/nALfwkAjAtMrsILTfwOBbf4IQE1oANf4UE1v4EGCQXR/gQXB/gYAnY/4PAF1//BiB1cC/HzAoyMa/4YQOdgYSV2EGEv4EAsmEGQ2Ag/iACwYB+EARGAWEFX1ZbgYFShGAgg4YBAf4DASFf+BQF7+Awez+AwcYN1+RgQFgIAGRkJFS/gcIMYID+BRtEGg/4EGc2EGi2BBMz/BCFI4DS0EwgtcrLcCpMINSfwGAffwIAGsBBBAKJMINasCB/AYAhrfwGEQfwGJGsEEA/wcBiaf8DAJ+//gIRI4wXp/gMRFgwFS/wYjVOL7AwfwMA7/f8CgO4OBsICev8CAO4OHfwOA7sBBBQICayH8BidqBIwQCayMD/AgFNfwICqr7BQfwKPFA54rr/8DBN9B/CQP+Dr6uwg6//BAE3qEAwg7dB/wQAvwf8EAL8BCfwMAEsH8DgBLCDzX8BgAX/AoHrAQX8EgXTCBfwMCZg9v/AwF3fsIKMfwKBd5XPBAJmD7H8GgmbAgfweCRA/t/AoLkEAX8CAkQM5/wGG0v8BgkTCECjAwfwOAdsH8Bi9kB/BAIRQQEH/AwFlgfwQABPB/A4AmwhCcwgIB/AwMwwgIB/AgAkwsICQfwQAJfMCA3oQ6MICgfwQASoH8CAHEOwgLB/BABwwfwIAlzcGwID/AYIADdf8BgJkB/AgC/XwbCA8EIAsIOWfwGAwUEEAKl/BwOeEZf8DAamn/A4OeEb38Jg58EAmrB/AoYmEfvPAwOB/BICZ/AYEp/AhLOEjP8ChSoSQsIKefwIFKhJZ8wQFKhJ1/CALDA/wQDifwOCIhLBB/wIDihF3/AwOKwhC5QbBwfwOAI/wWBg/wMGEhMb/AwDnK/wIGEwRrAqUgoDAQfweFwBNp/AhNJBALAPwIDCCbCE4n8CE2kB/BwBLsyuwhO9/AoeWE9rAQQTACjf8Ch1MB/gw6xnkVBgIGCggQChQEBklCGgxEVYQoO/gULRIIQBk1BbgoQQFWEKOldIQonh4X+BAXf+CADeg/gMAgYQoV/gMJhhf4DCNwa1kUQUGBA/gYJ2X19fX2CAhYf+AyrAKYv4EKUoQ1k/8EIYicaySgwIEL/BwdkFQ38CFO0KAsIGAQMR/AgBrMfwKIKAVQfwIC/fwcJThVt/AwA7hRP8Bh8QPTf4IPF/f4RBKQK3v4GAReqJ/ggQKIP4EC5wr9iGBAhAFhCQj+BhEdhCwQiP4FCdAkI/gYL7YQLA/g8RbCy/+BRKsLO/4JAUID+AQM4AuD+EACtb/gYArZ/4JAfKD+BBIdQUGBg/g8ECC4J/gYBZlYeCiAFhCX3/BSEzbKyMooSX+GEeWiTL5ao2B5431yh/Hwm4itbYubGpTZLoTlqVhgUPQXY3n/Bx3ECDf+GhvBr/Ex06ngDpv4CGRCQh7aCQsY1mzOOvnR5OsAnyI6X8BmfMNCAWcY6P8BgAmxBhf+AjIsWU/hMArgLkQf9iCZY4hwf/+Fb4f88Ct8LEUK8QFwfH/zIVvhDv8ECGWPEdGsHA9ZezgXoREc3yDTIYefLYHZ/gI2Vty8KGhkVgcpqL/9LFvvX8Jhe/8MxadX9CX5AAX//joe//uArfA/xKF//+8Fb4RXcYPropv4nlt/8fAnVn8PgD/+mBW+FDhJ7JxCbPf9dCt8IUif9CCs2gcg/z0W81AzB/C4Kn/EMCx/6ACs2vCD2jT/AQLc5hwctqoY9+z3Gq72nXlsySh44ScxxA4JVEcGvl7j1ND6ar/Vf5LpCUUKDIM6/B4jqC51RzqXCarQAaR/noYiaBWD/XQmOH/BN4CO3q8tso5If8Esq29jTXSCUn8NBadZrp8Wjf4mKlhQtTyrD/BhotyPQh/BgT/0HtWOi4wvhIq02iBc8eP1aKL/BgN/Fj9pJL3VroESRQQS35KyQxkb2Af/jwE3y3P8nINpP+rArNoA/9dEqvYH/EBOP/BM5cBBpro/Eh/JwY//nAgrm/gcB/AFlwIL+B7i1McGD1F0YGyFPKVobajCNGvxK0IP9rAnCF/D5bm5JBZjuvTusSKaF7ghMJ04G1BpUH8PwAsB/CIU8fw+GP/7MCuev4QAfwL/UhGLD/Flqt/iYElO/4sFcdT8Pmk/+WgSg7/cQVx1/FhrH/EAg//AhiGA/9uCuOv4mBP/+X8Baf/yAjU6f4kO3WjAuNDQ0Mf8PAnrByYGBoYEuamRobmZucmn+GDRxQ0YzkyZWYwMW91ZE04N2VEOjIxWQyOTU3Y29mSkR2SWVM/xQFcbGpgYPv+rBY6GHAaCPEOj04d7qTG0gmAqehnh/wEZxkF9Ojo7BM1FlK4j70hUnvtAJ6hof+Atu4hQ2PuPOC7ePm8WhGKeueVMBtLp9oz56Aj/AlXMe0HrA/AoTN7Y+dtUjACam0ETk1AtRqdLhLv3EbSBOeFb0QZBtxPMrs6MH4y48Mnp+pOrb5T/A1XMOd8TxH8BgAkh/DFXMXD9oyVOzAYEqTnSyxgOOPKq2diQ+D9zEPkFZ9P+Dic0D+CQEX/Yhu1x8oOGfpobv9kwFGCmMFCNrAZ+EKa8bVi4jfiHfRWmp5FEs/+KAbItxO4QV62twP8YEkaAmDKTNN4OD/iIpD1/hMBN67L7GAgy7XTqFHsvX1jyRdDA2di5PcHlfejcHdlC1jT/gShe/xUSmCH+CjV7dHpf9pAwwIGsEBuKwwFUgsnow7yglRQN4WCoi1Mv0hE6PumaLrw7+Z/sIJp0AAEA//8avb/zlr7/4qCUtCjDAOh/go59BDMr/h/WXFz4uG37UqPtbXbJibCuSEGbP+NO1s+fUINssAPLoBs3S9e634yjU7kpYlRrBr6492GeFCYU/GpmfdsGPMbnKi7326tVvTGXSoohxD+APhhOP8BA6A3BA/DrFMilH59dA5SCUXZ4U7m6H3GNAeK/wGdjYlH/gIwIX62E0AbPHtAk4b8L4AxDUODK83T6vwAQgqRaaINmKcgkUJQ4XJcvu+gcqHNWWmCFERWSsqLykqwJ3X2C54vqmzzMu8O7AbR+JH9/gAN0HlDnMKX4xv4fAN9eFNIQJqVTDWS/Rcz4fk2Sfcyoxs0y7TfaV9rKzvxUL1OHNYzQSA0AJevJkmJx2Wz186G4HYAA5fS8V896M+QxxtHflPwjKIaikemg2574fO/gAPsLiTB80w7UkWc/wBB9E/E57G+wW35sjLPPc93UqhI4Z9FQ666rh2O7k4m8ZC36ZjuFlglvTtm3DEYCbsTVuTrYXk0PcOvhc77NrqjEY+GUtEiGgj/JHI8IAo28aQLa9zfWDZ5OsAz5GcsKcGRjDaM20v6RgFqWtrkaMwGn68vwsewsfbFji79V/A04JgpHCATIkQ/8BgAkhBgJhQ/5OBxh/4TYWP8kG6oXyij1xwWzfpG9yBMPycu+2JwWAulc79HxgwvFjgR4b7fGqP8KDJP+Cicqp0tRKe/gMAE/xkZxUgUWoUSYn4MBgUDU8RDZkIhIZXuQsLv5DWnbz77pfwNiFfxANqQ4CKPfJ3SQjfwvMSB2b/gpErtZUZisf4DABL+DELAR/haYkDqCKIM0bnemKK9iZbamEiNqB6LUaGU77Nidya3tXiDlb7ZgMD+EpiQOQP4DmJA5A/g6YkYP4JmJJGC/gOYmFmCgUmBAJFWW4D+Dphpb/gmAQX1//ByFz8AOP8CvugA9QbCAIL/gg9hGAIj+A2/f+BAA2Q/wgkEvwCiwUCiwhqywgHP8wLMmJH8CzI3B/AkyaBKACpMIGQAKl/AYAzW/wJMzaPzAszNS/wKAPxf8CgE1r/AbaZ2gKkwhqz/DMzaf/wPM2IkTD3OKMIAhq/8ETPEn/BMzxUfwPM8NX8mTMbCAUzCAFD/Ac2QF6P8BzZGf/As2R/EgDJ/Kk0+sfxzNPqP8ByhBDqL4i/xlLxyMF/AsvJ/EM1YLMBJ/AcwYiQIDBSCkyP8CzrDAyfxGAdSEiLSDH/AYE/IMEIAr/+DD1I1fh4A4xWAFWECI/4FogVrx/gQANNVc9X18+Pf4DAEVCD/g6dzgJ1k1CQkVD+Ip4EDS1b+EJ7wCsJGQ/iYA6YQLe/iaePAQb/gue8AxL+I57z+CgJxRYQNF/icAy/gafvA1f+BIfvBC/+A5Z0WV2EDaP4Fn7/4EAEAO7/lCfvA8j+BZ+/zgcBw/kWfvA7L+CZ+8EKP4EA0Lf+ZJ+8Eq/4Gn7wSf/Inn7wT8/jefv/gMDlX2EFCP4Dn7wXo/gefvBTb+Bp+8BTb+Hp+8FS1dhBUNhBf3+BZ+8EKP4En7wWb/kafvA7C/gSfvBa5XYQWpYQYWVlthBeD+uZ+8GcVdhBnFhBiH+B+fv/gwAo/hafv/g0AjYP4iAIv43n7wcoV2EHKP4HAsb+IJ+8HU/4Vn7wdT/gyfvB3r+E5+8HlFdhB5T+BALJgQfF/imfvBuH+Dp+8H2f4pn7wgF/gaZ8DWEGTv4FlpxIgmEHMP4En7wgm/gSa7BzD+B5+8IN/4EkVQdY/hifvAhV/gabMYf4En7wdr/hafvCH3+BpvSFYQZ3/gUB3kP4BwHf+JJ+8Ivv4cn7wjhV2EI4f4ICb1CAYQjwg/4IAX/OHJ+8JHP4En7wcw/hefvCTn+Bpo1BYQaa/gWaNTP4IATMJWv4NBM/4Rn7wl4/gabMhP4FBIvX+CJ+8JlmBg/gTAO/4sn7wnQ/gWfvB1j+CJ+8J4/4Fn7wdY/gifvCfZhAFA/ggAl/gSfvCgn+BZ+8HWP4In7wod/gafvCG3+CJ8vCjH+Bp+8Irv4Wn7wpN/gaYiVf4HBzwpg/hMHP/gmfjwp//hmfvCpv+BgE2jYQa+/gUBNrv4IClQq8/hAFh/hBn7wri/gWfvCj3+B5+8K9P4Fn7wo9/hWfvCw/+Bpk0U/4EDCf4Xn7ws2/g2fvC0b+E5+8LYFdhC2D+BQ8sLb/yB5+8G4f4bn7wuQ/hCfvC7xXYQuph4X+BAXf+Dp+8LlP+CJ+8IN/4Hn7wcw/g2fvC+T+IZ+8MCP4Pn7wwa/gUwjwwi/gQF+/g6yXDDj+BgB1E/gSfHCSj+Fp+8MYP4GAQVsif4IECv4Fn7wx//gWfvCor+B5+8Mkv4Gn7wqK/hmH7wyx/gYCWvf4JAUP4Xn7wzd/gYDC6f4JAfP4Zn7w0I/4FNgA0X/gSfvCv8BJQVH8LT97a5Kw+KkYJ/AD5s2Jwd6CprasIf1wVxSsSNmVL4vwbxoFp6fwtP3/4mQxa/8Hx9Sr/CqA98gMJ/AiA+SQgEjsTL/AYAI3QYqgAv+KkMY1uQFW/wYhsbfwn3if4oK4nQoVezxQ6BdzXFE5BBVnh58AUeuAAH7ZulEAhv5Fc+02sKyBlOTC/fTWv4Da0f4E0MD+BEp0CAwkBM0Tg7f8FHpWoC1CocphQCAAYEAgN8RuAMBQCAAgEAgAUBAIKBAAGIMRApBQEAgwOAAQUBAIQA4B/gQk1aksyMjWYTg1RrsG1sBZMQznkcyqX+E/Ij/BMSlQ/4mJ6bf4T+q/8nIKVP+Jg03D+Bf+9Jp+/QwCSUDRI6l2Nf8Kn1O6pGcptItoroN/IxG8f9Nr0/h324woQaTiBB/4EABIMDBiX+FfbhyCsepSj/8/qX24/xkMqk/MCAY38LvWII8p4IMVfCW+dt/kg7A8+UxU37nDA02BK8yfg8Qgknsb4W1osql0ReWtA4F1NVuyMfMeG2aHAjT9Q8CBJx4tFQPyexC5v+BvUz+IUqoBFBqR7GwRgMCs/W1BLsN6KgZy18VMBQr/NGcqGD+B//v+CQEb/WwvktIvP8DnIdbwjTUXqx/AYAIzfw5yHIhcPM1YuttShc6o3Zjyjdu7Yoqc7+KkTg1yQGCx5A8of3/yHf3/yHf3/yHf3/yB39/8h39/8h39/8h39/8h39/8h39/8h39/8h39/8h38QHId/iY/Qw/wUCkFqWNnROwRNwliJwlQIY7iYxUQGjBv0g6qNEg75opWGgWR/8Mm0jz/BRvX/rwYmQ/xwbPamP/+w48irxw4POBt67+fPJE0n9A89lEiEf89HjlXOLFM/AtCSre6lAM1pz8RC2RHHcvXDlHkAOnnVo7zH4Sz2excV91BAwfNoi9rEgjRad1bEIIayJ/F2+Zd/FJ/+Hh6mC322/wsf//4KPJRv4hARQSUXOfk6yn/GwiR7hQjUMfMTf8YJl8f9SBzDgF/ooOXc/gOsxWiR7BNYlBGQxa4DcgZx5cp+qqzRsLTu+7Sz9/2G2pElMe14r9isuktWcDxMArGzfprdDPBqvkDVIGFhAJ87cmDAyjplF4PjnM3yJVcSEkuOhcp0r4hLPhJIcfn76u4nKfuQMkTN/sbv9dC/+BzhH+dgyL5p7fum/h5OE/wwjxHfwmB3f4YJrOv4SBMfPDBtklkqH8AbEUjst9CJ1LdxH24+KL3KX+5/nJBxx/gRxB/8tDT3P8Cg03+VkBOD+EMnQLJjz/Swb7l/iIuv7zijNZirvzmNjPT/NQhiiBUH+fBohLA/2AXTmfwGHITDNyVqTl017jnQcR9XJfIoMG5hsf4QC3L6X+9hvL/LwngT/w/oK4lqtNX8C0zq9es2zoI5xTD3wkEPy1A9YLiVDxDIcsLGAxNqahw2Z8K4QfP5i68kCqzps4yaNZg2sXPTAM//FI3kFVUYIxVVAYAU/LLxUbt/23asyW2AYH5xm5sSqf8D4Fv8EgI3+tjbqh9I8D0a3txvwJMSnoQL7VwyJjA9Pd7JIC+Qr/BA+/mL8H8ELebFBircpf4sQqAAoE5KJsg7+GtnudyZYivf8+ECU0B/kZIFp/1oLo4Qnp99LZ8x6NWBH+vi3+r/DSa88ERrctEBPzwnfgJI4ZFR8TEN7D38ei1gIAf5WGG3f8xGM1/8G+8n8JhNf+ehiF3+/yAzPv8gM/sEgM/j4Cf/0MY5XuoERAHB/CYi3/k5D1L/wgYOV+QAcAqBQOts3k0iBPI4/BbBs/kA49vsb/BN2PnDIi3Sr/QWgxeDeExgSr2eOEafPl4F46eHBe+oY3mDc1mqD/gI2bGLsjl/wUY5X/UTwAJ38UnwAZH8DBU3+Dia8n8yQgFr/50Jrx/8gEE4oB1gwGARX+CCa8fvAID/EyWhFz5EJrx5IFAAD/JBNeOAnnYEgYDef4IGdrd3/Igzw91/OCCa4Dd/xgTXjiv+Sia8cy/gUBG/w8TXj84B1/4KJrwdvAX4BfgcAIAGzAbn/BQzwrcgN5/goZ4V4gHr/wUTXg4YBCv+ChngMpAjL/BRNeOpgS//AoAivgTR/I4asQfxFi3/+Zgq6dqndHDzRGlaCQKRXUd+0+qolQoi3XnmTWgOIOccBkprBb/wEOFJqTxRy74ipxDyDa/2CbqJM1mI1y3K/SAd63/AAL846HsOf60JqAiHWDGahvmiiJU2Y4rRD/Nx7iDBVRP/AQUDwCg9DkHFkQjJWqbt0mtv0gl/w3MCaKlY7VIMxXiWHvvGfSZ3B3MWfAg3/kAmoC/hvVgJYQAmhYNv4DABD+DARFblP8TEmlH+LkztwAH/NoTyjGdTI1NBywic0/yIgf/tPb8sVKV2+wvEKgERx8/kiY+Wig4MRT9sVeVMeEjRsWmUfltDMQJGxV3E9JUOY2Gx6Zeh/DM0ogH8D9PC3S/4QFOw4/+Xunh8Af4ADdrRb1EqDvWNFBdmlWoaIib1hp/FVkzDcIIQAJERaKuH/2IckYf3/08H/Fgm/r/ASNidW/4WGcsf8BBRl/+QDAUmq72EtHWeKCy3FctVejOWMBLpU65wQA8KyetyUk+eRneUXb9KCWGUD+peNLA6O//AQzaHw9QMGikg1RSR1yTJtg6KTuRjF0IKDmij/4CQAM6gp7SMMzs6FQJ7tARx2cJDyANgeBEqWSz+AAJMWo0sFFJxWp6GmO2nth9s1zHhhi9S1X9pvmtCHkJ3mJG3r1+8hPfJ/gAmfQr7sanAI/fXp5RQjV1IDDITTG+Km+jBDd5veoilOQxqmEEwoBIEGGWic9VJ+hmwGzYPdv+HzRQGIorJS2Zdw8X9K39+bv2BbNzJ8SEOYFztBZt+rrHPsm0U3ex8kfVxSm7btYFXOys2dWQNzfIkY684rviqrDgIHaNA8a9kubCVHYnwu8zYmke0xz5asLw65xoQni5XFNtpte9bAbAT27Z2nJ6Tazg/Bz2CaHITiiUWDXPs9CSb/gApxceZzJyKtXdJVnGa4TUJWiEq3egSncMj2OG5KGiceWwlG903/fu2dIRfXwmKPKZlJUSC6oXnAhQL+zK6F61XzmG7k3+BWVplhGsaDO9zW8btKKa2vCerGkneD17tKQmR9QCPk6z6Bq9nfXRpVCdlZoBFrkhqAqcTLwIsyt/ktBSff5jZoP8FE2b/8CU1zTUfCWR0AO2SeikbDpXbKrxmlQ/Z5riZro7TTbNVNbCpb/CiH7/goqybHS1ErH+A0Uj/GgMvffColBPp1/gwGwSbbFF/cmg3wT0J6PMjJ9cCqPBJpY5sHHESsl2JVAxvu8fnzlKxaAwrezIRGhAdQsqodflYlnP4JRr/4eAawEK1OJ+ReL/BgMy2OZG7GEMK2FGCVrktTlHxIIFLbrID2UkntmEBJcp+B6EvUs5WETSYO1FRon2aLpucrLEzUfe/xUA1wflqabNAYAmLe5oEhe1QW0Cs+3NGCjlQCPyv/8FNtP8DAI38sA5CApzR0fwMDkO644px/GeCwaftnCLCgtKAe7jDu116RBA0AjR+yod9FVv8KAVX8EM3P8PAeAGbdGQAgg1/AwK49wrU5JSeckQQnsXSL/xJlwRj1tEtpownx7wesDJPXw6tMJzhrP4aTiJ49lZ8+T4on8AClgVDHWf/GIK4/OPluCI0sa+hq6VAenU82k12MF1/gwlKnK4QCyM2f/gMAEqIMAnwP/8gZmY/wmB5f76DuFsBn/18YvLte30PJ1OmyOdSERdQJPhC8uK3mHJ/MXtGw+p8n9duWqCqi7Pt/JKjw6EASoK2kv3UINR9olthjQenZ+81nwHACW6MvIWsKrAWGzy1aqgk4VPyEhZvTO5G6KzUT8B5XbXsHqPPfeKXVm7bANNTeCxUgqZKvu6y/yaOPh+p1wGP01t3GLaz/eiaAOQDx2PCMcTsjRGhwRYMxhm1gfAOGW9a5yk/HY9ic3VIOwVUMynN4L6Tl9mkX2+37938AIcyHchkZDi9yEJlPBdpModlKnLTqmFuiUKAiznF1jK3eHwbXmq1qSY8Gm/NScswX2qRqA85mHd1jEEfXC+iJ4m+YeqmZcFHQlDGJEstHhzBEYb4mHd7S+s+IGqh44l1g5YFVw9OPuTrNWUoLg/PO8Fo2ViFFQ1EXguQS/c0Y8utiZIzfvQ06aWbUg2WPiAqgYG4CnVoQa2w6vW0iKroJSvAN8ffs+NePu0dJLFhExvYZSEchD5oOFIiv1xAXfkQadWphVjelDVOf4CFEtUlLyCwNDr9HS0jA0B7BZ+AX1PACuLaIG9bd8f7TrSjwls/fa7vzG0K97ZFB8q5LJqkAfpNTg1dMWg97iRI4ypxTT/EiPbamYY6Zdse5Yc9Nb+h8iRC3f+j0cgWSZUSINavPgAdwt/Al4/ASYKIXoIoPRsTtze0lPBdkvn1uoNsUKsjDXL/4CC2BQDXtTTxeZAVXnuyMaEJRjjCnmjtp4HL7D+KRhAM4Jm6zl2AwPumqUIfjZHTS1FULjcIZhfc2Hg2P8HGLOn8xDCCQlRkp/jwM2H7v//EeuH+kiPXB7/gXF4oavstozaWU3JO5sgXoVE6r3eOq6Ipi1aCNGeIBfg1/ToP+FAhT/BRi8w6pTqifwGACJn+MiNBW4/wEHBAnHqOMBgOpmaldl2GeQ6tna9V0spnvihfv+CguSj+JCLEJgwRQY+Q/xYFtuOJ5/gopMhvSP/FRbaMFJsRu7KAAf5KBClDhNgPndFnzN20+J8kza8bObHrgyB/I982+JaGE/wEFvr7xpfwTBdxRsSlJmSwOSZdJ7wsEBFhUl37HqCtKrUh+6QkVHBhE0VTyF0NFJkLK8IKpVTh+9pCD90EFZfIHSv+CDqYG889GDArJi/xUEJLGTcIZTHuRlyBwlD45yH/J+GNACt9+R/0ZEI0EwXhBRhTsit2zliXdj/fkqyAW+woQZmV/+LBCS38JgmX+MkmhD+BQSb/PgdR2Af7+HKuvzKwQkt/D4I//D4J//D7Mf/lgJs/Mmp3TCLN8/8ZEBvNMjuw2Rxx/nocY6+P4cyOmnuHK3/Xw4x1/gpLKCSTXHwh1AU0nQkpDZxDaOWonZJP+CKA93/vYaUwSCBlfKMFRSSgVeu62LeeoG6ZGP/jwtvLqU7/gWokjhWtkNGX2OlyNTga/1/jsYnBGTkR7EGY51PerxNA3b2UlJTNp2PnPaw8bFtHym3Is3/Fl3A/4GhHH9RK4T+AwASM/hkZOBhStgrHl4DA/hyLypv7/Iav9IByliYsfwL2uySlI+WJ3gnrI1ZwpsAUnCnMf4CEfrlyzolF8ajPCHKrdEcAX988Nvc3In230j8zQYMs1Br4h8wD/wccbi/wSDrf6+PJk/4D8AiVidoyvZkupa5ADZtABpvreDhYK/GZMZArveCvbMVTKiw8zVSca7WoEzGVFV+sL5c/wED9c+8KnfxSFABpH04iEA/8DI9szusmkUecX1Cq5Nx/UcIdqVbkG6F1iU+53Stnd4Ox85rB/CgIt/go42m/iELsA2Jx3+ji1oDA/xMlHQn+DEK+djIv8IHBekDBkFTOSkA7JwlzwCTKQjLO9V4IYfmGFHcFMBSBy4Aj3+Gh6z3/EiV2t/AlvJOzrJaolQ6hEa8e0N8RKEreaDilScmD5cKEdS0aXYyfTLw88IHfGgck5APrWFSHdF/UZe2xGf4MO2PP4Hk3P8ZGzCwLorIx6Xk7wGB/iwY4M/wQStWrKz/hAVx2Fv9eBCgz4BHx/DG5GGpesaDyf2+yZ5zJFzzvd73PLPZ/55g2D7rogf6+AhQb/Ph4hHQf3+yv/6aCFBoQ/vv3+CrO9CJjT/LAQoMSZemcP8EtWIDESv+QCIR4G5CxRL13Y3HlxY8YLvKVyzIkEjJ7DaR4w1OJDbWtQFZzlTlja+DmVlhsFrgiMB6DA1JjzNF1hrDf/2qWPfEVsFuNyz4IR8L/wEkeKlJDkcE8h2JFgORIGEy6o/jkJZ5a0xGiLjX+Sowdp0JyuZxbYRX0/+AhLM+QTtNB2iy/Jbcu+o5xdiktiW1c+jOgeV/JPSoeb/fB8aHWJzZsyuDyyPt69NNC5GguYsxho5TBOqwm2B+ACuwebiRPzBC4bqLms0PK3sNqFc/yAVJU086OoAE+35D/R+dUCpcb9QtEUfOr+N88xlzDu09OvmBd6ZqbKmxJegWlTRvgJieVYxtXj2hVXkUcpJXzyG+I25ryJ/gCP01v4AOb1W0dj/STa3NxqF0sYcdGCcrRk2j/LR07HAoEBYyZbvf5+YwO3HmLXb/ZJUI3n74PW9dWEw5NmNDQNbB80c7jv5C/OaHxGjzWp5pKeClgAvjB/aCQBfaNmqsxcHZ148d9CrodFwhsQc9yXQnwVjUMVaxM+hYX3KdykgX9f4Fy+qEWbt6oDcsiHET8ZR4xff8BIlX2nQ48OHraHaeUSBEj2TTIZnR5X1RsGBnfluoFbIRz8naTkf4CUK07+BqfKAYMe2/gMAEvv4ZI1QXFs+hNaP8eEnWKz/f6KAf6QDn63GO/gXTQpdmGBiYj1qalD9xtOwwizrUY5qEwARRBVpwz/8BIISRHT5n+AkqadE/wEVMhMdMsvDUTDQQfK5iVch9W71/A6ar/BIOt/rwrLO/xkEx2gUIBMC67fwBgAj/BciR/q4IVTAJiW/wYD46VK3S+95Lvb7PC2sCAKjZVQkx4zUHMP6hd+G9V0XfP2VLifwgenQaqdP+OgDQnvbiAOPbL1yX8VBguQPhrniQGB/CgLr/AqVt/EQaWwSuCMddjQH+OkAkDD/vZSMYN4xhLlbWs/DKfZlqlg9USK3/SN/jwNLj/yP/gXgcrhAzglEXSfcx+7rDn0xeBcYMG8Yp2AWVj9vX9YKMNuOzjs6OiEhD7Tx2DUas6b+AFUB1yVrTrGW/ioUhwA/ub1r/Hgdob6f3+sxf6SDoGhj+BHfdspmkwC87SDZeuaaogjwJ+xpbqdMLLM3/AQUdZcz0c5d3sWQjFVx2R0PiSBc5Zq22wjngmelX4GX8Dm+/8xApD/go9gQMpq1EhIP/fqNNyFxArAAY7NHbjL/BAa1+zH8BoKUX/4I+CflO6ULUtT/wEHZX0/R/gI26UwsT/AQo4XAfHt5C9k9MasgF6rpmn7zrT2/Nf/so+Cf+a7/bx8BP9sv/8A+CfxMpF4ALMtxv/ARXMsC5fa3tGgl7kvo8BJbH3jgkeVr7voXyYjziJCQMA9h/NkFvfkzIdFBAQydsrTymF00cpofKM9n+fi3X176TgkUW1VvDoy72xogPivh1UXjb2x2siyDo03uQk6FAzN+iQoyW4VmCsWGfAL2ScHCS1SEWJbJNKsAUraoWgxn+PwH86Gmfg1NDtQBek4sIqOfn1U2y5HU926ozEkhY1zYn0t1F1vseZf4Aac0EPKCdhXONO9grd7OqeRb9u5F9lbyQSQu9Cf4/q+96jXz6JID92YA/8UvR4y1qPwQIq8pcxtuSytY5b2SNVC5E55WcKjkQk6uEWTw4PcaCX0Aa5kD4NZuY265xRzY6v8kHwTz3HHznrG/40YdC+/zIQEsS7J/tTfyBp0EGgn8B7ClL/OCLh4sbMGfE+oATlzKGCYU1Nn3OXRQDflJoXAluR3ooCuodkf4GEpP9iHwT/zXf7ePgnwoo/6AO4e/tO8hFEvyjGyr+uOM3SpOqmMtkB8ortkrWyTG4WQ6N1jGBtoFD6H098+YuegL3e8bO8ibzzvMMuuwaZDdxhrKbfx+HyE3Yy0WPl7T15mN/AD58/4CW28PXbV7ITsC2DhJlH46ulBSvm6a377pLntLpxVVn/4CCmd24YeS81hA03dSh8QjlpHT1WWL/JSCZr/Af01bMi3cYN/I2YIAtZGaiBaAjf5bguRuc8dmpxrd4BOuzgg4ZccgU9n9cKHs8yZw47UfFT/gCE6Nwj+CSorL/gMqP0/4cPrOGSwIDA/hQj6/gVXb/iBARQO0xsZOpcD+BqKG8n4FLSILpgtbJ7Ytt46XWx/qYxCLX9m5f8hNY46BPcnO3VkQKdjGY479OAlQJOIYRZ8STVa/ioCxvaT+AILInX/4GoocDoBJYIyLCRFf8BHFm7Bus34dk64wQnKdrLIrF3JaqexL6U4qzYr2pih1Wkvzgs+/+Aidbuld50M00v+B6Fv+CQGz/XhOOp/jYrYf2AaBAQbba/+AwAS4gwET9f8nGt9SAjJjKSjuv9z5/8Pr6NJ4J2YZ+/46PUEXQu6GMEVsoQMxQlR42EFXi5W99XIuc6ITPGVpCSiiGzaWtiP8EH9efziAfbhwWcXwk1gj+Hphy96d1TIICeCi9p3IpXqVDLTgS7HFeq7Wco8jmsnL7XKokdviMo82KfxSDfhdmOpnxHwGB/CgMz4fwOW1/w8EAoF3VH5q4WV/i5PuRCtP+BQVsHhAYpJIb+AwASPgwyaSn/Fhq0+FUn/hDVp//HyW3wQfxzf9kuFNARvQ/48EjLMa/lySGdL1S/30VLbAtB/74qW2/iIFj/5oatPgf+hB0e6AywCxwFA/z4atP9H8B89+70AG/Tme7Qk7xuE1W0wjiOz5YKeGQcU7pCq7S4EG69hadu0A0svb63Wu53ZpboLNlyw/xd0gy9aSk1a4QcX8pjFbSJMOfwLz39/Q6Ei58AB0Ol6pAwGAb3/DipbTmBgMA6gMA3gQA7ACA/xQPWGT8Q89keCgBB/MAGT/jwfJo/gYDJBQBABAG2AID/EwbLL/wIBUQH8R9SvQ6qFgBgTECgGABgUH/EPUoBSAKAcAEgUH/DHUpdAoCAfxF1Ka/8CgI38CdRf8CdSgCQfwGfTTEAc38C9SjYAvYC9/wH1KWADbfwJsDARsBIoEi/wH1FKyBL/8CdUwEvgTF/AvUXDgUh/AvVNYgV1/AvVNnAWv8wL1TfAGCfwIKzgYIBhv8e+/v//ipnb//xUzt/6YqZ2Q98W4Cs0vzGHu2XC3Yn5L3B6UcZQmBIMUj4jRJ3GJyTHhF5Yw4QhuiWDuoMB89LCmhM/k5Ndjlm3aL0JxlUl/0Rv+NPHP/BhdQMCk0IBKkxE/wGACSUGGWNjKctRKCOPx4qBCR7Cdg/O1Owrp6FJwFyCsmvQQjn/HZGSKs889/wO0hIB/j4cpTHBfo/SL/AQOfvpYQzR0xfUC0Pi3oJg5ovVs6REfX/gI+dqnAng1g/ebHgBLecRGxKDCYhdvEn3gOOAvOe7V0KPo8+W3WN4P8+Fk16IkFjVC/58Jd4f4e0I8n/xwGryZMvJ7R4at/QgwW6kI51uaSh2saztDTR3YI7D5KmezOfZU5TX/9F2hv+LgY4z+FFh8e/adBNTLC/zxEiRH8gCNIOx/5qH3gP9HBY0n8Wa5/+gBxzf+TjI3+HMPj/0Iczc3Z+F7xKFGn+Mhxzf+EQIn+BglnBBv8oDjkv/CRtJ/iYICN/xMJx7/wndQAwIBJIKsf5EHHTv4dfrTE//YfJsa2Mhd7gg3RF/gJFBv0P0f47SdlAcIcf48i+FozFX+BfX7JT0D4QDJ6RFv8wRwVYeObp/v60GlD5KVYThno89SkgM5DqP7V27xZlBq5+bLFgQ4a8j+AD6woFQC+HT/BRcHLBPSJ/AYAImQTEjf4qQyeo+lonLmA/4RztL+EwFXTQXFtHpsuckLQ4BgeHDE8AY+9FOfkIZIA4oyIXuWfo3/BQXGrm1UMZl1zhZ4LlqXU6srzaLSDb9GFzOAgeejvik6GQWS/YJSt2BWg21jJkt8qsyU1Ooil0bT/FEmw7LTBUobAYEn/gIsy61xIHJQB34JFxejyPaESA39w/wYoU/wH8xqfn8C2Y/+MDsfvD/gP/suP/jBzugsxVIxHcRNk1m/lqh1FFcy9BJ3Wn+DAoqJjqP8MHON35/k4KColYf/gF1f9Cd8m+nEYUHTqKuKb5jPL/4XdsvtHQJj0nz/BwU05onZX+DCg8Ob2sCuDnRQwB/4CL6pYhvOPMHQyfZrdF+Y87f5z+FAjb/BBippA/xGrMw52QLGYEfwMwBeq6udCJJt/M0ecLBlYFQAYT9Ci4WHMd5XpV6Jz29340884KBsoVkmhTrKFXwXuy1J5QDbsF38YrJf8CfqGzHxeQKKnExv+AgnsDLpp9+t5kAdbSHAiICr4ubP1JIngmG8Qx48lsTIqpDH1M/St+thr0sns1lv8HIJK38xBh3+AoKNc/xMElJ/4OFlEoj/Ix3ZgCc40k5SB/mY7sw/x8Ehh38PgD/+WDJlz+BgKb+ONdUE5ThwNXoE/10LKJfw/rKoqD/7SO7MP4DZetyB9tEzqPb7BxkMelBajjd2ZuPjsCZvaxxgXs8J98+yx0S/ALPeehWYfnauyabO3b1A4M8z8HB5RcCCjVB/AYAJUfw1o3qxr+LoVAYH8KBhHf/ghCLEv4gARYaXyylFBWyAwP8VF8Sv+CghM1WjP8IC9fZH86+BKVAMXhqinL7rMl8He2o01+9jE7nMlt8vSkdXEfpiGb9cByn/Yw34D//4EpU/4kEJmz6SbfwVqtgW5Mff8PN+oCAi1OPbIDWcjDwinfMXrAWCOcPBq4CvpNFPXHDCEZN2BVbOoLAAYbdodJwDi25YzKXuAfW4PRZTX0RZGkORPLsVA+xPZsE+wWyc7DGsc32uQCS/eBfDSwmiNZOWEJKIieCommYTr+fd7m+3JrXvsSqFjbcj6P9eWu0Gdm3EA7wTOOpvYE6pvPuX5SccavAF6kIJCwLJW6gLmMID4dLgFnxpjslVnoeCaU4E3z3WgvCcnO8i97HjctWYpQhFtBPo+NL/D4G+X90v59QbSaj8KJ8jjPDVF09x7hyw2/4CM7F4enzBUMcAcwi5/k1TBxRavDp6M83bKgWpuxUfSBDh+qVb0RQpbAAJrAvgx3krVaij/fmp/WkJ3+MI+KULGDLNHbNup0C5XBx8w70dSfYDmxvdRRSIJCr62xAjdQqagujJmB/chDLobP/U1fRxEUKb8H1l71QVhDhfmjX1PPCCET38hwe+us286F0ntKAILh5LiTRxtww196UYGrf3WLDl3OhtV/R9C3cEbv8dFVe1F/+wZulqi3mGmFJKBbKlEkhjj6AfXS5rqF/z0M3Slw//jonNSsv/2FW5WpGrEfjhIVmu6gzT2k6aHYe/cUGH8bpX3y42q6iTdgAfwL4vcUCIEtX8S5ebV6ssKUCHcJ601TfHfg72aSGPK4B0cDNNYg9L0UEo26EFU+upaTY79Kp2Q/kn8YnA/8OYHmL/e3izOopP912QcP+Aj4JzbiTAqbUus5wwUzv48ziGXUwf4FPjTmFZd3ro5xiNWN+gDqqwyCNJRi+C4m7D/SCCRYXH0Ki3ji4bcvMw8vSHUJOpCAe/ZAXiX8dQC+QR3/wUUBAT+EBOakHHF//FB3vrDGoy2FrfeXIIi/5eFJqv8uI3tfPAgDN/vw737YP9ZIfP5o5gU3Nvn8O2doFuf50DGLaZoZHUQP8+He/f8P0tBR/l4Pys/x8ErH/w+nzf6aO9+/5AwYP/QB3v3/C4MHVgAA+gAANICD/CZ3xocYbYAVB/D4Bc/4IU/uf8TExTn8Jnpf+KA/uog1r2/eP4TVe/8qB/gx/jY74bJcCoxOfykLLLExVxuck/mfKi5gZEqv8HITPD/MShL/hI6n7+eoZgqQ9GSNrfVCW3iPi55Tn0qJ/3WDAUWQ6GXWgT7zC3BNE+xE5bMO89utqE3JhYM+YStd/wcGZWkYGPln/AYAJd/40F9dv0jIW7GgP4VA8P8FHr2f8QICKBa5z/gIzHqrRz+BquvwbSb3i6GwOGNN77B/SOdnMEcIcx5E+4YIa3veXuDWIENZl+dzmIHlk0DsgdXqegwWO711Fz/wYMzW/xEA1kxUJi1Et/wNWr+hFxHKo7zV7TcD2Mw+j2Aie/qOcdLMEbL2Rfvm/uUbksrMd5fbKNjDGAj+YNquzAybMqdrHv8D1df8Egj3+vATTb+BPGH3NWQZmDQqVnyJ4BlMxUGltrwxU+w5zzkvuLTGhmwagicu/SAfjhdz4j+yAvbz7cJ1IygbWe/ikHIFWlyTwXZgMD+FADw/gWpC/iEBFBIwJY5zSff4GsSPvkyqsrSIvGAOdx0rFYZdkWK3UFOOfXFtp+0CBMiUaFM3Cur4K09qmmckigQRmHXDYpBnREH/BySHyA/wCo5f4SBCLd8z/xUdrM/4DlHGn/fh2sz/AwEN/zw7WZt3Q9MHhDMkJBQv4D12zNEEFBNjk5QzkyMjg0MTE5ODc1QzM2MjkwNjc0NDQxJmMzTQyMTYx/zAdrNGxuam5z/hg7Vr/y8drMm/+LkSYP+FKGtAln+BIa0nhAJXy+7+AwAT3gwMPLv8VAmvxegI8fAaAAXIESVwXI8v8eospRBfw4OATDLi0FVQPrP5KVYnixSFLIW61OwSDrqyT17LR9ZcylmGCY7/1vBIAH8DAo38H0EgAf0/BIfwL5cfwBZCukGZRVM0FG3vHXECPw9QDhAcr+iHNSJ4PuuFKcP+S8QT+qbVY2+X6uRjVvpKyMD4HitPigfwN5cRpuhDv8BgAhL/DQ6DOs0enu0BgCH7zPmjOrIvum/wZWGaeYNAD7iY/4ORBVP4JARv9bHtOzw5fwIZNn0Cpe+5G7+lzkidiUUxabEjIRSlWRnyXsHYpyp2gQwgZl4/wyBVPP8FAeP+vin6YDae36CZ8DhK0QtwvtfOCbz3kqfLNr8v5UoXzi3q2fFV0cXIyn/4UBVf4FxEf4ggZsuUsMRvrGAwMircZoyK2WK5JrHiUw19IT4YqXq/wSGulYfdf4ISFhIUBAMGPeoMBhqD+J3G2X0t32nUwPFJPi/Bm3cntEK0Gb3f+Fls3ZXYR/SA6UUAD+HAIi7/jYAiPi02iSMv+c/9Lgn+o4l5PlPVh0m/hZdW4VrAkho3Pxg/gVz2+gwP5HvGR085eS29XdLBgQt6x3iJ1GooZhdrI2bZqAPjLT4oxm82f8BEk2czgyPmHW9lkOwM+WVb8g0KQSmV/4IRcQv8EJIHH+Jki17/ByNXyDMn+Dki2T+OVwM/x8SR8g3+vCseEFH/8NJsQoKGgDalF4wUXt8Sq7mRMEoZPH3znS5+k4QdndmwUObOO42do8fI65iku9y3zxQJegJ0dh2MX1F5c8P3jk8V3kEl6QC4ABDH2oaVsSZ/RxNo8omPYSsJcHSUAWdid/GYBLr0le1t7KacfuWUOtteCdJhzqToxawJs6a0/HsCLk0yT6JI2V4SAAHI9dWBe5A3iIdjF6l0dwG3D6vd7Q5iEpTIU9pQ/S4ab+0VRZVpCj2pNwr7QvHaALJKIxMQnGHiWM/tPm5l7lr7eAght7aJBHmB9beVTrwp5IoD4TSF5xgonwXAMbJs4kDii/CVMPE4l8URFuRFzG7U7Fi3kU52kBdGnCPzyXM05JxOxoACrzBzFi1Tvh7bd0hL/thV9yHcRNXce/DoSMF5DKWIhDDjA9Sh5bAPXmIb0YT8wwzGhaibt8vcAhl7pIKNbCN+7JgCDbYBLYxb4Ct+8cPgOjYGCryyUh2BODqhj6qDDReJeTzSgqEDXy7yzF39cgPB0tA1LxkZh0ERnYh9Ym0fSXIZPpMAAOW8VwwIJLYteGpAm63lTh+uAakzFffLIp+x0+M/XUZpwtzVJ3fGhD38tMoGGAbAvijzjq3CTnHU1Fgh8yTlSz9EAhMqXLUuH4fH1ZzyY4T5j5X0n0DqnLsNJ7pnLby+PAweSQ+hdUhzGkK//TirhmaSJJacp9F6stmKJoU+MIQb5EPHgAFKnWrkn+AjtQ6bf+CxFnpZXMgoJ+ayVAxlCANItZaUIDHp05RNpH5rn2L+nGSbnCJAY6LxT97WNBiIDIg+lAp9SojmwADSirGwS/ya2bduBlNx8SJkx8cxns6xPCXnm6sN0l048+n7cY/8BIdv9zsPj2gmzT98QTWoVJlQhc8z1fH8GkUiNdZBIAh3yUXQr/gIGrjD+aVpGv/EYNtNGGRRahp6EqQNUA18Pmnprpc2FGZyiK0okRc/khRVZoff0+dR5+91wUW7NiinbX195AA8g0if9gs1zwfj8vOUq6s0O4A25urqDHeS4cTrOji7Xck2PQnNg+5UE9nuX5zBPxbJjUzlq5xkqcKztEIwg5xoR4AEFVkbnU4MQsRSWrlDpd6SeU7gD/ASIKw21PaMaQ66Cpw7tH03kSfKgapDLnAJpTmA7tK/COheYs5YgN2eZ1VHbbfODn/AGNeBf+UkdFIMEAEC/wkjopCI7VzgAAE4gosR0FTW74w0MeuP1u0y51dD5bOUAgCqAJPKDKbqv38AMVbLQPfF5ARWRiXv4hkASaGCZQv3Mx5V/4MLyO17+Bv9MT/wcnrGP8DgSbT2/X8C43oK27v+DjAPAN75B3HLTGp7tI3S4W0905cydeh1uX24Qq8VHYDQSPrtdGWFC6+7nfspu4gOZi1BM5y6Upq8fy4+SY5Ac75Ug8B+V4i5Z3TAm5YzxDT5CgVnxbMIhTWzahJVoMbtywSwNoAJ7W2Tx2/Z/Mqxews3C1zPNbaDi14XXW5csOSzKI7NO/gFkvRAVv5vTVhrgSjXysTEsYtI8paL8tKJ0OF/0Z4yGuCIvn7zQPX8dDjiPusUgdE/gGfYSrb8lUdQaq2gvC0UetTXCC2GT8/P3Mcp0NnJTBueKf3JtQUksqPvaN8e87rRXw/3dUhJFP55ATrxZuAYGYiRsnsO5K0hrQlbPXXrioKArynf4gp9/4kBHn+NjVdp7ev7/gjwY+hBK/dKLiU0VvSIsPcXcSdR/a8m+K9Z3mSmcl0FbOeTgwdopsqUxna9Me9fwDTh6utNHyAVP4LJpDT+AyfD/XAqlEDDePgf44MwSlaEAmU09/4DAAT/ASpKWEczT/nYzBK//4JHBf7AMwSndEA1Dhn719sQMJFLrBeM72tuGCiwtlh4IFNgQMyNFV5P+HgV/s/n0BX7PklaOTdLn9HpJqYFyflPbIx0nHY3f9lTq5ShyRVFFxTb4/38ZglE5oien2lwYBMmKE2AbYWr5PlbGuCXzSUgxyK9VmNkjpiWvOQVqIfZl3gaAOUh/7p3mpcCK5gt6Be7Xz7tAyFR/AGx2RowRKOyQqD0XguYO7OMwNFfltPYAdIfgeYbRpCvDrysFub8N0ssCHdB3iuGBkI9LuE4Vu8Ko4Ozz56QN5VNY7GIktX62oIHB1nFhQwupiCNS1hbEQS7yEM8xzIsw0Tci6PnakyZyUCpMHcmtxXW0BoC9WnVGccJBUpctfiDKFQi4TWdUrF03vecz67i3Yf4CF1qndIjzNf97NgW9iJPCVVSf4mhWLNe0Kf94X9uTXfSJ8tO+xXJcYwAgpKax8toPfuSq5wuot12jHMN/Y+6VsactN/KB3IQmjSjgcaVl6PxXRgmyRAp1Qn/ASWEQqVfuAWAv7koNS8KJro0yf4CRj5x1rtEIpUlhKvwcWCV4nG/P8l6PB70qd2E9vKrvJkxpPhdiYMTC2H5a0h5CkkDUpAXOZS/An1mV5sT+wniZk9duTABFHorlU8EpCSJh62mSRUmNAsS+i/Yah/M8d7ljVicJgVHiRzSS98rBKuK6WFVycN7ySIzZyv9QrYr9EOh+PYtJReHi0rG52zSHG8eRNNbNcOnDqRntuTEPgHWdxy1wE3f07rynFFQolx0hZoegWhGCjYicTZS8yOLa1K+OeHP6nbJ86lEkbi5QAtU+QWeAHPqlS5eVffDNRLIpjvUALh0QhKUURHJH4sRwGIIXjDgUasOIT0IZbAFxIEaEm1+sHzyUnZ0wb+Dc8xBBUIwYscCs0ce0osC3ywV78oW3JIMXFWbE7ZAYxcqx4g34z2+39jOKSGrnbwC6o3HjTkEIaKM1d+s2KVHRPRnPg9cMAleyyoWOxTZEYZRnCco/yzs0YBXO7ohBePnjPJYRhQ8zYnCLYATrX1qeVnNzZag/eSZV1+A5MyuG1mSt2jhrNi7ozH8rg87sY/nl7Q9UWDQtf1/mnJWKzOuGjdkW7ATH0oPIOnQBg0KfLeDWwFBpG0eENvMcd8ili/n2UzRiy4xrUrAd7YmCyUFWcZARyHxHbbJt+RfrMjIcc3OrQHLR10HF+hQzwKIJJMpwxrm9+06wMIiZu0An/ARdclMPNd/AHJyxDA/DWieVuTzfJw+p/aA1jXLg+5cbDrejlgOX1N/gIk4TvgQfxcax77TokR4a5GodR+2Uma747CLiCVB34aRWXpODBglRPtSioDVziIiJ+F1uYvQ8UUPZC6eaMpQZEkr/tynIoLCBUqyY8C1AiELBfkisY4+21oGH5RwC83lP7MewXQX/ASJquEJYshm979RSRUqPCFXPVZZpSoBAduBXswaZBxFecnCuHB7PSOughRK34+SQdrpkkew6Qu9Q8qlqKgkPUyRGRFcDmxLFqjxydYQ55B4xOEADU1RqyApzbdLkwM4lwfgsF6EK3wb85w0zoGazdtKPQ6SFojENMMRPkAdZDc6osWYYYCAU7EQqZ0ZAKe8hi/JaAULrxiOgTtKUZQrGCtKOrB3cIaK4AicBrDFCOG6P8fwCGke0PojU+fKSb4WeG4lb7QLKzj2PCCouH4fBYglYIJ969ldGa4Ckt8QnTCuovvMnfOtqCmMoh12qH4uxYOSezcprFIoXxBIZml0FL1Y4WcUyGr+lM2qjxA5PVDQ99gIoPEZa9YJmKT2rPQ7yywyr9OnQ0WjYjjr8mzYpTame+ZES12CL024+SNo3QMJ4j7x1aFiOOF9/taZYWkiQc4f1Rbn4sKEg4zvnPqRPM49yHFD4gVTXgCffan3eWFj43b+UCWOO9GBeHVDxEkF1aPMCBetIP4jjpBfvsHkxKJMHV+i5TBY5BnLNd/wpybV5d/gIR5m2XNMItAYBlm08O2/HoaHJW5olTgJc078Nixt8xPgcP15WymPsD+BUk4UfTscdtL4dl3C+t8thzIT9pe1H9rCt8Eb/cRQBQ+kdaXmdWnXdKGa2G0QEpBNZTMYBuqVeA+HGAhHS0tYD/GAVyVNFFr5pYBS/wjghfwmAJsEkGEXwEAt7mw0hasQUV9gKym5JeaR1Zp0UqOb9vo//4SFc9voYUs+JgtQMOKJZUMj4ztunwETUa24UuEUZj6HZz5JNE1y/hkoBef4KKAf9eA3kP+OEPctT/wgMJfVJ/L1RO+gJdZ86oj8q4yTjLzxLPEpVVsyM89O9r3swNgClwzHRRMX//iPFH/7Ru1M/5juI/4UI5peQZ/yAIg7hTZi9B98I7783ygpumt330rVg2c+5xB25YP1duOEZ+TgLszFV4lgNV37NXwtNbH0fAyCs4CCr1TAMKXoPjMCZvYaNuLrXyfeR+IlOBWz0XofXuO78uucTtqq3DfZC1ls9SN5seStbgP0XGBjKwbEv+vEugtjkd781uOBA+oewWtaA6qV+FdSHQbo3CbYLkknK4TgFqzbgB0McN1mlCb1vpgzpn0tF3Rl/0XtRSLXv+MZwtzBdERluXNC0bo953M1wTv8P1g6I7L3hFl+TWs12EltXlTiroIVzfp5SP4BxveSCZJVoE49odgL6v49UoLo5wNWSA/XTu9+GH8m4HkFng8nmHAAtwDwyAcXl5VxvF/IZ3MWBVAcAL9ragjLvYKhdzKFAbx9jIFzHQneV9FdwoDHjadXqt/gJMwbBVm6sPz4DlvuS21xFTrFSs25vIbGtN7hLro2fLk953oNl8WBdsIjc9Uy2EMj1kGUr/AR9LJ74b9h95XlC7+ttaCE1957ATgfVbYC2gAqFjERn8O7u2N/B03OKuVhC5p8+dFD+pVyMB6BUQjAYp//ac3PZuqAsM/bD/FIK8pAzgVG3UCdXsM/Hju4Mj2T/ArL0OAWUzVXNKwYY7AqGQqGovhxLI46YyQj7XrSGZwhE0P+AiXVv+FEID+CGM/+Bhn3+GIPsocRGQ2n/BgJin9ETI4Vc+yZepyrpv6/4CBKLHXKwBvWzTgCBZxzHsjPRW1hn37H31vltQvsAm1haOkjwJDe8H7CYg/gfPb/mVDsLBz+BWpEl4/FVuTKlfasU2w0cVXNII5oFXTAgVI9v6hrCItqVD6v/xwXkAhEJpCV2GfwGACCsFoOT/k4wIs2VhAVThOQ4JP0E8io3zvgOy5/x0Dw1gJWpSnvJQeYRkfeclTlg6KiEExVBAM6GJ/ORPnf0LkS5fUFKb/Ara+U3Z7SIsuzRGnNKQE7a0torj6UGwTHYJCf5rgOFnoN8K4fwyC8PH80kMf8COBRbGDyRdymdoRGFRYw+j7BBUF2ZkQbuhRcPmrFkafHqm7P/CgRH4fwcSG/wxHFNtun+EB1X8DBR5rpH3yR1uSRUwjLqbREE/pYa8MWXRAVFajpKyM4D4fyQsermRlQ95TPt8iK4g6/onfxKyS0HzCIUf/L7jt/AbpCc3fBa/W4NC/ffUpDJJlutUZERTIMP7Gkegetnef2aWUHfwoBVeH8HA4H+MjbF5PDB/ivP4Eg6Y/n6hL515Ut6z3PfGE2mIHIZiEpUhoDNAJCnsQMKxlbTz/xQkBBakGClZDCAX+Ag3g0A/xkk4gjRTR36Xa/OSEnEGf/j4iI/w4n0v/4yDxAjqi/x4jJ5fSEuSqa7ygj7yS3J2srd+1PJnphFHv46yO//tFJhkeKx1jAz4U+0jufpDPP9G2jepoduedtMD/g4y6Fmg28Z/4DABG/+WZ3LF2WS4P4Fui/pGpfd6nt9cjQsCp9L6EuCd8JUUYHBt4tJZsGC6+9l29m8/hQCq/wUZQofxELm27WnhaMg/46QQ2nS/gbsKGm6EaP8qIJtK5eiupFZCga+rubb/6fk8/JB6Tf4sKJ+387M24eBmWEfP4GeE0wGdHlIELDy44M5+UdnQvrmqw91iuaPmltZ4txa6G3mgALjeTb6DRsWNRs7zXCzGHQj1PwYP/BxlH9/BIM9/La6FUSn8CgzygWmH8AF29xJuFW8EOzmgCwzxPwiqXmimPc99t39XTizXJEkv8jA8clB4SoDvwS/7sWPagT+zwH+ejxyUI/gURd/5gHETDxTj7Gq/46FrT0xLR/8oBwmn0qLt1LL5mPuvzVWGOyA79sq10kSVUI6cIuBbNhtk1lp4bfwyFsP38xiO/8CjQ9lS1PysfJKFj8Z5AvUlczknOz+Kf+EAhJYY/wgL72Jj/L6AyBCQlxe0yXEMOvrzY84OFUUZE0ydYI2ds6N0nnJdGJbOy/7GOGwAH+/+kH+iEBl9K/8LB8kQ2BeH8Pz2gDXBRgNoiu9snvO+5V3NVl9ORu/vTRPkNbsN8Gf4CBnTU5x1G/zDaeB7dtvrpBle7Jv49Fz3sS3r37MHdRCF7vIsp7XZDgwjLG6jolEG/hp2u2pJbVDGgr+XDQnHT05HYJg5ulrLYNYRbwo+XljS2zNZoL37ykUpC8u1EACtdaMB+eftflZdSeGddx9v74rXbXlykRPnjVBDDrqH9ONSBRKaQ0k0tCZbr1jhAonP2xAWmnu4c0a92kt5rML8P6rT8Q3izCeT/IBw9BT6IKdYRCUUb+0PfZqdWn3dmvAIasFyrBhjVU98G0ED531jeXGbkUGUlBqezCN7F7s9Xw/cGGCksly3ResTYJjItmotrnEujd2pfnLxtJqz3qlL+tzJ0pbweHP3bBHUpxXcMNBJJ/WqizXk6wXw1C3nzA5owrjzwBrndW21B4hAeX4nujEltePR/4CLBrseGJLLBLPQHADmlBbfzxq6/gFn+bfbV1vQwaiSLr2mH9uKp1ALe0GE0BX9b/a2WBhQL3wQ8GebDscrwJKmEaNuhIPmuSZDN7ejgw+YKtIHf4I3AS3//yFjT/o5CxpHr2H/gWmUV6HmGLGbbwCr0zFEOIkLY0ZaNkqCNvWRL9yNYIz4ZvH3P4UJUf/BRybJ/BJXr/DF9xjaRLTAkd/AyAyvsngPZ8+I0xdDoaz+aaR5Io0gV4JjUZt/m0Cl0GUf4CNRv9DVXsnnG1BJ4tIZLXlCTP8dJvfqwL+B8t6bVmJb/gMAEY/9jIesE9Mbvmu6V1tzN/gJDsi+q1P9y9kOmQtSC4iRFnUUVNlQFxur/C/hQCq/gXKY/iEEWAxBffwviIIDA/xMOLKdkBP4IHF9f4D6K2W9C0mXmv4BlPk0dtvjsjOIjWFlWCocAfswVjSY/wYVSrXCobUXoDtJVh/gZnUNmbvTKRBX+DhhkpAl0etv4DABL7/FyvevDgRerEtyK938DI27T73+XpLuwKsI+eudYJU09NbGpu/AzM1g5d0T1qI/HBHZ/jg8gd1f/SQyzH/gIQ3F/zsMphwVIt/76PId/Wf4ATmW60CAaed55+h8OXhSGzIk5cRbMSyHdEWwY1v30w/34eQ7/wOdvbT2+/1/DwK+uPb2eeaMB2u7Twibv9sy4AdpfD9W4JxgaU20cPIym1HHfx/HPzFEXeOXFfjnKwAR4dkOo7RTb8UPv7yJW5IEMMeP8F2KkfkdF5RqlJxA5WoOvDcI1cDcyRyG+m1kIK4tfQzYzrMfw/h/+7uXl5r303kxSeNzIUCP2q6wBAxDUHaCdb8rrQ6LjQJ/6uPId/4iC1wP/QRyQ7/waDftPb8mAAAgAH/qw8h3A/9WHkO4H/qw8h3/jMAVwH+IAi/+NQK8CKLWOMG9A3x6rSJaGp6OzHRNl/eVQRipvG2JMxjZvNX7/xweGhfTbtFvnVToSuN+7eVVV0ZgHB0rWQA6CO3PXGwRX9giUTGD+hwGQSEHv/4CFInebQMFPGha8Df1y/wgWh6/zEOm/4KMK6/4Eo/3Fw6MipxhAoPKd6ZwP0dEBNHM7POSb5PJpVlOKYxxF9CH+FAKr/BQgQqzasxOv8BgVz7/DWXUZTs0ZysBgSnqKu6fxBsqhROuRN3KHiTQFbt0f4OVr126EmnBUl/4DABKCDAmFC/ycCdaSvvsEkzL1sCxvKcnLCSCf+AkOQoJEU7/JASfT9OgFlzqkRlCF5/+AglMv/ASNY8q5bMhR0qX4fvdNeC4lbMScJe2J5/jgZQDCYTT0uNO/gMAEV4JaPJTAZuhYgCG8NwsjjjMlo3FbAaPUc6HHBIHwPvK9qAXz8z/AQXU4uvTjscUO3V7v85zmqkvZmZ3PSbC52jvjJ8e/fwLm+vGzSKVTMG/rhHAnWAYqBwcFRPeYbCLAfT3KcABEpVbsUEqIMCsWkZEdqWwNVgTXEdg4lKTb/+DDGLX+IAYOGPZHWhSFkP4GaAXza0vFU0cCH+kX3JoKS4GnT31uwdzU+vUIpNN13cTJBZ3/HCFc0Wf/sCHr/4vaGP4g52rI+Hf4F1eIAyUmX3TNyOeibnSrZcg3jx1psiu02pr1w3ydtCjYT6gQoDSxOFuvJCvYkFmtFI20ceE2wMb/ByC+p/BIP3/L7CAgjT2/UITLHuMU1eUeP8AOnG3G6dmnaskjtx++EXYg/kHSBfgGDzEGf8KAVX+CkFNb+IRCkDgZl2+P3ZgP8eAfKZh/wHuvnf8CAAkGBjEX/iwLuTE9/3vOD/pQfKZ9fMMvc38DAY38LRxTwS+Z6WSVjA/xAPZf4yQGNsT+BAUc/+MgMbf44FePf//BC+H+kAhfBFkjj+Bek6LtmckMlnTgcQxHt9ADffW7E8nWSOkfTJxXVg/A9md+HS/JQzxbbB54lzDcS5aH7g3+dkb7k/wcawC/zEJq/wMgm/wHj/x8VO1PVlgRKVbBjfJjNWtYsHGJ4K8ad89F4/u7/AEL2Jj4oQH8KAVX+CjKev+IiP8SG5B+rAf/GBs09lf4QPddJJAq29DEycbEVGJPn8H9ZE1yIP+YZN32WwMPSA/gl0Jketg6ji/C0/3CPx0JZoWTTo8S0+tceS5ilyNLr1IqK0R7/HCNMHd/3sF0ZcU9KPgefRuBlSdyNr4g/Ze+TP8e9AC26cx/ArvdNzhDEA0bS2l4W0PReuQAOR4SQw92gHx0nHTEq9cQAta55ISujsaNumiqnx3UIL/gRlL8R7f/NQzff4KBETP8TB9xH8DeZYIsf6+OZ9/8LB97n8CauM/hdFpmFgKzP+Alhr5SOtNmE26rX2eWss10k4LrexCrpjisRL+FATP+JyEcCipkGRItygMB23WVSncbAc8HgrypezB4Q0vb99nugwFTwISbUuD/BBGwE8hu70GMBmXGxZyr1BmMr629Qf4CJyHoZaba7lVED+Cg3KjeFg/5xcEMd7vE+2bksxTdW/fQ/K44Ug7YPfYqmBJ9Pp/hMBG+HBchIBg26AgqPr/xUkfEUK5DPUP/gIr6F/hCzK/hMCTWGhJ9FDwF+5Nmb1zZtWH4Yl03eHFPHRldC7ouAd6Maf/gmEooaXlWp3ak9bA1NuKo0Gl4zOL5ph1X0XSCQQIFz+AJu7TJsX4gUmSK9D8lw55aZH2BwItxcUi49+/go53/l8V3GG5nsT+BoSiquHlVgXmPju8Nwm3fb+yy1rfg+JhnNzTMkIIeC7QOY9o/hQCq/ic53Ao/e8dpZlIDA+KFSSiDU2iad/8BHwFfO7+5QWChbC2v+EDl/QA/wgLX7r/+vDl/SAzX/ZA8IoA//+OX9P//HL+n+sh4RTjBxM65n8gySdA4PRwhHOOGzr8xWlgJM9AwVfB4Rp52kg1hay3klhHB3/AEO89BaLu2WQhm6r3vGnXaTUC9Sd0yepkj2ljHi593qBVP+AkS0kimbNsP91SLSAbEOqmL4+I9QBxtWIs+Y+hAS+/gCuTGiV/Gep/DNbZIqPCb6Lwr8OAm/i9ktLAqlNyqYRcnB3Zovtng/V2R+dlqx+6WovRShFwSSBPLw2+2zLG72ElBOXl4QVBJ7F8dDe5/qKS0A3XFMjfyCMZPCzP4H1PhKl0tEi6r6LE8etoKY6II2O69hMjId6t4263B8fjbVC0RoBWtW2EVr8JBgWevUs2WAl6t+/QYTjVmhoRJJDgbxoquuoJnQy05r7Kw6qPX3eH+3gTBM3qiXv6yJWhA/BTsZgmDHOxOEdDWeoHR05XrjNpUTicUgcjZZmagJixzLfatBBd2XnJ8KVRZhfD/Cwu1VDSITsZBfW/t+gAqPCfBhYZBkW5TlcD7QNutgwFflh0SjgHoZFlUfCVRv7Coy3kL6T+Bc3iwB1a5UpQknOCbVKHqVPt1DbtxOHKC/IBENR11Bm2qzbg/xwPRDKX//gmY7/TwTMd9Wh4jGTBJmO7JhEJau1K42KaefZb7cR13m0ShbGWlaOJ/v8OWYoVAnPUP3F/AYAJ1/5UPUvZdzMHmTOVzXFBZrmtCT1MHZcCGrv4cX8/HFb/qPLlCO6V6V5ERVAO5XdC9fGaXDlhcjHOKBMd1pvb/AQVDfgSuhD8ttDxvaKITdvB4U7l5Rt60U/wPqB1LrBfwIACQP+vi/UO8i49OIdUdvc0ORT19aJwyw0+K3v/ByelGK4v8BkuDz/AYAJBwYSu9H+ak9KPpmNLZGb05P4+CvMD+PR2wDYAD/YByf+QgpJh5PwqsXIoYcGOClCQmvd8ISDRH+Dk6BYyjrrhZAGbQev7CmCzmKJzibe6XipfQSNegkIIfGXdf8MGhGqS+h9K4YJRBhR1R4AywVSleRmFf5+ByAEH3H7NE/gA98wv8BDO15Bt0+/YkMOoMHMVp/dL2L6Z0rYG7BgeFeYs6BfyNxbfRjzZdDqy/YKjUHua0V0N4nr+DV2mjh9FgOf7UOVP/4VA//4dBXP/ICmo5/FQL//GQU+g64VlDkJtyaRkiI5mtQ0NvtCL8EXcZULAsQ2puzoADu7CMNpuq4M/BMS+kUJSsc9z7DmCNm3an0qkl2eeO+aanxjf++Dlp//IxrOD/HoL//DpZWDT5/u4pbe/ycK3+BrJ1/qFlM4nveDhMhTZPY0Q2P5g40xr/O7tbA6ZC7bpHTf7utHdETg7rxM0Yg/rgGJ8mg0x8ORkG+T5QBBEh+g/xCGfkIL1ZcC2+yAT+eUps5r300Brm04a/8CrdQE98P8NVrs98w2lD/w8Cvsf+uhovQA6QB/xMFuEwng33WYlsmUXuVJZMMjtbL+VqV1/L634IwxRM2nebZRGsPATmu+HfNubEHvVvJa20DugvK3JYV1/ngP2GwstQv8bIaO31cES02pIp1G0unzCEK0cRBSK0a0ps/QmdGixw8VSHs73UfwoP6fwLs8/wSP5/wxs8nXl/AKzee7vAf4oOerP8GD/koOA/4UH/JQPb7v86D/ko1/joHFThbI/x8M2zvj80e0XBd3h0g7XOPZNYUc7O8l9xsJtQMsvu7NJpO6ZLD1yTSiMMBf1QOUKAy0m4ijnRcBflgIjSBc/bdBBWFx/5AH/Jf4EbRMB9ee7izWeNxv8BJaUiIE4gtMKLi0KvWL5a9h+OUz8eRONrrnXDNs9lmCW/+2qcvU2QqRJr8ALD/wO33/zES0f4KEOukv989kjx064XSJivwACYbPkfVzcv8IGLZepCK9bW3P8BgAml/rxF2NgznP2EQTxAAB//4gni/5kQTxJBt/P6HqnJipqxRTSCVR9M1eYl3ewizIoeKfV6mgoymmBlfheQWx/bXtzU0cGO2eTh7D50Nncr1w2m/bwXUKwvLcQfnn6i8iUVzp/3oNYPNJlEfhtMwNPyqh6NTA8kbAvptJo6VwCfkO65WEXARh80jUzaDXX7rWOXvKn47wmABXbwnrS2y9CWR5Eq7pcX5zBIlmqPyIS51zv+bigsWJzHH3rT4Rfku8/kgGcG/8oJ+Oi1a9sZMxeh8xRB6gN4znH+AjFjtbP0OQ/yAMQLtx08DDhbKbRfZb6gxgBFaSOh7Y8EaKlQkOHcEkK155mIPlBnwndyEI4NSCR24/dSj2cxJk0DR0HD7HyPJEJ3kyBgnBo0VEe9kzX+AhHgUDlABIv5oVfmRG8JSWGCUKyZ62n8yX5c75M5T6oKr2bS/uYYh6h8LFYba95K72ErRe7W/HQPbd51Nepsd9eMZOeEMCTk5yAzM+XXrmJcosjIPV1UuInMnQqWzNnOhlarw6Wv2jaBz2y0sXBTUCjUnTzozSWYsrwP4Tjffizgf8CLKbLFprsGFlLj/ioZDuHH37F9eMz8SDId38Jx5v+MkUO8+e5324P9FDId3+OC24az/v9GYP9MBRXgM09v1h6e2B5tmqxW4p0uXtshyB0SAxRC5tZ4/NnMRK4xII3+AhPlD/FyWecNT4bCXG6kQn+Oks85DBBL/2sdoK/w/wVf44TiPP8zIroXcnkf+llB57+GBn5eOHC3FhI4tJB2QIeh6j7F8SOXG0aKKmZpUaKxMr9WP8+Al2H/gxP+2Aw77/w0myY/42S15gD/OiYcp/FhuHN0kJzGgWKAHIMvD0AKcK07NUdWmMfE8Wdp0xkQQOkSe5nkzPrn6Iq5rYT1VVxovI8QLt/RyVmDhccOvionDPrCN/5EkolH/Fyh1yNT4d/AwXL/jpQ65/v8Ll/v8Ll/0Ads7lDHxLRpzn+AgsFXeuiOEPsybwoSuv8/J9oh3lH6UyqsPMrTPD8P7OReKXjDSSshu+YHAVnEjQa5LhULVepAkJ7ssHrHKDko8p8kWDoHaljePjQ/YlNcGMwfhQHFHcX3VgzZVcS3xfv814/XM/BOsOZVV2VXv6H1OC0kq/yuj0AhyFib97hv+IgyX+FDZP+BYJD+ITwcExLjehBIYgMDFp/gRggWlmWcZfJ0NWA7rbCxObD4azf/Wws+UkkvSiksYkyl/gpSEA/hMCA+QKQOP8GGw+77of9QNh9/4TBSv4fBAL/GRTXEZE+gSHLn/4Y2H3LtCVSvlPi2cCoP4fA7f+aEsPv/jpJhqev+pJMNRH/vpJhqSIf/XiTDUR/6IQNXPrB1akuflbwsYOKmL2VvK7ry1iWQNbcJT9crUYiGVu5m6KP8aHqrQ9/hpIcgSx7PKMGlV0V1+bKgR9KXDVIktUWEissH99rrIDA/hMAc/gW9JEP8KJBbH+tB2YpMS0D+FQCE+Q/R/wUYh3/4QSHpQ2fyf8WDsxQfS/7AHZikxLQP9PRGEoHYH+vCg0SJ/iv4AVT09KV4PaVuSKCRiMu30AO0NWD/HQr0Wk7ql/HieuBNvd/nodmKDWT/YxQaIxfwBCwfR6ZZ+QojF5AWk/kj/l0CP8pfScym++8gF6lRFM8n/BYoNE/hMP7/hkgQNeuiSbU5/isJD/1IUGyf/YKDRP9QBbRB/EAcH/qIoNE/jAVn/g4Cj/0wUGiA5oJigXXUv/DooNEAdiOPgaU/ksFD/i0EjW5egYBww/54SJVTIDO5MNouGAAQD/CYTB/i5LScAP8TKUJ4BJ3YPJ3AOCfwpOX88JgGH+LiRPCTuAfrcyP+SqpMzslW120SnjDQ4H4nxpBUvA70mHH+Kxb/+HxB//HyvA5/BpwgYt6lz1W//DA67LApQnqV9s/+eig0T51/4desB3c/10UGiAqB/Ho9+AME/HA6//OQy//D4P//FhPGDhk2WtBO6fy8Mvwn8HjL/8RoWH/CIFGW/W1cEzJB32z+zRSWcrLLUWibxv8cFc4G2LEeoiZbfy8Mvsn+fEu3Lg1cdvjP5LBQ/4tBI4Avi5kIT/+KwYP+chl/n/gMZfTTUAAgH+EwmD+Axl+7mLPSsYi72OjF4AjqSCXHPXym+T+FRhnhkyONZgcBwhMtBdMckFKh8ZhHBB0iusEtY6PAf4BASAZnNmeCgFhwBgFROD9I7/52rU+XNb+FTV0LuAAAPP4zw8/4TI5I1mB+AMIv5swUF8/xcmYgGqRkf4KBTRAi2YP8gJmIH8NorX8etjP8wDXz/hIGNey8iW2LmPB34P0q3v5RzeMUyG+gNJDNnDvYfuqCNwkmj+eY5RbSOXuK7mVyTRvcIwbUYN+f/Bh0vgxn8x3LX8C+Bf+OAjYaN/wgMsbI3+vEoSp+AdmrLkZScAPDGX5vYUX2UIlZO/KhgUrMndxZVGVf2VZD///HtAv89oFNEeyTP/woKc0BLXp/w/m7g3eGqJBDVKJOU4UEdUxDwE1P/M2Sy7wNS3i00W/ScjYGupENW/+Jvg6xwsYzkDSTWt8aGfrs00JWn84l/jVUYEocCWoBBVrFN7CyKygsJrmCms3+Aj9IFZHWekKR835w4PIfQDT1VBeB2WYvBjQ5JjSliYJuInziLpkKRMNF1MCmHNW6c0B1R0Tc3btlNC9kesTpsdVDlHFm4urNpWsrhVVF0Zn/AQOe7oJmU2hUzKC3/gIdT8JOEnsgXG9X+4JBm2eiJaYdhBWWje/h/Wh1OJLq7poJ5O3sm46hDmp8gxVSHDcwc3WM8yjP4A/rpCAD5sIywusXWyYk3sE5lRJl9+GoUtJPn+AP8dGFWiDkP4BEWDQVMDR5/XqLEgZ8jmfaxnT4qoVjkr5uP0tLEEAV63rolPhQKuOFIQi1tqYcLSehXu9E5CeXfqDz8B6CUGc5U28eCH9Nk632mBeAJePRqqIEk0hJvAT69vjxVsJsPZhluoYZ5ht1glM9DpRtUh3KZQxqFRS9AkT0XziJJxwlXKc9GZdJTA/xcma4mqRk/wKMMg3OYf5MTNcf4huPPPMichx/EUIP/DaIsHfYJ/gIH09460ouIxjf6iO+/e0ICTEHy022AtRtelJCHQLHVA/j5M2/x4ntXTrp/lxPauTuU4CfoROo5KCDh1UgugrPYgU3/BhuSsSvH+dEw6r+APX9zxNmIv8BFiXy12wnfFqFh+X3+AhVznFSy9cY01bg1wtd5mmUI6HP8aD5Mm1/BrGz/iYOq5hkhpYydoFoDAz4TAHP8IGFxP8EsbN7/K7GykB/AgfwqBCfIdo/4KHVuH+CGNkXb2D+FmNkO/mBjZ/gMCn/n1jZDcD/axTg9/p4RJ5EdrjLf+PEw2kU4n+PWNkM/mJjZCuCjHPeMJuKDhR82s+OcAdoGvKkpWDvJNMFylebg9hdVQ/+i2Nn/Kwd2JA3wEqQAFkVt//4pwe/ivV+Lh3uITuo168JNMSaIxh46vqyodk/x8ChC/y2xs/xCHB/x+xsg7IGGD//jg+RtOyq9rT/SyZuZ/JYIH/EYXG09wRofx8Fxf5+N/fOD/PixG/fx8bDgOB/HwxP/BnmGzmfWXOtixL4/y+MVpZZXV7Qohoz/FYA//OYC//BBzQAh8//ig1ct/gj2tF3H/MheYjf5+Hlwf4/44v4/O50D/XQu/f/RoF//mQ8fu/j7EmASA/itFH/j/FG/jIN//kgA/AQInEP8fDxiX8pfEX82Bj8/lxNz5/yweUnfw+Tv/w+T//6WO6Lwb3uz+k2h/K/xkRjmY2dgQjwyn8vOjPV/vYqBe/gai00yCJpLaOPoheSvLFNk512gwR/kCZtOI7t3MFP8BKev9ftizp/Ci2r/AdwZtv8R+nfpCzGIP6MBgJ5y211GIz2XQwOBv7nfZ3Rr6S5f/DbBAjJCAUDJpkGAl8N/k5VbayS6L33nwUH9lo5KMKtGZ3Fr/wEdNCCH+SFEb3/HCZCfBf3/tMf6SDQf4/AP8cDZqQ9/e3AE5jHXDUHHVtwtmSRg+06y7qSQEwv8eALTx6oT/goJDz2HAFCJ2fSf8BAq8nmaY5uNfwBxLxDVA2w3YydYY2JIRG8fxpqEcv+OE0m7H//YafIyIGMiHfgs3gVieaI/+Aj3S6J4KLf2bT+N0C89H2FaDemg/wYEjjG4QRamqBK3mwBaKwrY/c610+xDxt0ZnCrcJaIiaAX5Ag6QRmswCkSbE7pfFfIYPwrunDTTuf+Dk1rH/YgKV584KDuqZSQDTJ0SUd9TBmjG8ceik2WF1oG+QSOggYD/8SJXe/P3sxQbnt+FyUJ0oD8FRdNnp4lPBjiUMpyqsgqkIPfCjT2/W3/spK73+tr/byV3v30f/wErvegOt9gZ0BMD072x0sJrm4Asv9LG407e18HZIMsbb8YGAYQMxlrF9BhVs8WqjdrIW3AIxnIhZfp7Tzt7S80OX0f4/hRkF64JDo8PuhXL7bfAsDFbvyv4B+D2mNJurICBWpbfNryrrFUVag4QHSTLVnDAvz9Iqq+lpNjTUFq6eAGeFBXovBUnEz4/Af6oFORlPXL6aypJecgJqv+cGIwQyakh3CKEryq3CU9tqDfpA4sRw+wPkbd6+19rSuzYD3MWO8jJw8G76zQ6lLa0f4/A/2Hq01RY2/mI8PKnVjzXN5OcWv1hKO0w5h6AoAFjR9xJUWNcOahcjzYWwCWM2yYZuz2bkZCptvu0HeBMmYi/8iHk+X+OgzlcF/8Yld7oHhB/4CLU5s9l6w895+xwEKZw5Ek4SVVEbbp4d+/YK/0nJP+BhKT/YiV3vzW1/t5K73DEn/oBVUq8fX4SBqqGGtjLBQ+dxdoRU7xXEdmydkSz5ciyZ7TFFsw8R3y9TTe0ZurGjVs2tjDUxWMibTl/+AkK/9ljd55JZ8vcv4/DZCObaEOSsf8BJjAHsipQKakoJW9hGsSIqLPxzCS8SCW+fEIYdKasQFUwNzmTM2g86tqofHATolXeGduFtO7cosRua23/lAM5X+uDONkSk9HLvvT1RsyBqkefLYmb4Pc/npVzBAAGxktJQ1D+FAi0/wUm8YRp0WYH8BgAmR/jIXTd4Jpm9Pq7/x4Owbg/Pf5sJ/pINjiF/4FMGuQp94LsFYfhPrKklLPU4zO/teccSMp9/vBHjfM5PQJ+2z/HBvRvFf/sB1CfJGmWO9uIbTTeRfsHwaTq98fzjD/48cSq47Rv4F6IuiuVaiq08EDJCawqF9Na79foUmbT7P4dGnZWpkHJrO8H9f+u3WEfvgs34dis9n+BB2ykjnyn/ByaSR/BIbN/rxFF4/xwLtzQHzmwXbmBv1Lp5c6DTmpFzS7w1vCvKqZ4/4V+6gH1KDDJJZoa6EwAMBEaXZotc0Xmp8qJesFUOLKzARFgvjtggTsYQCbwv8gwPseP86EnR4L/Dra0Lgf+PiTo9L/Z69LL9Qm/bbrI04CwqyU7OBKz9uK+gRGvU8+1JIoMCCAs4pBzLLDxHi6TQsvFhAgulfdBWKvHIB6w3mz9GKb/JCMu7/g4EO/6f+/0rb/TAejv/AfuJ2cAuyEhlkbOu51LDk0dbyZo8VicZFIOEDKgYpywP4SHNfwoHQ+H+HBro/+GBwdkBKNgunE/x8mxDPf/Uk2IYk/34CANy/+vE2IYk/0AhtEf8ODsUX/eweAINMytzXb5jRXMI8ugAlc3dgsEP38eRxa1SeJgQVUtm515d693n0dJPHXIr8/26538dsEBpH+DBN7Hdt//OhBVUCfw8MWg3+PhZONWkmLQpGzUbD5WHQYOf4AE7Vtya4KZm6vjhDXsCYrw2eSLsQ0Rq+0dU++iGSt+kcWaz73SaS6MneGel3cSLyL57+0Tf8lD4f/8BrCHH56Ik/qrHk7xKoCTJMiUVqJM+I0a8Uy6tlTwqcZKMeQSEI0weQ7Ez9/d3+7HsqXXV1UcaD1/BBcKKf3sMp/XbS/TBqjRvwns8wG2qOJiPRUBx/x5MgTo+g/4KCJS/PxzCNbGw/BIYpkLkmAAn8vX6lOiPb4lQ6tD7/AQL9SEOy81kTe62R1oxmQx3jnOqakSftUNxVL/4UG7KP9fDacn+ApMvk/h0aSfP6pGkiX+fhpKZ/l4aSJf9EI17X178cRuI+fuomGaOy5VMebCeFxq3N7YZhVAmSzMIhPgRTbfwoESv8CkdI0c7wD/AYAIE/4yOKvJxGxB4UKYD/HScuNB/x/lX/+mA9sz+A9qX7h38WGPeCWQ2MuadG7w4rwUUHmVM+xXRZ7PMixZLUqEm/xgj294v8CY2v/uAyX8bvP8YAkx/8OJzCAWFc2MFehf+Kj2vUQsqAGccD/Eh7Xr/CYCaCml9BUsouUN/AMKF6LeAmKC49XL10J0ik1AQAUUh6TpJf4KFJ/3QjrbvKulzp681wZ2n/ASdfJRas+NV/8EJhYef3sbsj+MtYaJHkS3QOdLPb/KPDxtM+sv/HgYytyLFzwK+TfwAcwabL9ihuuohP3qN9s1y+oNQYjoy7oGymwHDlbdI78VCBZ5JW/FolONGDXdGgRdv64uFM20t/g5N9wgOUV2n+AwARx/2MLa2fYF0WigCbrKi83lIA9+cKq8zw+w2nOz7Cgc/BThE4nqbVP/CgFV/gpJBmA+/5yf4DABDR/hmOUkmGECZdI/wUno3/YNpQVtwvrhhRcF4wTfhDc3EcYPE5mr1w+UD+MihzULfpT/DnSjHfzT0ogSeCPJTwAWztb9rO2HQ9hLOEALDUv2gTyh1p0+QrDfIjn//iK8B/+4E7YO5f/BT4GOi1K4/w/y4zq9JyguQHaiECQzHWcnrXA/DZZaMD8hy8opng9/AIr8Rx/ZPZQRmgr03wgWpC3I1UHYc+EkgT7tqgB9MM/ugJ+xJ67P9nrzsJPq8eOrE+Vad0ml1O85FsK+z+PmyxfJOKXqeEu9cSsWtPFkmWc3+Al5uA/QzOpFaXSH1g0n1xJQofdeTwuDE3rgOLc76R/wEFVuDIEXvwPORQXzZlhGBnzkfJWzwAAy766n37/oKda8nBSN75QnZ4GrsO9gTmS6Mxf3VUrsQJkA38P9KI4yOuWt1CrfC5+VqNhclmEhF+E4s0FS6NF6mA0UvO7LBzBU/UYOVbw6wRPpNBXVDtVOwTHUTK4EoNIJOIGTU+VTCTUTkiYgV+gX3QCMFwV4kEp7mUcWvzH+AlKYIFDWdIACC+EJ2XMFwgDrynmohISE0GGCG0lLZ8oGXkghK2OlmDWWCT5Z5vwaCjj7aALqBVaurw6zV8f1fJ+vBxPcj0UEA7/AQc9IEnklVMKrqSEZUAQ6VOUz1M6q0jSnK2vQ2g850YnPvmf/jo2cXl/7/U6v9MB8EP8BjMoVlituBcrcbCa97PjcpLFObg5l4qNe6TUGLFrb9tEXfdwXUeQnZkwnG8xjw0sC9flJxqPHfV/g5PkrA0wDgf4DABIn/YATEb8wGOIBwnOoWGOUJ8cAuXHrPV23btdkmAS72AVoDDyfiDS+0XRX8KAVX+Ck29f+IRxADzTDX5PmigP4FHfBQf27BmsFUD6yGPpy6yX0K6myXk3bRu8G7mFZU2qzD1+oqz8hkoHTTFR7kZIjnZNTM14C+sD+CHMN4/v9eR/klzDFvyBSQwYNLur99phKh+V5yeHGD86x0VPB3g7IijDxcWomPgWROlmAMUJL81zlUF9A1bh/ZeCoWamjv/CC9q/QNSQ/wGACKP+wE3SL+AybcbjUpHOHMVb+V3gwTC4ihYcgVh/bFSNkSttv01/GUk3dI/hQCq/gU9S/gkph/xgdTa0DJhI495f8DBUw/v9pXUs6p7Dg/N+4zV2TFU551lxVMLEDCMHnCzby2fwBDIFrgLzY/KA5ekBdMYBBBC2qOF850DnWFwgFrEm1/AYDF2WFNy1qZImnPhuR3bAvgmTLPt5cDD0uz1rbv/4SM5TfsjdimeQdaNGXFt6JaCWAst3EgAIKh1Iw9kHxOE4XLz81ONpo64SQDRMoDeiJcp8soE5vSy0j/wcoDSAaOd6H8BgAmx/sArqn/gMs0J2iGMbu1g2cNjfNejk+O9qGCY+HShwAtmo4qg3T/wEXDxvGT/CgFV84KQi8f4JAqf4YBfF83pOgAbv4GEIytDb3w60Ca1jmuxi8n6KitnOqeEoACFs8mylnC229UlRyIIUWx+XZDchJGGFyfXvmFshAEO//BhmIkLEf+FCV3G/6pT+E6siAuEQQMEP/gzLbcMZcghdFhEPKC2/54XGXG40D/xgmFJAuZxi6Le2YAfwMYmlxUVkBvJGOBf5jwly0W0yfOnC58vnKCwpWM9sEWe9jEVkNsOw44FjKHCkdcfHflxRRqV2hpf4DjzPv+CRef/WyTBq49f8CmfBl47fd3umg2ejQChiJoDm/7cgbZKppqgGdKlNxwN1nQI5/jg3iZw/72gr/8vA7JR/D/n0Ha4b2/wJZlfZsECVJC5fgwijv9DHqVaMIPp4IUsAJRf4hhyLkVc9Rh63/jgPDKC4QE8gVB/gMAESv8ix03VRUy7LiNiqrQnXbXsfca6dikXVf46F+IbAOUtqVF24uX3fLZghz/rNBSkqdx9uUIlO5KHhvZDsJfRyIMD8hkLjfYQDmyg6/gMAEQv9zBuff+ClEGjG/gMSC14MBVjF/4gJnRf5+O9N/4xr1f8gHee4p/BI0wdPSPA2vtsNdJTvMKrluGFK8M9jEKayaMyOjeI4X3V7ezfwoJqf4KPIf/+CSBT+GESJ/X1/yo7L+BihE+kNesWi6M6pojx/mdT1K2xHkQnzSjDgQRPlINmg92FDM5WY1pqHzePZwuyemEZg8/h8pwtW/gOKmHIQCcy+N/z4n2gdP8PRUyx/D7jwJP8AUzhl/gIby7fuotwiyC0TzdJAG90ISHKVDjqbF0RiAPXMBmQH0XshXf+q1mb8DREqIwanZ/wEMhW8g+wLMKZCkt5Cv6K6//lAg14+0J8tIRJcxvezHfscD4hVL6BUS/JQyANqstBldLffsQD/L+FUkv/BR2QGrR/8EQoCr/i8OyA34D57p3R/CwHegWMQIHrabmzWNMpGlIPtDMPrAOUCWYkcF/2AdkB/54QPGwPg/v9JL/00dkButiP/wUOYBBkd+z/loNpklOlcwf4Kwu9YbHP8sDIMEE0Qegh/wTzmQV8oqn/nQ7IDCHoFcWr/LR2QH4T+w9fwTDNKgCkGnPnQ7IDh7j9AL//OyDi0azLAmf54OyAzMXzP8sHZAaCEG6xZ3+djsgNkjiv+Cck8Dls3tLP8tHZAaMQhn8FMclDjDn/D+2SDQ1/DmSi3qK6an5iN5hzcFH2MCK+MWLx1zBFrDN0MJSrP4ShuRzgD+axrcS+WT/XVTKPNPC+O3wJQjbQO8DARHAL6rlKITfbCRG7+zVs3DTFiSki0ODmQHuwjvgK5ePCEZ/gHrqS7SnAmEsI1OAHx4lOGg9ZCpqEkoIONcgKsOSLwaLJPmezlppEv9ZpKp95lHuyxXZD9qnt03azAzIvPrUggan515I60JC7A8POyOOHKchg1QXNT4jAe5usa1oKc49f5APU2p9RSCmeU0u1wrX9sx9F3WemOUB3oEkqCVOEwySmPr/ASgBWfkDPn1Yjoya6LXUjzXNvEzbEgOlvcHO+2almeEH/AFCXSI+JRNqGw1qJA3xjFgDzoCoBVMVa84BRd3Wd3t0q+fnZJfZLrDWWGbheUtf6hsJGB+AHwLdFOXtzo7q9HLfkr51J2VF0w7aTGHEhtHn7owj1dPwrxeCHRPPATMEW5EtxQZJW3wJAXbDuDuFNNuyjuNk2TYh4YA5Nl8VVr7zR5EBlVYEJV7Tbq6eWRAzdE5D65JBcXrftBP+DlCRr+CSH/+WzVNsjv4eoZX3+qaGUkv5+oZbT+XqGUm/0Qi94/aOCtnsDkylw0pyn26A2hawSxlYyRmO8SlnIR3g9VAwp1f/DIUq9fwU4cf6+KG8/4DHOJLlS9Hl6Jz6Axqw9upyf2cxI8tYx9KguQo6y5EiwQ493r/hQNA/wUlICfxCXfgMIdNTY1y/8DK6KnHMTDsCB3bKw5YOipp9C6nxa3FdH2VgFtYFE7QITiWGoTp83WrNOQDSdsPrZuztmlxsHVD9/g5DjVA/d5Ff4DABB3/YAQ5i/wG78q+OzkKAKWg28cogz0OX56GkA0/NDKtpgPbnzIn88MP80f8KAVX+CkOI/+CQKn/GSHEfdhb03cv8DLdAt0p2wxN+miww4xbuEs7mcZP4L2g3iq/4sXm82NBLMgaFf4qJf0/8GJtZeT/BRdb/rwH09Tf9jA+nv//htH7+gl3Zeubb/go7zDOJwHv9/E3pFg3DFv32MX+SDGX+1YuBi1oiIfue+/AK9LN1bQaGXz5/7d5HC0AZF+lVlBhp+cKJNvWgtXq9hKgL1e0vhFMjXB9mDqe93O8ANULpNxLQysQeaWS4DgHSqlnfaELPrb7soDoVZyDNzpwuqCodoBZB4zr6Y3lwhbope+qHBRD1E4q/sdUw/s2+CV21XX3nuwjLeT1Sa+nhAaE9fCJlT3oGAZRPiFUHMONUsCMQweAtDrwKVtWOG/wBzIeDoiaEDCsBBPywj/kAgY1PGuspfwENFHAwClAERAxZHQeV3IqNOfyfTPWLRCJSzhcNzCgoAoSC9w0vZ1Msa3s0iop/r8hwxhIbM0GTbzg3zNuq0bMJxT80cyFIYSmTB76hGMn8won9PpS6pnbz650ACT8A6oYNewTR6hgur8+vwvmy7fMKSPxc6/JBKaCKVnQbWN6pxcA2nYK8DNM6ckXy5kj6VMRpOi54xH+XynVAkiI8lFfXS60eRqk6dQB5bIGPKbvAvenOa6dlUd1jjX/BW/gWLJXNWgEGvgELpTo/LwuqxrM+aoEZz7N5rHvaDdZCclXY20knOVOZK0RpJKtP9bAAkc62iyDu4P4RG/f9bG6wcDC/yOkrei2//8Tuf/6QJ3Pyzp/gWVKZFJJlmExVvC8Z8jMEKlzauN9grw0cbNh5LFn1WiL4cnPv4ZCFHv+Yx/sfUAhtP4Gl9WjsaiaXpAhWF7Hpk5aGKVvKUvNjm+7hsnmBiesPMchBnn+FAr/w/gVmm/UG//gMAEx/4ZrkOTn7gmDCP+BnuRuqADr5W0S0vCnMPIDf54AtQQNrf8BHy4e+Jc+Alk1aGDdZuX/og/xwchMLv+FiOm3KKP8cEdNocxkPzgJOw3/QLgOn703ymAhNIssoWC8KzgQctJbxWqGvUs/N9HITF43fw/vCCjA+sRMvAMLj58ikY9uW/DzzdUucFEuHH+AjmNSv2iXElEx926NJCZsmsNVMG+J8aEkWYJz2RMnkjiIElFtVUY46rT/bA/dsIFdfpMIHBR7yvU9znzdE3MkY6f8eaz4riy/8CtBDkb7XMlf4CQbYJ4b22bB3gcC/2eNOyCm0GcMXJ/06VMtLPYqf8bJIoSf/hZJFCrYnz1ANiMAA4rNo24jP29XABgHEQyk/GVFeE6mBjjXR2bHM99mgCfwQe7/wQGSfy2Lx2Ox/jJKjfx/4DMu1D/3RswVdH+EEYk77dqtZPhM+T5LmMJgH95dYUjPtqes2GROkBCH+L9vbktFgf8YJA5YTCAi0Mln/dguqWaDbxu/z0kDJYV/QQIH/jRILPdvWoksc6/xwOKTkaL2jn8SAgehYpozbutNO5lCmjw0qLk80z8+8f+CjwdsFhAP5NUn+AwARQoMCXd//Jw8xMzHuCung1jH/iXItW0fwBRg0fhUQeF/IuRpJVjiEvjwZy6wAMAujMy+E7yK/wENpOxGs8xuGTwJ1XK9/gov6iX/4IQclf/GRDFyk4bQQ4h//iwFDqbIIBGWEApQj7ILLaf9TAodP+IB7E37ebT6x9TaylcmU23MNTHejbRnPmkBLsjjYQGMpGlWTQLBf8dJHaVbCAkRCbc/AYAJyQYEuPn+tEjtL67v4gOfpWevyQbl8GKtgHvj7wkyRRNqb9itEWrkk8wC1vypH/DBd722ihw/vePC2CPFEko/Zxy3uNd8vf/n4mld/wYkdo/4yOqtv4ZGdHr+CiYZ/lkpDbe8ZAP8dJEU5f/+wE/l/iqU1/h8pyBjjCBZcAk3g/wUpCw/b7s79UbGA0lqV05KUs6sZINweve4LuNOLFjrcznK2YFC5/iYGWPAvkEeP8NAyZsGDjj7/ihK3MgcgMsOXNrLD/18Hfr/wMAzf+wDv1x/qdFOGAD7+HDGQyawj/zkNsCpCeh2FlC//+Dv1/8cEcEX8iAwf/Lg79f+AMC5/6EHfr8GB/jRQme5/8NEXK/+Jg9JCGFm2jt04YgDA/xQHpW/4OI+1/8yEuBH+YEsygCDV/+FQIT5BP8GET2v/hAi7cBqni/3klmUAQav/+4SzKP4cA3xsfa2mKQvOOQ7AYmtIh8nL/PhLeR/8JLMo/hwJjMzxDlQxW/5klklH8CgT//Nksyi9/gW0FgalCLMPSQ7nihtCRZ8Z/B/HSMqEnWr9agEB03qAjh83oP8WBkXP8COlC2M/4QHplCnP14Nx7v4DrUC2KL/DvsBKe9q61SdS8XKO8PNNryPxQoPK4vMvO2z/sYTPwAP//Dce//EBM/D+DaNYLAdN5/jqT1u0Gi7Nm/iDtiv2hVco/DBjaSMqJiSPIoVaCGGanDwPhgPzlHC4+p6m96kGh2RkKf3NswnolOD1D9Rh8lLPmw3bJgTNwDCRbYVEnmwYgqe/bLGJeZlyOBYZfapBYB4IBoPD3D7cCxBDNrAyo8sRcH45skkfb9KJkAKt47DjXpYJY4gXnOetrphkMYYw7bq/s8IP8BC7LppmUv0LpRNcyQ6j9v60d2NHKcFAopLS12CQeo3phqQijbtsGLXhiIKqCh0QdT5om5+fZTwAp6/w/2xAQunoO/ilMVuBMT/gJIqH05QPktcscsQazl7BEESyJXehU9DbS4+YafIetYxta8mUZYXG0jawkhgKYvSD/BxDzMPoE/kuXTnT85jV35EugvO+Gn0bTZkZvKOA5rn1fET3/ARgIBRmaQgzLVOgiQqHY1t9ai6xTwG7+qodaJVwZcD3+AkIUVclTkhCAwYJ+wa0n2pcehHy1yStwbJS0EM3wHS9DZ5y99shkvuGeQxtAtmaD/wEQNnSJPpEK8ipKxR0+3oCx5VcVU3jQaDfwquD38DRqv8QNIQFmItkQLXRAYCn0LLOmyOD7I/WikKPV0KvNv5Fev+DE/ZGehAJa7PmEAlrtAoMBE8f/FxdKZ/oiGITv4D6xYWmrZS3P9vJaiXojy2Pex+UUAqv3RZscwfHf8BKlHVbtSltE/wunHs0IBTzm+QVUVf5OFTkf8TIeoR/FKcfE3czHzsXcg4Uaxo8RP6Mg/Y3NJy/Q4sbhRc5S6keCMMt/w6nH138Fpx/+akicKI2+F63r9WgP5ATjyOHf8BE/WKSmq4QLA37i1QpyyRinYOjtMVJfYWvDp2T0gp/hgJg8u5Prl2pr3IzPEmc9/wExOt/nxKTcDAf7ASJwL+GVk4G/gMLmwP4DC68n+K1k7/EyPWR/ECyd/CoZL3P8Bhks/CAST+An8BgAgbBgw4s/4sWxisP/BQvan/iosVQgmTuPjGVMt0AAAGgEV1hdEQ/xwL2gfwmHY/4QV2JQIAQABAv8VKm+P+KFuzn/Kxaun/AsM1Gp9c3/r/QhXPMXdJAogOI8THE2fDlp9TG/5qmiCBHXiC/4qUHDgST/wFKDhz1CAjY9k38BgAhbBgzx+f4sT8RgRk/0In4jAP+RY4019dIWrF6yZJIm9mIEh67VUFvs7P8MGuGV9b4rJ/ICIKTZNjTwTkw5qr6aZUeY3/LBPv7/j4uWh/ywhrvbEUnXqxru7/DIA/KdTeAyBV/HlbSdd0l/DulaPUH+ujijzAmBbmlPgmvrtwkIb0gfr6zzzsVhnZy3ReyEPysq8QJcNQb+K5uv+Hwd//fwjPT/pgbtS/iUH//0QXCQjK//g4EZ6f4MCiQhN+UeRs43mzpL/FBcJCVEBdqw/hePv1ATUO7KHsI4JJbof/igV+O/hIEK/xMf0XAv+Pi4SHAPt7J5OoLRWx+URZIeKuEJHh9z7T/wcocTfwUz2f66N+7P4BP5yQ49svo8kzcZHh/XnqAtJCb2HhZ3o6iCfolqFhcRtt/No/hQCq/wUoatgipi/4CCkWwRUxv8aFI7ie1iErMkZ/A33jLVf1fRnykPtqBGnAXxQI3evquqbq1ez6r6MTPtu2qbg7NM9wlQ7Lob0lD0Pjjy7EFZxA8XH/A4uCAhwzQgMretz/AYAJmwYMNQH80C4P8JgSv9/i4P9/i4P9P4uD/f4uD/As2jNRB3rVWPpsGP4QATB4rwvLLF3+YG8/X1pYjBiHy/2N7Y6OFVsp/Ztr7ZMILn8xJy0by7Yt/z4OBQDv4JFPf5YyFFGavCQA/hiJMZ4QClA9Fgsu0lP8Eyq3r/CR7B/iY67e/iSJM/ho/roP9rKGvv8B28M9YlYHOPSnDFfp9BRXjK9pIyT/gISA9f4He8ZH99PVrq1ruh3+FAbP/BQKIuCLoKc/wGACf40I+3HdDlb60poD/HRUJxKf97W0RqIgpBjBom4yqubtZE4yteUVtLm/x5vKvPYOP8C329ErSmTEp02gWp+BgKAhEpDMLn755FPc1UaZsT3nVbh4JZ/jhQ9T5P8ID9M9z/L+kOsf/+K7qP/tHQ/rC1U1/BP5OBy9COEH8/6Q5IIMujtfaCfEf4Q2BGw77HgKudGV4GkDGoym4bu+7I1wC5A08AkkX66QDW0A57yzjD3ZhkxeThpUPoOw79dL+QhkMQOX04WXsIialyud4C7NLDSoycpCDk7jARobbE7MsvtboMa96MClLhJe8ACzBbcZW4EvZqD2ICcurXfO6MEnJgaTtrvVCVLc2dhnDnn8WBxy8irxNQj0qJY7og4ngVIHRVIqvf3wozdzFGwXQ5pXJq1pyZV5hjjt2Cau3UekFP/+QD7/nOf37LujtKY2dj0hIjMtne+k6tvzmfZbn16+9gHyRRitVkxnr27DHkKecQH6SA7d5AFIvRKjbM3Pr2gDFtGmhj0TPhlz8GZjygp/NVb3YlIs7D4jBh4mlFnCMH4jX2xhh+Me7hyM63ImW/0KktmAGgyg7RWPAzx+grQLB15W0SfSzD0QhHx9WyXb+R9rkWp79DosixxKfB/4AFJ+zLfK9OHsI+cTS2zGmmWlNfquGOIHWr6toMXfq8sQ8Tvdscb/txE/B/hVcY/wYoMG1DCASXrbP8BgAnFBg8+zf70UGNk0R//0iTirH+NCphZoWEeoNUC//8iTi/w+CP/wL83UrRhrECA0T+lOE4gZL6qhJhS8JcOvIBT0cXQ/0xeC7tC4gSMm4DOybv4oU40TamiR0q2mV43+DlN7j+CSY//WSbdGVCwP/A34zU4FdjiGGjaiz6qpLtQ0GPxryNxa9VQhi8vL2mT0C+3fjfwoAVX+ChIwP+ICjmGzxNweFPZgP4FzKq3CG5czydfeMaw0nm0sM4CdLRxJp5SmwO4bDwt5SjjSUrhw3mAmYZqFgyW8ZA28kUIqqWykn/Bxd85NZQR/AgAJ/AYDUNkeggSxPNa4+k0i90wKhKAA/m+nDg0XfsqbAqH+NE20PI/gO8wVD/3QL+P/go7uZ/he9paIQCgYJy/iq9p/xMyf6/5QVRMj7s1z170pxJtK714m3E6RQUu8XCFvd49r0j11bjc4AcELw4384RJJgYkv5vProgXdQ1+4hlph/ggMGXu3Y/gDABP9aGG5bOxrDl/Ax3pbjYlnTE4IRXtAL+2gsrKc5Bsw9IiBFgxBVFcumsi41PQfwoBVeH8ChgzGw+n8BgAnOfwzwxM0YXzDXz/wziBNMIBPRbPQX7Bf52VNS/4kBmMMMjpqjyKXgWPYAtD/AQcfXuMMzK8IP/gxTuH/gceF4T+Aw861ITTHrddYyz9ALWL5LckrQoEqlYan9VGFdJH+Abkpu26jpg8CxjYg/K7Xoia3LrxWjc0ojeXI6WRUiMIYwPwG2in1ARv+PwqdDSpvzlKUN8CrA0U+Gs2hAJ1GmigwEv4/4nPof4qp6AAADpPN87Kz/wEVDlfwQFCX0UoDDPK3Y/Eb3l45u9hDKxbmGWGVLmDm+JHxfGCo4jVXxLgnhFEJS3TA98yQUIoYjnG1Bon+DlYfX+CQdr/WQrWJeQ9ZfwN1z4QPHn99DPsDFv1YWdD8X9nzijnfHUpnKdunIOraHCLQ3X8KAVX+ClP8X+IQoQCjEQyg8Wq/gYWMxvMoxAoyoczBYnFCcOiE5Svmz2Tpq6f0NAS8TL3oRF2T/zEwiCD/AsGUKf6+DUUP8LCISf+Jg8rT/BiN2iC0X+wANTZ/woHmEfwmFm/IFIG/8ChZwJ2GH+kEFVP/GCtNd/INSmfwmHL/w+Gn/4yIUHazLI7yBjP+DBZrf8Jgm/+ViBsmAAAJVFrZaCECoP4fA7f4Epmk+Rmy9oGt0sAIDPuxPPdHrxa8UFf4TBuv9xIKqf3lZxIGEXurm6FvHVIhzVQaF1mYXAY7D9e/A4hB+Og19hgro2SSCXWpk3F7YlRutyYqRJY0CHf4ODjrE+EBfXP7f4DABPaDATQM/1cWRJANMgFzgeVV97Ad30wNgcrNn9mXfCPV8T7B/g5VtN/mIV4/8JB8/v3nSWLZGXs9OCuFCbYg0TnfcG4tHf8A7/RGoAKDgW47K+it5f8KAVX+ClSkQEw6ah/gMAEqf4ZcMnNoOEPdSD+BnDLRWxGB5jH8cAJF1LyC+lUI2ZItHoWyqAYwn9aWVbMyLwv/hUGesf4EBnlD+CwZ48Tv8nFkHP8Jgzj/ngX9ZuQC5BIxui2NojmPtLrCmVrmFn1JSRYiUB2UMxjfklZs2umP/gWLi1f1oD/4AEP7ORE1WFDiNo8ao9AgklvsZrxhyBrc/Ca/fByZZ0yPteeqvOWGxWqLX/wEvwNUGKy5/+oP8DvG/8EgrX8s8EmOrpABgKcS//4CWQLeJQKqnpEt43RKsvPT9hkH5BDCB5/8FECUYFHtX+LCBKMMTQo1cD//xAlH/AyG7XbNnef78IEoy6FTyIE/7sQJRy04si/rf9WIEo/4TDOf4rAH/8xHBH31/jxKQ4CH3/eRAJR/wHW87LycXhfYH1wAgu0DpsUEy6YhR2pU2Xx2Q4QQDgZVn+AmILC//hQTx/gaJ+zLj6hATMuQL/Gh/Y5tyT1DER/A0T9u/vUeX3jVJTSs3WIH7zF60VJsnZSOQwarCVi+ycCde+kf44REo1iEAmUzwv4DABMmDCOaU/zsiJQf//BEvj/YCIlG8XWymWM3aWGcBuzbTiyhkAIt19naEgVWUeCYQui9A9Vf/w8Cv9v8+gr9umHyndsLMhvyGD6rDFm8PD2n4PvA5rcsrRHdAbMNGd9Ny/7+REowlZ8LCz02RVw7dCafa9Vp1UtLHXpjp2djb0yEHCoSXTwcPl4LJRvEB5X5/wELsZDzCfo4njcWLd/sGdcacnagceFF6PMSKm4GIuCca2yom7nEXwNhIT+Dm6yw7UzMYYFQa9JQUREYAHNBmJOlrvrVtn5WUq/KDoogVqr7V3CrQ5izehOG+wOIcpQa96r2GXLHXWAYft3L0q5gm7j80C0Vl51wRSJ/EcCoxHjLIrcsXfU9Wm831TD8du8+aMVOjhHI07rEaFM4mb52gtYXlaUuf70ylGn+AimJB9bRZSD1qET/wTZ8whmEGb0C1rm7jgLljKxfvMl29HmJuSdmuC6463sKpf5rJU0BR3ceDu8C3KwZyDpruB0CspDPkydshNr8mcaJ/wEFBXC0Yhc+FlZAGmiBeEgkfWY1JbaLVRGPjIEkwa2qAyg/8pYyA6BQNFsotrNouqRLtseSAbCBRbL9zFJBHXIYJlcvb2UB6FlauNSo3YHs5JfmV3FqmoMe3n2by2gQMMkcKesrJGic2G/mXElBZpdq3m3Rxoe0ZlL68YwVGQt+rjl5DMvgYBOHBzO7GBSYkxFAtotxd9Up1Pb0wMq79X2DFnklJTGQBbll4BuSiT+LW2Dx8iteMSlzZPc2MPVTAWEloYLdFhMiADDGp3EoNIU6AtBrbUmSNG6Ie2i+nMUntQTh+JIfSQigbx5OzsTHyjc9I0lhRPjqjuoeCBxAHdzBAt8kViue9AOTS5Zv++VuYZ22FmgXzYiV/gISRDCmfLk8qTioYJpP0Bbkvyo8mj9q10Jl3zAKhaf2qzk27WbNpzOucUPmzHIxG8wZmPl09hNPxXw34fiej/Ediy+aKJdLA7l5mB/t6w7UgEtWcUDv6bfJfvxtntqNlQ2kFcZZTaA1fJK5D9xgIDcl8T7RxJInSYBHMeu7opMibhzGB03p0gWaoHGQxNIVV2hUvX7vzWDoC471ZRClM4WDnYsPmNUUGeuTGLZFgkWRTfYCPlklBXVNPB/e7ny6LFn/AQWg3s8EY2QwoQBZVURcRsbD2gUDVwvL4zsNUKm+NnyBcX7FywJrBYaLvX5BG8ct1wxplU/PfS3uJtdUA6atdcdCvMIuKwnTD7NsACEfvTa/Dh8A+z59G5YmhBc2mVav8K1x4KIrtgDVRZmL+B77xO/OUpko/5aLVOPEhTq80Uw+A/AxWAZqsyTgDCWVk0XmjSvoiAbZmODUwZ8qXdnioPJUpi08HV+G3F6CQ4fL5TEiAThTkQ1GWuElP/Wirprj32ZTkcUHm1heIOZFnTh9JBXPlOkATqjHxbaQx1HB6p58IxDXTDs8Mqz3Kx9Ds3d4Lg4QE3F626HcEvJRv1epgRBjdixHNotLAREiI1Yf4TFDzvKIWEcssqz7AQMElLCUptitu20vToVIxz6EXnqY3lsOprxViEl51uGn8Bk+CmN9M8qA/DzMXUfsZli4cisU/+AmF89IDc0jYNKkbg/Pe4MtwZTzCWKJ088O8r56SISdJiaRIo50rgVNEDfFncgQVJ6H5LUhp9NS0SDQ88WaVFBtGdxZ8lZk4aJ2M9utXgYBRxQqxywOEZ7e3YIsttxxZ3vmUYAG80+oe5IEJ1tchLUGr8FfGU7eUKUxYbn5cb54YGkr/avS1DUpMFdUDgdOi0T17z449NGR/qBJ+ABOaN5JH6iMatY9kMAQfu65JP2siVOpkR2VZ1lz8PXgHRzEYbGcIK0JReehsOSx3+ATvCnxSxLABN5NcVoePkZ7wA6s9iEAscuGItYeohAzcoIaYemZxUEmBVsXu2tVK4g9zeOZ8n7bT8PXtPH6NGxi79OhCSzOuDwrM31gV3FaILxyAZagajzvFuPQDXU4EJhwx952iaZfMZX3ACAgIltSYvkeSR5/ZlE8B8oG0se/gXS14WdH+PU4mALJpOgqvf4R+tZHpS2i9/YjJQeLX27e/fU/wER3qy9FInM/hqH5HSmP5v4oh1EYbE//g4GtX/gkk8/lg4tJDmpKT+Bsbvl4tNpRDTYCVSfkIwdnRx8lgzm4TF737v44ncRh9JJGzuV1vAOtMoqHASJL1zRrfTJfzFFPn/BgTQQqupT+0Iq6lQAf4wPvQwMyi5RMQACAwP4UA8P8FA1zL/ERRYP5IEmB/+DE+TH76OSjD5qXRSKWum91emid43kJ8T4sufyuct2y4/7cbRaAv8cH/ugZCA0qJd38BgAnvAWjef5OQPn7B0qatmHyv/ASsFLAC8phyQs3aHNk/46VANp1etYsVQse7Xt9gJIiep9WjGc4Q2UKtR1lLqTtFg4LI4rWGr/GivExk/wGuE/f/GBm9L/cQNBeB9H+Si083zgP3D7JDFiCzjl/n1jYtqIXR/s+LM+ygrEZzcE/iXzRLQdjHrIh8u/h7XaQGKoMN1wywt27vA/3R/wce5lgmncEr/AYAJO/y+jPA7iWukYgMEQIUBgrqPGvh3b3uGl8gQnQRuTfwKD3jpxgSn6U/W6WD26HWRFRjqvOObPr58BzMBcgRJ/gJPxy0GP+PkZj+DHPT+E61v+K3db/HAUkZ/AGnVY6AB4FToR1uoWIdq2BfT6samj3+VkBBT/Hx23LrDaHEips8HdyiiBmkZCvo3PpBUH/lY5gjrZjq/FxuLjdNhWl54yz+HODWdX/DAXxT/Ca4x/FYYfjw1Ot3Xk/1P0o7We3SFjKS7PNn+GD1r48cm/6CXoT1WqJkJ4+NZ+kpxRRf8MFxFM/PultNZxWgHG/U++H0Ej09Ns7/Bn+/9IiBG3SC1u6dV58m5g0msg/AR9n/DNej/gwUZyov8kFVwoTYjd2UH+ZhRnL+F+Ib+JwB//LRVcL/AoFN/nIquFJOAlzGke/9dKA0f8Po8gtz/7SUBo/4D5qfB0JXtIQ6ARcEAcg1ze1W2AWsuKf8BLAYRa4OmVZaFAi01n1/woKT/4Cj4QL+CRSb/GShxyukIRX/4ES7uGy+4fkaOf4BAWFgwx3iSDq6LCA5viQC+QQX/wUqbOpWu3/wGACPEGDo75/IqEzDAo+8+DEyvv/Ci7I9/nYw2Mi9s/taN8US7ngy1c0Cicz22cyebKOVPWECgvMPML1zL/4sQKRfwmCc/wy3xJ5X1jVAAH+/C7bT/QC3b5/vAgUiAQJENFpK+/5YBn/PEhq0n/JgO+El4bFZTp/34gUi/gbOx9orbVPc7fkI90/k/wEOiEgCCbmIrz5KI/ozwqEKQ3TtAFoUJ/jhV4mVD+CMCer/14q8TP4GDP0+/72Vhm5NWNT1jBvt8gMX16HI1bEdACEr0CRfDqr/sgIKp//8Iggf8QBkz4QoaP4KK0geuWL/IAQOSI3UZo/4CZuIr3Lpy1T3Ywvu7Uq5cQGNy8zWyIS5C9SIzH3D7GyKe/myIbHnd/pqQS7m5xGgygBGda+SJO5rRQxfMHszRKQvpSJ+gWVbzoboOGBQ9IkJ388BCrdb48AzNCMJVu1Rw78IsGt1r5D3Vk/TFXursFWzQ6oa0nFKDkyy2Ihm82eMSXR6YLPnMnwAigsRXyB50PUDOzWQgpvl4APCyAO/JHaB8d/vb+2CtXKsAqIv9Lefh0W2U9xBeQ6QANIshcfQSX/kBScIQP8BHAPjdQVI5BZhQeTMVlK6Pe4YUuWHIIZ5FtUU4CZkcIvS7RwJtkpnSQ1PqpSgG9yBKNvyBoWXl66vCxFzbqt8h4IriNhFIA0uNkLe6D2mkpeOJL7UgrH4y394b3n0bkJOLHY/D0JQdh4fCJd9U872IFnS2kx5mUPxjm2PkxvCOzr8ywMkkZJKVnLLS6AE1sksJe8PilHV3JhF2AlebDhZIm4eATz5oeTQCEVaEM8ZOcmqRK0JQMXcWAbne7gC+F0M4H8Kz+ZxCAULTYpBgIiPf5OZowf8qJd7ACaTI5QHDVn7X+CkEJj/GyOLVf/uCNVf9QAwfzlUEX8C9Fn0mL7yFOZY0vcPz4AYaAlPeACXyAghFNWTQ9kZ/4CVuRfjwRjVKlqZQAZ8EtebhQxW+4VsnLscVf8HKOWX8xIHo02N5EAYCxEVH6eaCOtGKNiIsekm87Ga7KfwNe3Jv8D0nDEBf6+J6Hf8cKlRqEf/+GPJ/9JDHk8f/AvmN/AFFk8zU0esBokN3rNdULeLyu1LGxGR4TLu8zZXNSGGPaJ49/i5fJmy1v/hSn+L/pRBEwf8SKf4pgCAoABgwOAggQE/7kU/xYfQLP4cMO4gF0yc/hwAfGBpo/jwAfHvZW/h0Af9yX+PAB8t9bj+PAB8j4b/HxpCMCYAK7IypPLsoNg+ka5+XpsAmsPd35Y63T8B8GlFx3slQ5hmPlkZYZ/cFxPLmHDZKIJ3H3TQKqQttIMqPfgoLyyzXYfFWA26RaegV63At4VoE7xBDrqN7nb+zG4+lDHo7HsNJO2fIpqze0Q6rSulP580U/u8njZhmnqG7NMVy3y7de/w+BPj/NHT/iMfjclHZd1xadh9YsAEkwElwA2AoW3TPkmNMyFiBZmzNSfDJ527NtHIC9flC6TqA/r2XVCwoPZIkwBIgiLRzWkA7TDgy/nQFKKu4l1jzroDNK+00ylzNcpG4NyCH3YXtXjSmtpBjzGXjy66dQioYUcxTlXRnZL9XYk1vsz/44XflNcx/v8Pk/v8Pk/v8Pk/jKD5P4UKY/4D003L+IVXmG56PJ/wEOY1hf8C6af4BvtG+z+Q0YW8t/VSZWwg/U7dwEPfj2ezYkbbEIZH3/RBf4Kky+YR8/4FdIQ2EByV58f4DABPqDFEwq/xYmXzCOJ/yCTL5v8fD6QYj/HSDUeAiZJExv/DtQF/gIthU/lwqltP/AQj6n/vpBr9DMD/rwuqNQSNdqky5W2cSIAEtl7psY0HUM/L/HwLH/HIC3/MAH+FUnGzLSH/I4G//iwIOruPyKwgPKnWlzojdfgmnFDBE85X1h1OUYDP89GXHATSGviAn+ej+Ir+HwQO2Y/4dfmbg7/hq/LAdGpSiD/LCayF/kiFzTgHg/z4eZhMH+fFzwQE/isQH/x8tRs/w+8S/5kURjMQf/4Bv3P4CAAf8IKzk/8XrcIp/A0WaBfwI23AP+CBk3FA/9LGkEH+fAx4bg/z4W6LMH+ejhIL+INDLk9Om4v8dEt5S4AKP8fMSwF/HwA+f/wmNf/4mX4pP8HL8SH+hQEPS/iAE/cH6p/h0rlLf8BJhPn9Jgn9cZp7hT/iJ+YFvzzUb3gGSS+fAeP+pARRL+QQn8t/gAiqzf6qCf/4fEUL+bQn8Dv8kUMuDa/n8E//hMEU/ocJ//hwO/El7wPNgJg3+fwn/+ExNT+FQT8y/1Eo7bf/+F8Zv//C+M3/7BfBm7yzreddLwIIdIlM15OJWU6HuSgvWwetI4Phh6gG4MF1FI31FH8xHnqASY8sAN/ItU8Vnt1srNUyLf8BE2f/BS4lM4A5/kZQA3/AelbalVAo6DhH8DxVpRDjIpRwIUYGx7tiuSKCpQAzfTY5SOjbWHf1YbuvO+Rq665dt7jeSAH/BBNLKg/+MVzRTZsDx8kwoUyiPrHQA8tiU9MJ4mfgDD8kt7KJeXQoGrbP4DBJP9lK5ov4Df7eVzRfG1/+ArmIs5cWgaLkUNbYShyc/HeK1LK4ApE0JQalLGezj4Kcuo2wpV5FvVr2URfghgJHUUpP5wrVF25WxrhIVfv1CqFNz2H8fh40SHWJvjppZkbKD4UbA/TstnJF4L3GxW1ejatGif4CEWaJwFbDiXo68fkHvWNGKZyLi3OMGtUZYPyMUY2Dg++29N3oe/z8MdgYwFwFQ35cCMxwEnMKM4DK5HbOiAmkpFVQIo0CHnmyPaohVt8QwOLR5FPqUWXCAPIcZK2C12ZW/KyfFkur6llRB/x+B/uozzI3bqD7kF2TCnQQmE1w6jZ784zg9538ASE+0BtD3geWW49BJ/gITKS3ZWxjjRj2VnXP3l11R1Er1j/wErvuOw0LmitOsXH/noZKlBvPxgYia5/nF1y07U0ZeSOhIptBe7nggTNugOpAIiEFBMM8/88D/AwlJ/sRXNF/Ab/byuaKHNX/QkdIg7CNvNsDQiFg3CqpRsvBUW8EHa7A9jx5LW0ziNZ4A52HfQ4PiM5Ntjd8BcNoWuHFPviTozB8NnLf1Kwd35Noo/4/DZLu+M2xDrSb+Ve6VuEp2tTRqfBRyFMwgSsdiBHbKgAbiV8YQa9LtpmcsSBoNPD9PmZzBiIrYIEzKm9/eo29yNHaf8kC0u2SC/PpFtf+YZXvdt9Q91z/gIkTRkMhC/gfi4BT2cbn+AwATB/1ocChz7pvAGA9VAHj8bg6j64Sh9fGE1mxD586Vb/gphzOC4QCkvD2/gMAE/wEEviK+DKf4mYGgoYNLbdNhAAoMjUMKGnVUj0nsPZn6arWIcqUHYMEO20/wUFui1ltY38Ccj34FxiJ0WKNLn94Ke6pEfwAxXOsrWjCj35nlDCncukwjEOMIPIF/CgIh/AvGwCo8wnfwGACK38DPGxuTKG7Y4P/jwZ/xyP7/lVv5MlVv4DdIQ8/mHIC8sJIA+QeDj6tM/xc3VSkg+voSXNBtURlUSDtk3/FQJ5Phhz/AtQED6TJKfwnTtwFyBUn+hDZlNMAfD/FQJ9T/lINmU+vIAPD4APi4APpEVB/DvX8nDdRlUuSbPMb5TbMOgziNhoED+ryIUdSMKxaLd9H9JmKH+ON6SV0yFiiKjLLAFKklTmXvRdY6cLEHgcAAH/EwJz9kAIJwNPb+i+QDMtH/DLU6nwAXACZ6c31B6h48mRiwFsUjO77f4oNmdAELAMA+f8ZGzOhL/gg2Z0Kf8OGzOiMBgKV/go2Z0TAMBT/zBRszoo/4wNmdG3/IxszpkAzsdZi4sZn+CjZf4DAaj/CIbM63P+CDZi6o/xgbM68f+SDZnUCAP4FARv8OGzOoAj4D/AsOQAnoCjAIWAACAAqYC2gJSCgAAAwIDD/4KNmTVaAV3/BRszrYANz/go2Z12gHj/wUjwl+4EAf4KQ0JyBAi3/BRszqWgRt/jxaul/BMAk3iskTgBIqwhlLh1dM0y5sMpvA/XNoGfti+yAf869WxFnwpG1gU/X9zYWSB6oBqJcVJIP/BwNYd/BIm//Lwvf/AkGqLLQYbpYUGTpfLtBXomVueRbmyw4RRgfFlotuy2sIuRT/gJEGt/hMCq/wUA1h38RCxIG8LYEzef8C/MX4NPagT9/L/gJQCOPMFvBz87f8BBOR1yQLTtvhdIBiQ+P5i+4C7vJ/wpXF/4KUUYD+B4ayj/xkVCC7W9C1+5YQGB/jhZgadP7/F9f9JFLm02/wKPljxEuVvpXc79J6Hes3pc7Keb7EYXSZPkdwHAzjrcE5QzNhi4z9QTm6GxeRoUakA7UV6HftBXf4OCuagVD0sS/gMAEGv5Zt5mSyUNj/Hi0YxCf3sQFFoklRaS1K42JNBLsYASNJJi58s5fx5x7e+GA/wKTwESSnmlGAzDrxOaWsQQaQDG9tlcioIWu0ArvR+Ln4SQiuP8KBCn8ByyO5/xCIrQ3O2Bd6PS/+PE+1Q4//YKFJcRnjBDINsB4JUvRzi5/gIZ3SVHzQEP89B06bYtC/8CcJf4UdQuwvvfgwWNE7Al+iaRK10nfTA9tHx11h8SJdHpebBRimnlwVfYPMmQF1UBm0K03MM6ThGfwiH0/64Fs0vb/HRh7Wqf3+U+n+kj8NJS/x0qO/W/97BvxUFpoeN8n0IxZk0txuTPuB3/FApX8efJS+toX8C1QBdTqgJ5Q8PsNuhHSpqoprLzIH9ep8wF+fEDsxf6rg6Zdn/DIb85/zGO0GcjeLQB/AofvmLcB16hfbgymx7wuOkzYbMXQk6blHcmpfBcpVx8WTOzW3k8v9L5OZeUJh5X7ySE0D+zJ4MP/4QKmO7P+aPHFgAH2Y+D5NtkEOAOUwF05G9ZI+fmSao2PsNEq6LVaTgUMt//YhBeMAAP9/+Ov9EeOsM1Z/8FB1gecVT9/kBWNzqIHNjyXv1y7CJy0Vppd+qzu+AMiNUp2K5/IHpP0biPCX0TgvNzGiZFns3eeo2RrVE96AnUtB9yol4lA+GkGz42VzxOPtbNZaTNcPlF0dPBgb6rYMonrH2Ov8BJESWPfR9FJ8KZMrjzPajQu7y/Pa9RYB+x6fQbZBQKujoJwY4QOW1i938WXf/gJO9ltcJR8QiTxv8BKpdHviql5cq7S68irtY5ADnRbYzLK9uNCvZnZBjDXa4omEHkvuBWJO4nyAkZrgF07tSmf5AOrhXv0ovehEGEEzSmIPLk/eKaViYgNuUMAIdc/wEHh1qZ1p6KwBezwYVPMJqUOyCzEVeGauEsVfWVbf4CRbZmmbrVIXkrhwsUz4lYDqJk4VobpDr5mB7k7T4NJBu/AUHooxUUsfRnujzz/ARz9/KLgcTx3evdMpdDuWXkcYMwLYQF8DRh8vC+MlTeeGQuQGYgzpmYJCslLU4sAFD9ewvPn4fHr7veEIYrB3mYkDCKgvr/Z3Ablj6M6hlZUhd0W3RYM4Q/VmFS5hIqBrjgfwoRJ/4KBjF/OCRRP+GDr//ARIUHHX/dUwH8Cu6BrfntLbjhBBMHoE86wX5UbsShGqh5YsALDHp56D2h1ONH/jROYGH/8NFeQj+cZ1fZjn23vxzCGzFoKc6TSTvesMDh7WBV+BAYH8JgA5/gxCEWEv8KFeQoDUFH/kFAaD+EwPL/fA5zcloZr/o+TmBlzPr/hpoklB7GOJajxE3KZQ5jDpvmJMAhhXBWggohKrDm5xb5sVu0mffdWetHcAMIDepH52Ya68Gt6aJRBLU0PM4XRxSKCHrujKsETG7zfxuzu3Mmf/c2ZGWG0fAjvskCu7to7GEZzSpZuJWqlngVYjU7Uksa/Gstewetty1QBkPXruP6sS1603q2aHgiZQ8LRytnsX+VW3+IlpJWBiaTiMQ6/M0PYHg50xjgYiJhOjU5COc591dxvhgtxgocG4xO9S3yU4ck7KTurRCEwcCDXJ1y4sqFgDehT5yOjCoctKJ5wfBd1EE3VsEps0Vh10pClyW21NYK9D2xVX49LBl4hFbN3nOsfjPv4AatkmCHT+6NvnOOxTP9cE7k8fsERmPuvKZFVvisJk+UpGwPi9EUTw6PC+7ZIwFPk7VSMTN8RIKadloI+yOE+Db10Erk6Cd3moLLeqI5JSjRj9hIPkASbiBO5U8eIcDp8t5+GEOwt9+VdmazW+krZ+ZdCFki5yTLL3PqkqAMiDlaEG4PRqz8X48C9ekAzHbGj7BIwA1eVak/4BYb1I1YkV6KKmgMAc3tdxPh3r/AQtE47WQeCh8rE/LzJbeMdFlc0y0CqOL/ADbnvV/hYisXx9n6q28GVXxg/4CWgmY0g8LQJctJnm7puXizhqwa/2S06LjJOjYupt2YFhBBct1Cc45s3VwSnkIZIkf1qshojAVbz+kBPVc4z7sTl9PASKFSkJftphx0k+5/0AhCLfwoIj/wJ9sQn/sxOYGbxX/HS1Avef3+zB/6SP2WDX+BdqU55Fp+IzTd3w5THXmF44KZAkG05nStKebJiELmnYQox/jb61WjEJL9HFIVQDsy9vqZym8NsA/mowkNJf/AQhAT/Ayz0eoehi3Jg+Fg8X+bHmm4CSUO8TY2/e1QfmK66dvJR6ouOz8KAVXf/Eo91DZ1VMCdeY/wNvdoY+fiyUBSMfHkbmpEIcMFQrJ2aUG0YFt9lyYa1tCVAeWHyWuvZrg0VjSqirc28G6HWzbVDon+Dk+TH+ZULmaOP4F4ZkboB36TdFZ5gQyePVpSUH9mc386ReJQBij/SSh+6UlHof/HBUL2P/3sOahhPUXr2xt1PnlwXjVhv66Kleen/Hh7qAjKlt/wLz4yVN9fQWuHBc1bUZjiJvJn0+Lh6NtUlZVoXdDGqcMdATsz/DIR03/zIGoV+tFfwMe6p1baGpB8Jj7VhFrHQK0pGvKb5do1CYoS43nCQI4bfa/OP8KBcv8DPquHjkP8BgAIU/4yRfWbKc3+YUNf8eBqTkL/f76r/KDK+qbvc6ZySyno0vxNQr9kAv0IlpX5/gJNGcm46Q4LKBxano3+j/HBHFZh0ID9bI6/wGACP0EtHkpSZHOfQOxjmpOEjqFWiVoDgkEij/Dlndx25Y53tAF0FVzLrDogm+R5bzimAJNVaxBf+Ah5GS5Rpfc9rh0rrIz08Dvg4Yk1kkHr/xIJCM98OcRZOzLkD+CBSXOwSv+AwAQz/lxXfXlj+BfYxa4ACvUosJcxLivq0yzyDL3P+Ae/SzzPElsJKnROC1txE/L/4SQKpv/goCq/1kjluA5yRAH8Crmq8K4gzge+Dl6wEF/hACAgrRqD4m/zu+GC03Ih2K1rQgkYhPxuK9Q9h+uKCMw9cjOs3r93JwGCfIG4f4ILnKv4IBvYMB0CKU/hMAqgLhAuT/YRxx0BQH/FjZSD+UvPT/MBGSx/mBX48RcIv/idDbf8yK/TH8fgb/+IDNk/+Ewpb+PPnhFwi+/G/vYOCyUaIqMK8UZTRA1hyZv70qVybi9WJVmq2s/x0fQx4HwvfAuf4uIj+AMBoKsDGgzPmiKMUNvy21ChCrQyKluhTmnPDTqxTHVt8G7kKBedeOgWNsp1rp8PD34i6zok/dE4Mok0RJ3qLP0YWwx2L+FBN3+BpnP+CBDH/GSjhu8rEZ4D/zjxEIGxP7/mc/9MIFqP8BzObDK2X7o7J7UgpKJefI4fHUgPHxiqxCCKRwxY9K9nTKJHp/jBVc2yv4DMRBmC+XI/+AgpA/wJPOMsX0EKLtufzD95Zx8N048SHw9ixoA+rzoaQ2hFbVZf4UgDEZA775KGLuDhTw/W3N+aAVqD3/wQGgWucgX8BgAgV/LLR6xTO/ffwM0etOpU7QLzomMzhvQQ/ufQcS/WO1wTy5QPQZxtyUWRVarln+OjFeevhAajHTW/gMAE34MEisv/qBivP/iZGbQ/3sYojgADCsB0FTZ5P48H05N7+H2cLcAqCRSuTXC9F2LO8MvRGzacipTpU74ldYskBEn7HJVr+DBvjBnk0oj2RarARQkRj6eIXt7THSv9fIzln8TtgX+JkWIT+egZ9+mjZFV3qL6uj+HsFT/IAf5TuaS62w82IHQt7bAKhIR/gILsDptQEbAACtjdfE0eTu9F8x7+DLKlMLH2BA1daRPcaWx0d6Drk1Ei3/kEGT/8FIscP+LlPPz+HMD7+FBLXw/gW84/ggS0/xkj7GThM9eOf/wMkSNuVbtY1n0JQpEL6tne36eE7ELfYaBL3LY8mrqBenRPcaH+Jix3j/Bgn2WAnf8BkEZZ/AZBQawYPS/+dAzLoB/x4aTujEF/j4toz1lznYZBlkhRb9A9zUkZPnBJQCIR5SmsBReJ7oPVMZsiS5H/ASOM+GgwuwcHWDIkDSgsRMb9coqm3tqDJgTcWypyCfw/d2Dn+CFu9X8cdwUsQBNzwKdxWx0pnG0gUM+1EABn3yff8BK5x1kv0JUHP8AFx0Z/Dj/Faf4QCDtRP+X3+ICPoIl0SL5dKsON4YoCAI4DWamv8BFS/2pzzI5WuC/rEK8jLf+yCxh7+/3+Lz4gNNtyK/Q/wV9fg4IVf5AW5n3k7eS2O+N1L1kSwvpML2YdArxprGpko6NPoNNN7y/LpLw90ve9SW/MedQD4f8ATDVj8OMcGewi9WVzRX4XoZgi4alQcEGAqJ1f5cmWAB+FNPQosKd7uQRaKg77s90KJD9wIXFfNSUjhbqP8NzSLhGHDru9kj2mOwcqIl6/hfL8Q1FL/mKbuh5US5NvwPMV7Wa25H8PIcjVjzM0/lfCWvqcKnGEzJBIZmmM55m6LCsHWdOWuKM9FjSrsjYztpr0KSb9spKl/kAU78xwP8FEIC3UpTwtpEZ6BxE2rOSDVjDhHf9tYjTmLFxUtHKr/AQFkkHWMCKIvF9azcu2pdei+3XcaEdSOjlv2LTQ33mc9h0gleQgHWBbA78kpz+szFf1+KCG9XkwcMmi/kx8hjf8AB80z/cObVzUiYnEH5p03VkgvgK2TnC/gHDowiKjbP0PnNU/eB2Bzu9mfBmB5YufXvVfeqlFgN5Ncc4aP0AW8T4E1csqSwCCbagA5CSWSIG79b86wJeeH74kKaXZcTILCL4GaFO90mrIy4uenmjow1zeMcnbMgfwiVc/y2lLRBg/48XLI37/qS5ZGT/76XLI0kZ/14uWRk/+gFKSj/HCKghUf9/7K38kuaRt/DqKZ/+4ad1u0aSdoNtswTvuFpGIB2aGOWQQBd/jyKrDoP+Bs6B0M+olNqRPZ0E97fBnlEknhxuzPf0xAzIxA6momECkvBpmAWqGdoqZftBT+tOctCm+ksrsmY/gXoiBf4i6IikKFBpFEf4f6IuvASvuJpg8GuH/ATagSRmFk1SVDe3MhuUP+DAqSmp/24Eye/jxJz0DWZ8kFDfROMOu7fgz4sPO4g1QusgD1lfKjZolyPN/jiVK5LD+CkIsCYqX/NyMPetOHSNbf8PHw/8P9IjAvivnBNUtp+1qflSdUea1g2uemc8gJQOpnYRsKg0qJpz/BhIS30BLzdnX7Q67XJ1QT/e+vZHY3f+hkYe93/wQjD4f4jEYe/+GR8dv/mNKBYEXT2v4G3FXyoxwPNCaI61ZAkfEHhgylcCIk5smqDvndslCbkQI3rQ3+FCHL+BvJ/+IDnYBGmRLw4maf4Ggm4AqigNLVTgTpFDS1d0yBGpIndNlLR46JisaJEFiIzZez7/EwQJJ8OMD0f4KDvepNp/xUvb2DIFr3Zwf4kTKb/4TATSmjLFV3/gDcj6TNTVj4iTF9eMbCyt5qRtYAlehazn3VpG7B/gnjZhGCxaCol9KIlDUag0UleSYRKoAqF6F0foLrUtbxpiVlVWHd6FqyBE5+Js4Max988ZZ+qoLDf4Iug/5kJi2hU/gX5jigLWx+LLlxFU9h84c3k0XAX6cT99elXZ92FQymawFu5EP4ZAqnT+YwhuPR2GtADA4qgBRAsWD0oS6uxFu/FHDDIAO8P47YIEnP/QiqKGAjP8PIJVJ/D6CViCVurnepfyMLoZtLhqNsxYOVPobOOV0mEDNhHEf9UfYUryEuPPQw8o4gVO61Y4YdRhScTAYI9crv3odYFgW7q5/lBWWK/I2yjqAyKnZr5biAhMGdbVUcY1FOPZcRhGzQnaBAtp7pWH+FAljw/gXsx84hR6G6ij1Ruh/4e6G3X+9uhuxTSi26aM5gdMFyOdasi4x6Q3Hv89G17FXIkX+OkW6Re/yoGWFiNZFew6OTnkbDAw2q42xNlwQoP+JqAw1PnRspT1BQstf76RbpL3/HxQ5FDZTR2QFdiK4qrLOsF3tsihuRGIYt89OVTRdgkosMw4+1RMfmoVhNDzHL2vimbutQLKLTZ/6a2hioATTIIezOlFEf812gzCwLZdIeR/yxxMOVshlQ3wgth6Yf56HwBcyXf4FvO5HP6vEhPYruKP2sGYedflPNvtb4Ky2R56j5A0F3tj7VIr/HS2u29wgI4Yop/AYAINwYDwEH/EEHaDFKt/pUITMBTg01CBh/ETO8h/tTP+KwB//bCDa9/j4t+gB/3Z+yHHP89Fv4CrRFX8OrX6h4/7AQdoJL2fkAxN4YTHBxPFtm84hlSyTqmf8FL7kwDCAUE3c38BgAndBMSl/IqTTqQHCbbmbh/n+ESTn+EwFa4oz9boH+kKgaRdO0nQu1zNFzV+btntMq90Cex0ojmFb+HkhZ//qlIWKP5+SFgj/l5IWKP5ASFv4UOtv8GBQY9VCASYI7H8BgAnDBg7vF/+9goMfj//wFBjglz/jQ9lUeAPprCR5//8FBj/w+CM/8C98UqQ0GuSOflT3M6xNtTpOIBV+LjxdUn1Oxezt9zXb0WsaIDXfnkvsSRyNH7a2B+6KuPDmpjPP8InBP+sgHHdxddRAMCUMIAeuvUJrUkgKqvF9bxv2KPa+AL/BTJUhRf8CYIoz/AsgLSCEp/iYQ1rhgrp97zAAP4GTHplkbAWf7I2ycEavgSxnWQxpMMrBP3DAp9qn+u7OCrzVEz+FADy8P4OOUP4YMo405L5FVKoDA7zupvpHHH82pTZz8SMDUTU+MdP/BQ+BymfwRh7EH+Mi6Asz1xfOdw3+Bk9CdD2/4F1GjqiAmdK5yrbInSHUcUXCy2Pc8Ut/A3OCb5uFP8ZL+4uX/AgIx/1gIUs/1EkN4giZ/kg5Jf1KEYOYYOANyCt04R3UY56NDP/Co/gbmwI4QCyM+M/gMAElYMCfPzySj9v/CYHl/voggfdVr+X+bBn/gBAqJShvWoQrnimMGZm+SiwSEHsK88cxu9ps9CehpEwufRrLSrSjyDXcFion8soNtM8JR/KfAQza7/ATOcsp3c3M9wMt7L2WxszLZCPUlDWACrBDJnC9Ldk3/EI42YZlD21ly0lUNVdq6C0ULaGv0XGhiBAFw4bqmTEpW1EKBGmpp9TZSMF2d53Bix0Grq1w9xDvgxrNjxo9tJkhCaFRKY6AwdlLuOojkFeDfTVrHy4JY1S4B6fO6/Bd/j7XR1MfI9wctW/P632b/UGgERf377r2oA1/oKsGbQr3EIdTiDYAmE6MeJXhKH8iSOk9F3l5i3fqFU3gPNugfdGSqH4zE/cB3T3rz/gIXcRMQwBkxItjtzGL+ouE71Yv4AtcyViv+OtyaojDs1tTt24xFtJ7MKWFPv37ZuPlIytEQCFOevSEmrfVRSAOo14/dA4p7KgiCo7CtRtEEmsQtM/YapToFZ2SjAy8Il4HJ/Pf+AC3BXwRn/DoxLoMaEoxQgH5C/wEo3LjfmKssVBiSzCZHg+EBXCRlGmPL0czHv3uAy1HUppmuYfHJWP/dbNiPuYnRqfd1VCIxsufgvwcvmtCBKrfqstYR+3WACQR+tcucgkW/KtSh39Qc2H8Cb4NCvWitLyd5lwY3QBfZctxfF13F4QnLtNEXrezQRe2sftUZWyj1lT3FE8dDhmcampCPuAmj8b/g4Hk2/mIdYTPKkXP4G0cKSGGhC5PqDmeUigJ5CM6eHDyRkSzsiBYTbRYLp1qLRnHb+FAKr/BSyXWDK62Cf8BgAg3/jRPXmeINSQqj/HiMv1Mf3to4UjAMBYAMHoRADndClt3MXXPx7NEfx5t4ooE//wLsjVC6F3KPpmkqiIUUseZYcQ81AV8itMw9/Jc4Pc51x82iMv+MFXEwLhAG+BsI/5kZkp3ZGNOcxOf48Rb2NH/PSriYGQgN1d35/02GWKgRjYfv/KBOy3+ViZbaRUYG6YSmto5N1iNXaAgAxqQQrcitd12O/FbzczGX+OCU7Gm/2gtW0wAHsL0F7IjEZgvfzN4q7OKWzOKKij+0X7nxhUgGMKvfktv//4Efq8/tCgrjsJt+H/BQbh1JRr/kAXcEGyxdAkqzN1OwjxveQ+abjDwMOQv6ne9f2pKMzz/ARBo02xGW6iQnS4SWNu/ONLjUZ3rrFswZsWrV1QHs2xWdt3tF7wEgKDDmmwnpeo9RAb8Ho9w2tF14xfIuZ849rvYRMqkGkvGOHbnvJ6Q6xkVzknngg50ZmFhXhnZWfY63tN8SiJCUNVt/tggdmq9MENp96j3BBxirC5qpxgDjYOcZLuCQnaFGiIiw7mQoIo3/gIa7IQuTuxpR//wELu21uwGTUIWB/AQt2s2f5AC1CjmEdWqJwNDQJyjBG3FcZiNzeZRZl4QCI6+To/BsWKNGOUJRDYIderlJGLKIiB5okFXhFI0F2/cIlv4TRyi0iMzPO+8vMlcPMQsiCQgq53bypG3C98thZKUrhYiAY7agzHve1IkNcbghx/Dz1HcCJHorXzSiDgKgjPiSryyqSktChSaegOqJw4YqMvKfn3v341YmHGnsnixgZoYNWbNnTUPGfYAlk45E5zr0MY3i6IGyIpwRIBYMjKJhi3PRScZ0DzsCc1x1jbW4RVq53TsqR4BZ8irHYCP8HCGRYMxtw9/AGACE3+vgvHz/HCmmV/gJasd//UG4ol0wGmZR6Nsxb4ViznmdKiJOGeIvfxsbWgOh7rQqbNlh/A33nYXKSUFejODIMalNeh5Pnvz/cegzAMFCB2In9YpB09712fwoEKfOChDfX+CQhP+GHjcRoAT0Ujn/Fyktl/gox6cA1qphAInBYGhAKdnKaDBJYN/xclkg4n+7EskH/ECWRz/i5jrVzVIARdpXbieHUvEURyV5yUbQUhbS2gmyIVvy9dgC4NZMFiLi0AIA1cX43xT5xAalIkbho7w7GhKCLg6MDM5MtMiI44rxGq9UKYSbNzRb2bTMHkExVKITWrxH55IpJF9wBUxws+ocQSKOG562Fr0THg4dQ0IBW1Zhe9Pfkekx1PyFniRPN5VZ1wAOA6zk/Y1OQK4C9xESTZ+7UoW5TNRmzIGSKogMQ4LKguzRvNaFFJ9I45Qz1bu7uZPat8VLc4iuAZ9U9dklZbWkiOiFAvDubBPePzeZiUlZFgySfBVV7s2HMa29LjHMx0c4VlD/OB99x2gYAQBr//B7YN/iYWpm/wPJdUFE5rPt8zAWjIrHpPWFTx34dBy9/hgefLHga7PjT+s3MJyLvMWvXO4wn7c+s/isAf/zEX3F8/+ZhjMb+BfN/tLF0F4cPtexqewgie49UwsBis0upLH4CSF4yuwZdQ8sQ+S3bhvxs7Tusx7iwmNBXgIU4KAI/wYo82sSpCR4SjABCgKDuuwB/gg7o8/yYJGG90IspHNANol0swm+VQPKk3fMZf/CzGYifkHqyHD/gIkoH/wcDXFW5xIfTtYti4vG284BXFA9WEHorhTvWyx15jL279wo0adBIm61z266CCQ3GKsKpSATrFT22y38DrVgM2X19/AYAJjf6+RbBf4E5OsC+QP/BDYhJCAmJRif8BgAhBBhVV5yn+JhQJiIBgO7NLiXgrZuQOE8sQm/xkYweHo4rAdsRYH+ViZGH+K/xw/gcCG/xwcUGNUAU/w7pzhWJ/rwcleIP+LCd8n8ZM8L/wATvkgP4JGdP/gCd8n+QCiYf+EvyD/HQ3tuswgEykl1/wGACeUGEc1l/nZO4C//4FP/f7ATuAnkFIqhyvWThl0y+Rq2iHTSaU0zCcSvEIubCNLEvLzI3aL+HgV/u/nNBX7rR4qWthmqo3WVETRgGFBHMkG5oEsXhKSus2IPdkL/ARAKKzBH+/gqV4LsWXFwPmYkre1oJYpj/i0Qa5ZFwY85T5a5shzoK3HOQZ5G70lNiwVWHjk2TKZurGF2iuXJZ3lE6CrOnuj5OVgwCqb5kYt11FvPZBLS+6LGqpDGu8KtqMX2JzAMca7q+iAthU/DyTfOrh5NsShQJOzCtCnLCgd6YpcOFZY9kUxYJooxsoPu8SnJuWHniwK9iZuoUyaFO+tdGp7Is0tZJE8isJ2idfxtY87rBVapAl9Z1j4rLk/UYgRjRcsqhowyqgMQC5Hk7IivxwuS7YK9bBsVokyZBELFxq+MtLIOedWK5JcRhkf3hHDfGPRNxhmXDpjLJVJHz0Ab4b61yNaNs15ayUcUeKUwxvANQfHu0n/ARYiubsb+/liDaG9bgUf2BmKp/AprgOJlh4hj/4CUdnede/wEwWokxYmprzg7iCG9nqgo/V6Tx/gJQD3y+kKnn7J1qJKFj2RDnZx9bffIrZ2G4CW61sQ/IrAZ0o84UrldATj6mQryShEJoHJzCJXbcKqrOXUaeA83v7/bF25MqIdG2e+hijUy2x3D/MvKVc/mA1XwuphxGaEsXWDo3K4EP9zkwi2BeoUNksO4WRPBKREjPuv8A2SoRvfCPMgw5kckvocAb0XBLBVokEYtp+YiD061S3HcPJaoPunUQ8pl0lLaLoOYxou3S7tgFwLEJVb7JHdlIUVpGq2fcqqBZM6oarTvBmRE2U7IJgexWYTnnnxispHGN2rObi9Ges8Kw+AhiO2aMoyRyyN91dzlxJWMjUuxu6GEP1n70KTCI5wOi0MqPDBdh85aAORxBLv3Px3WzByc+0E/UDlXOuxkBqbfAO/wEVbLjlmMtJeStLuplhlLBBjSSNXOntC3pVZ3mwSMGCpCcQu+x4p1It5yYRQOOodEGRMg3sJ1bQM3mqZsAmNT+cglg1xHs2cqnBF2zBPo0dvcpUhJ8Hdz16tLXzSGuZebcirHclJoO1mz/SPGfCfw11LDLPk4lJ0t9YZgrUWFHP0jkMgQbCKTgnSHoZaeHDDdFVFpf2ipYNDX2NPPEJmY8NRFAYJL+L1ty4MSWXKCtsO13Kf4COgJkYBPd4KXXh0GX1aIodGZczKq5M69ErANS6Ze0+U6GJ4r5eY+Hc3eeRRMEMOOQBTUo4f4A1rBA9343ukEReYPbxThWdx1SY9RRwbqgQkQC78oGW9Zz93lTy6RUBsKKLuGhctQCRTdXNc9oTmHSecb1DwQqdnOyOJI/wE36ODMyg6GpQM2mH6+Nw6yJwTkAWWzs9G/NjEZb9qozjHFZny6R5cpebdtm+fU9tNKy1++EAl53QcvgzepDiJl6D/KBfHhzpGuFZnfzIVo6RwHxj8sog3qxoFlZpnrDaxkL0JNtFwiCOPbHdFtGgVjW4X15V/4CWtmlXIRVYbluxbozI6v0wKO1ryGwq5Hl9aZTwnjadeaV3UdMmnzklWtmnLWfYWIkxgJaPeGJYB9NsEF/GJzaUYDEv+ee8ZMhcWjjde6tzfv5aogiHczv/QukAeGaU8vxhSLjT3N/UUs7X1OhERfJ5ZuzHXn6oXUscQOh2fUdr1pXnp8UQycoMwRj0JOgWt50BTAM+xqxG3ymyIBH/AQJnyDEsxlkerjdkIQCW8CWrFbPb+5Gkg2AIfH1oauMug14mHTEr5wmwPd+YLi6o8Jp3VHqm7Y8aD7TAGfBbfOxjcMClefMa0+mGSYQ+ggujFD6DolQSyejCs/lWXDCyhUeRMMjMVgTNHFbC/LVDyNQOPlO1IH+kysmOm4ZIWObvRxQf8A3S/1ee/3zsICL8wknQL/P5Xn0l6EKPOjTx6wdb823j7UBG2bOcojq10s7xLfY+Czd/eSb23wXGtKssdQcgx+T9U46QoNh5luoG1GxT8v/6AwCVAN/QQfk09HP2v02wir0A1Nq7YZucJ6JXln8O8aNJ/e3GiRrhm6xFRyLnx/quM9brvDQ7lAl5/HjbDf4cv/Au7PdgRADgKH/4CKGRvpCInmqR54Ev1v+AMITIIj3MHLauCezQn+ysj7CfbK8v6VvvRhCBPRaVSlWpbyqC/8IGxDBt/tA2IYgN8/2MbEMAIB//42IY//8bEMf4sNiGHWrhP9/G5Bu6VILxDWdUFL1PM0mWuXLK+iClJ/B+ypsy7TPeGVQCU8O4ZdQhmQHIIVDq5S3URuqm2xWxdj9I0OyI44CbM4JUkkQxPcwhos9pYKfUJUgWm/hxniT/4hhJnES5cIR2lOsvdIdfbMzndCe+OA0HTBr7FFkK+0KTnqQfg4/VZtbyWJQ8qUEzIFq2tnqVL00tHGycEW8QCQ6OysLhcRM/79nd11Rp9JgYh4NfJjYwiID3soXZ4Nbk9cxgWi2DexeGVp/kBCAWgZJOsvCreR6rkbW7oxnoBiyZdjSjcwHAflLJcqfwtO5G594Uo0b8Dr6f8BIdqKP7NV6q24jQlSk9lawGquSGp1fwOebLgdAzSDdx656rkt3Ot3FU0CaQmp+oJxMPyPjYGf0JToEUnazRclvR1NalyMw/q6kqi7hjvftxoH1gOmNN1gk6fLzJ6iqulZyWMz/u5CCoJbX+AmedsQDHCBTIM/AcWR7IBmDpD9sChUCxPFSFKeR9DC8atFVHwme3YFCA94yh8e/8fR3A/hRO4/wUC0R/wSnb/4wWxEeewoHkjf4G8/MPutHMktkdHcrWkrS8we8kuINU9cOzS8HcYtyyyMOqUpQmErnraI/LK4oWt2JEX917AJYwdP4Hjd/5iFCP/BR+uf/C1zJ/ggfD6q/8jLWFf+DAsDv/Mg+H1/Cc7LfxWAP/xZZk/wMBTf5wC9bAMwuq1lQ8n/XRqwb92/zjwTkN1/9pBVQ/8B+W2S9+Qs0fgGBo7puC4LT0AZ+JoBi2Oa7TfECrDNqyPDrQ7/CgWF/A0sf/D4ZbDgQKRxOkO7rAYH8CekWfxLlMQsoiu8+i8pNCaJYvJChAPYI+cqTEizm+8RS6QQwq4J4RNhl1rY9Kz/jnRFTQLP66lH+UgEArIaa721D/HwPSdthmB/AnR5plw8izmV+2AX4hcyTMN4gut76Xn+DooGgINllyU4bCOqfwous/4MS/Bg5m2gj/gMAEFIMCeu//JxWPz/j4Vf4/hAa3e8EqwE1J7InzEBHrCuRBVtItQeN3y2Vbf5qEQHuDkiwA/x0rxSLUIBL4syf8BgAk1BgTOhf52V4pP4ALav88DXRkFSLn++leKTx/q9dMB/qbLBZBgp+6czhIYYt5U5+/lxEW58G0e3ndaJcb/fSvFJDf4+FEmVX+HgV9dDRhjNEpPXFYChVSYduEZA/ftkmzHfnlZsmHFAMnK84A/j9YUli+TQ40zxAhwuSDrp8FpcIw8oz7LjzJlXQC0OV4A5LKR4Y9Wzk6INrDJHjK9xhZmcgxzi6sCjvVAItnnjEhBFr/h+6ttez+APbkt09SXUqlCmsAarpyfLVFQ60ab7tIrnQy/wETtR06w3z+rhsCz+IgtcG/0ErxSfwaDftPb+VoAAgAN/qxXikBvPVivFIDf6sV4pP4zAVwH+IAi/+NQK8CA8pvKUaymHRgSsKhfq6VrjF5mv8egnDImXMSTkITzl7/hjGu/gS3t1hHyw2HtOq4R082lYQU8qXrvFeDo++ZXT24QY924Y0eoL+FBjT+Jx7IBIGmfP1UlgMD/AAAAAAAAAAAAAAAAAAA=","commitment":"0xb0a3bfb84c00471cde226a3498a4423b7af9ef1aba906bb55126905ceead873223e07bcafe4f79e18b0f5225a6d1a39f","kzgProofContract":"0x992d307d37be2bece98ab73219206050f80a6d9ab79d4f0ddcab7278a32ca17cc0f94bf466b2255ef4bf11e6de68f460","kzgProofSidecar":"0x8193809f74d276d1f0bfa6375804f1dc6576710f091bf7215edd0b54d59140dd2b9012e9cc7e77cef7906d1466584c90","expectedX":"0x248fd097e571df172a37aa7d51f6fad1e4c4d2f775964ed542ae6ac6db291c64","expectedY":"0x4d86470e1e764ea8a5c803baa86e771f1bb06fea165eb6bd6d44c6643db0137b","snarkHash":"0x123b7daa014e60a68ce296f688f118229c5ccfbc14f0c148f754debaac7e349c","conflationOrder":{"startingBlockNumber":30389505,"upperBoundaries":[30389593,30389690,30389806,30389939,30390016,30390023]},"parentStateRootHash":"0x7bf031741eb6f9c15bb8233d183edf8e345e26c971e25d9278bbdb5e48bcca7c","finalStateRootHash":"0x2ce14a6a2bf22a541fbc493f32580f3e44af6afa37c1d19e6512097e1d9b0ac0","parentDataHash":"0x01f8ee078933b7265e4500cbe02c1150ef6476fd481dbb3d3b2a71b01c533c47","expectedShnarf":"0x2ce0de20aa961b4a7a63ddc74bbaf82a91a38cb771dbebc687e2673d137c71c2","prevShnarf":"0x27bfba217ed5a1e16e628eeb6699a41ec6aa7933aefd13dc6b374b4d4ab45980","proverVersion":"7.1.0","verifyingKeyShaSum":"0x213d4f5d80ac94278aef1a2ae0f73a0e069d5756b2346147da1aae310fe8bf1a","decompressionProof":"0x008dafd2e9c0b5e83dc49972763baf8e826ee9591e0a283db09ea77b9f2947a63b5ebe34db4356fa05e7e1083360d156006324dfd547089ac03ada072097c748bf68ccdb1082686d1c0980aaeda51ccecc55af755df7d5b436762a4a446f2dc1011f85299f7aecdee27e7101f4ba805fe8c4022fa8941f22a858216fef4b39efada10c392a1b37255d6db7cdc15f9c9401a8238559a69a0e08bd681b1e0f91d8138e0b4c9cddec576420620e38f2115a0e32f68204c3c109444c550deb98519700c491a3b9e82c4cb715fc04783215929fcf74e2043c7fe546ff2d1da42d7d76d5f0d4f1eba8ed0e48752e97246def44016ec65be362ea9cc2709a07d61c1f7d84e4542878e830fb94326138e2c5ef9a5b57732fe8e53996d93283c2815dd0dc011ef32ba3186b9eba0727724ae41ae662752fb4ba479e11f34765b948a55ba1ef9816cef6aab18ba87c82b9720419d201790c94bf8364cdc4bf0cf2548347ea646953c9a23d878e1111eab5ada625e7a7d060104eeed6c11b0f190ba379e6400045484f5c00851a6f5c0ea4e933dda5049d8319a960963e072f74e485aef33380dce50a15521936ef850c4bed9c840500973202badc28436378d82b8303c73b5433710d6288244fb455d968029b86563b56378c674d9ffca16dc871b80b4749001d66250185e8bb53c48f45d22c7c82677621bf706841bc4d02b3ac6df63e1c743318dbce8d40c1ea46d65af4c95f9701771e3c7da86477585e9ffc6722bdda0dc779343c8d1fd5deb2f6f325e24eeca3e92d5127963135cd266c4f7ba9479b01a509f3476c1837fcc6ed5d0b1ddb1072c5838e91bed8b37decf8b69953a9d17054bf2fba88ccca62d6d75c12062caf009f57850be284d346a4d25489ff33b73c7f0b42eb5194eee5ed9f6e300f3007f1c823bb495e9f2b65d746a7961ee5b800b1d553715685e8738f9b5dd5216422e09f5d2c030c107157dae96bfdbd81614fca3cc866e6f4c4c24aaefd3620d1700098b492bc4f71b896a78f04ef1e1748748bdeefadee1a04c79cc305e8b54dcc1142a3367a28cf37d8a3e471778bd570000000070371f5c9baa3e6e5ccb0492abb6d07449db9f3bdcd9644ed7f471080b26b97ac0bbda9b0ce8fad07a167f7a615e6dd38c583a655d611d2bc8b73cf2dda86bb751169fe3ad519f1ab29d945b3838098f36f53758ae1cec35f7ebbc0757c3ad12801e36df4b840687b74219f72d330a8bf366623666713d6cf61471759c8753e54001945e567593d5fc48cedf610163332787706929420675764fc1d7434f1cd70075669e9cda8dd73360ddc449fd2bfe46c3573639b85d98d349f930299ff63f300c31bfd66e8ac5ac76735532a8e7a8968ecd05b6b8cc74d43ba2b6371bd8c6900733202453b2b48b012797783ef903f54fb8d3710621ef7835648cdd360477cd89e86b9410352025183fecfec56785e00deb04e87a9ae4969c3b5d406d731016a57b751c5ab0a59ef89c1e0af97f1e71fbec75863a2ff4b4df53e0fa6e5c29e0257f184a32d85b7c511c695fa7caf2746f8acab694f3b737079e2af31afa51e0000000100e428a8bd0fd0e802af0d1f759d5933438b569c8aad2d32078f5ba0d292cad5addfe6082114647738c831de1eff1eef00ca00ee74f471fd879be408b54f6ec9f200042f34efb0594e296f91034c951aea812e09eef8c405ea58b5794ba36251","debug":{"publicInput":"0xece75fe1ac02da36904a0f27425d51579f60b660f63a0d9f0ebc8efddf42afc"}} diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/env.txt b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/env.txt new file mode 100644 index 00000000000..debaa2d37c8 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/env.txt @@ -0,0 +1,14 @@ +host: AWS g7e.8xlarge +date: 2026-05-08 +gpu: NVIDIA RTX PRO 6000 Blackwell Server Edition (97887 MiB) +nvidia driver: 590.48.01 +cpu: 32 vCPU, Intel Xeon Platinum 8559C +memory: 249 GiB +kernel: 6.17.0-1013-aws +go: go1.26.0 linux/amd64 +git head: a75e7a74822da73e722aa7046fe31dc9f2ac6f6a +build command: make GO_BUILD_TAGS=debug,cuda bin/prover +config: reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml +data: /home/ubuntu/provertestdata2/prover-compression +runtime env: GOMEMLIMIT=180GiB GOGC=75 +GPU activation: automatic (gpu.HasDevice() detected) diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30388561-30389025.time.txt b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30388561-30389025.time.txt new file mode 100644 index 00000000000..7373c80ce2e --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30388561-30389025.time.txt @@ -0,0 +1,23 @@ + Command being timed: "bin/prover prove --config reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml --in /home/ubuntu/provertestdata2/prover-compression/requests/30388561-30389025-bcv0.0-ccv0.0-b686e4f81a269c88420e67e8235e63faee8aab7b735520cd112737fd283bff4f-getZkBlobCompressionProof.json --out reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30388561-30389025-response.json" + User time (seconds): 269.34 + System time (seconds): 102.47 + Percent of CPU this job got: 285% + Elapsed (wall clock) time (h:mm:ss or m:ss): 2:10.41 + Average shared text size (kbytes): 0 + Average unshared data size (kbytes): 0 + Average stack size (kbytes): 0 + Average total size (kbytes): 0 + Maximum resident set size (kbytes): 210398068 + Average resident set size (kbytes): 0 + Major (requiring I/O) page faults: 0 + Minor (reclaiming a frame) page faults: 55817808 + Voluntary context switches: 194020 + Involuntary context switches: 9978 + Swaps: 0 + File system inputs: 0 + File system outputs: 392 + Socket messages sent: 0 + Socket messages received: 0 + Signals delivered: 0 + Page size (bytes): 4096 + Exit status: 0 diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389026-30389504.time.txt b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389026-30389504.time.txt new file mode 100644 index 00000000000..918fb750644 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389026-30389504.time.txt @@ -0,0 +1,23 @@ + Command being timed: "bin/prover prove --config reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml --in /home/ubuntu/provertestdata2/prover-compression/requests/30389026-30389504-bcv0.0-ccv0.0-27bfba217ed5a1e16e628eeb6699a41ec6aa7933aefd13dc6b374b4d4ab45980-getZkBlobCompressionProof.json --out reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389026-30389504-response.json" + User time (seconds): 270.40 + System time (seconds): 101.98 + Percent of CPU this job got: 285% + Elapsed (wall clock) time (h:mm:ss or m:ss): 2:10.21 + Average shared text size (kbytes): 0 + Average unshared data size (kbytes): 0 + Average stack size (kbytes): 0 + Average total size (kbytes): 0 + Maximum resident set size (kbytes): 210422152 + Average resident set size (kbytes): 0 + Major (requiring I/O) page faults: 0 + Minor (reclaiming a frame) page faults: 55823826 + Voluntary context switches: 215885 + Involuntary context switches: 8222 + Swaps: 0 + File system inputs: 344 + File system outputs: 392 + Socket messages sent: 0 + Socket messages received: 0 + Signals delivered: 0 + Page size (bytes): 4096 + Exit status: 0 diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389505-30390023.time.txt b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389505-30390023.time.txt new file mode 100644 index 00000000000..97e237329f1 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/logs/30389505-30390023.time.txt @@ -0,0 +1,23 @@ + Command being timed: "bin/prover prove --config reference-benchmarks/config-mainnet-limitless-7.1.0-provertestdata2.toml --in /home/ubuntu/provertestdata2/prover-compression/requests/30389505-30390023-bcv0.0-ccv0.0-2ce0de20aa961b4a7a63ddc74bbaf82a91a38cb771dbebc687e2673d137c71c2-getZkBlobCompressionProof.json --out reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/compression/30389505-30390023-response.json" + User time (seconds): 269.75 + System time (seconds): 102.48 + Percent of CPU this job got: 286% + Elapsed (wall clock) time (h:mm:ss or m:ss): 2:09.96 + Average shared text size (kbytes): 0 + Average unshared data size (kbytes): 0 + Average stack size (kbytes): 0 + Average total size (kbytes): 0 + Maximum resident set size (kbytes): 210266244 + Average resident set size (kbytes): 0 + Major (requiring I/O) page faults: 0 + Minor (reclaiming a frame) page faults: 55784187 + Voluntary context switches: 202579 + Involuntary context switches: 9104 + Swaps: 0 + File system inputs: 344 + File system outputs: 392 + Socket messages sent: 0 + Socket messages received: 0 + Signals delivered: 0 + Page size (bytes): 4096 + Exit status: 0 diff --git a/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/selected_requests.txt b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/selected_requests.txt new file mode 100644 index 00000000000..3d82185b331 --- /dev/null +++ b/prover/reference-benchmarks/results/2026-05-08-g7e-8xlarge-gpu-compression-final/selected_requests.txt @@ -0,0 +1,3 @@ +30388561-30389025-bcv0.0-ccv0.0-b686e4f81a269c88420e67e8235e63faee8aab7b735520cd112737fd283bff4f-getZkBlobCompressionProof.json +30389026-30389504-bcv0.0-ccv0.0-27bfba217ed5a1e16e628eeb6699a41ec6aa7933aefd13dc6b374b4d4ab45980-getZkBlobCompressionProof.json +30389505-30390023-bcv0.0-ccv0.0-2ce0de20aa961b4a7a63ddc74bbaf82a91a38cb771dbebc687e2673d137c71c2-getZkBlobCompressionProof.json From 4f50bfff16270cf7d6cddfa5de4d02dcdbe82767 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 12 May 2026 14:33:11 +0000 Subject: [PATCH 2/4] chore(prover): gofmt fixes for gpu packages Co-Authored-By: Claude Opus 4.7 (1M context) --- prover/gpu/internal/generator/config/curve.go | 16 ++++----- prover/gpu/quotient/quotient_test.go | 5 +-- prover/gpu/symbolic/compile.go | 34 +++++++++---------- prover/gpu/symbolic/stub.go | 8 ++--- prover/gpu/symbolic/symbolic_test.go | 4 +-- prover/gpu/vortex/gpu_test.go | 2 +- prover/gpu/vortex/stub.go | 26 +++++++------- 7 files changed, 48 insertions(+), 47 deletions(-) diff --git a/prover/gpu/internal/generator/config/curve.go b/prover/gpu/internal/generator/config/curve.go index 7e7fed9f5a8..ecb7609049e 100644 --- a/prover/gpu/internal/generator/config/curve.go +++ b/prover/gpu/internal/generator/config/curve.go @@ -9,14 +9,14 @@ type Curve struct { ScalarBits int // scalar bit-width: 254, 253, 377 // gnark-crypto import paths - GnarkCryptoFr string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr" - GnarkCryptoFFT string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" - GnarkCryptoKZG string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/kzg" - GnarkCryptoIOP string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/iop" - GnarkCryptoHTF string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/hash_to_field" - GnarkCurve string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254" - GnarkCS string // e.g. "github.com/consensys/gnark/constraint/bn254" - GnarkPlonk string // e.g. "github.com/consensys/gnark/backend/plonk/bn254" + GnarkCryptoFr string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr" + GnarkCryptoFFT string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/fft" + GnarkCryptoKZG string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/kzg" + GnarkCryptoIOP string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/iop" + GnarkCryptoHTF string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254/fr/hash_to_field" + GnarkCurve string // e.g. "github.com/consensys/gnark-crypto/ecc/bn254" + GnarkCS string // e.g. "github.com/consensys/gnark/constraint/bn254" + GnarkPlonk string // e.g. "github.com/consensys/gnark/backend/plonk/bn254" // CurveIndex is the integer passed to curve-indexed C API calls (curve ID). CurveIndex int diff --git a/prover/gpu/quotient/quotient_test.go b/prover/gpu/quotient/quotient_test.go index b189d168212..9ff090e29e9 100644 --- a/prover/gpu/quotient/quotient_test.go +++ b/prover/gpu/quotient/quotient_test.go @@ -15,8 +15,9 @@ import ( // for the full IFFT → coset FFT sequence used in quotient computation. // // GPU convention: -// BitReverse → FFTInverse → Scale(1/n) → coefficients -// CopyFromDevice → CosetFFT(shift) → BitReverse → evaluations +// +// BitReverse → FFTInverse → Scale(1/n) → coefficients +// CopyFromDevice → CosetFFT(shift) → BitReverse → evaluations // // Note: GPU FFTInverse does NOT include 1/n normalization (unlike gnark-crypto). func TestGPUNTTCosetEval(t *testing.T) { diff --git a/prover/gpu/symbolic/compile.go b/prover/gpu/symbolic/compile.go index 291429c8c17..52e8bb5289b 100644 --- a/prover/gpu/symbolic/compile.go +++ b/prover/gpu/symbolic/compile.go @@ -4,26 +4,26 @@ // into bytecode for parallel GPU evaluation. One GPU thread per vector element, // zero warp divergence — every thread executes the identical instruction stream. // -// ┌─────────────────────────┐ ┌──────────────────────────┐ -// │ NodeOp[] (topo-sorted) │ │ kern_symbolic_eval │ -// │ │ │ │ -// │ liveness analysis │ H2D │ thread i: │ -// │ register allocation │ ──────▶ │ E4 slots[S] │ -// │ bytecode emission │ │ for pc in program: │ -// │ │ │ execute(i) │ -// │ → GPUProgram │ │ out[i] = slots[R] │ -// └─────────────────────────┘ └──────────────────────────┘ +// ┌─────────────────────────┐ ┌──────────────────────────┐ +// │ NodeOp[] (topo-sorted) │ │ kern_symbolic_eval │ +// │ │ │ │ +// │ liveness analysis │ H2D │ thread i: │ +// │ register allocation │ ──────▶ │ E4 slots[S] │ +// │ bytecode emission │ │ for pc in program: │ +// │ │ │ execute(i) │ +// │ → GPUProgram │ │ out[i] = slots[R] │ +// └─────────────────────────┘ └──────────────────────────┘ // // The NodeOp representation is decoupled from linea-monorepo's symbolic package. // A thin adapter in the monorepo converts ExpressionBoard.Nodes[] → []NodeOp. // // Bytecode format (uint32 words): // -// OP_CONST (0): [0, dst, const_idx] 3 words -// OP_INPUT (1): [1, dst, input_id] 3 words -// OP_MUL (2): [2, dst, n, s₀, e₀, ..., sₙ, eₙ] 3 + 2n words -// OP_LINCOMB (3): [3, dst, n, s₀, c₀, ..., sₙ, cₙ] 3 + 2n words -// OP_POLYEVAL(4): [4, dst, n, s₀, s₁, ..., sₙ] 3 + n words +// OP_CONST (0): [0, dst, const_idx] 3 words +// OP_INPUT (1): [1, dst, input_id] 3 words +// OP_MUL (2): [2, dst, n, s₀, e₀, ..., sₙ, eₙ] 3 + 2n words +// OP_LINCOMB (3): [3, dst, n, s₀, c₀, ..., sₙ, cₙ] 3 + 2n words +// OP_POLYEVAL(4): [4, dst, n, s₀, s₁, ..., sₙ] 3 + n words package symbolic // Opcodes — match CUDA kernel's switch cases exactly. @@ -44,9 +44,9 @@ const ( // Kind=OpPolyEval: Horner(Children[0]=x, Children[1..]=coefficients) type NodeOp struct { Kind int - Children []int // indices into nodes array (child < self) - Coeffs []int // LinComb: coefficients, Product: exponents - ConstVal [4]uint32 // E4 constant, [b0.a0, b0.a1, b1.a0, b1.a1] + Children []int // indices into nodes array (child < self) + Coeffs []int // LinComb: coefficients, Product: exponents + ConstVal [4]uint32 // E4 constant, [b0.a0, b0.a1, b1.a0, b1.a1] } // GPUProgram holds compiled bytecode ready for GPU evaluation. diff --git a/prover/gpu/symbolic/stub.go b/prover/gpu/symbolic/stub.go index fd6ec655976..b9b950a1407 100644 --- a/prover/gpu/symbolic/stub.go +++ b/prover/gpu/symbolic/stub.go @@ -34,10 +34,10 @@ type SymInput struct { Val [4]uint32 } -func SymInputFromVec(_ *vortex.KBVector) SymInput { panic("gpu: cuda required") } -func SymInputFromRotatedVec(_ *vortex.KBVector, _ int) SymInput { panic("gpu: cuda required") } -func SymInputFromE4Vec(_ *vortex.KBVector) SymInput { panic("gpu: cuda required") } -func SymInputFromConst(_ fext.E4) SymInput { panic("gpu: cuda required") } +func SymInputFromVec(_ *vortex.KBVector) SymInput { panic("gpu: cuda required") } +func SymInputFromRotatedVec(_ *vortex.KBVector, _ int) SymInput { panic("gpu: cuda required") } +func SymInputFromE4Vec(_ *vortex.KBVector) SymInput { panic("gpu: cuda required") } +func SymInputFromConst(_ fext.E4) SymInput { panic("gpu: cuda required") } func EvalSymGPU(_ *gpu.Device, _ *GPUSymProgram, _ []SymInput, _ int) []fext.E4 { panic("gpu: cuda required") diff --git a/prover/gpu/symbolic/symbolic_test.go b/prover/gpu/symbolic/symbolic_test.go index 739b02514c6..d5788e0eed2 100644 --- a/prover/gpu/symbolic/symbolic_test.go +++ b/prover/gpu/symbolic/symbolic_test.go @@ -223,10 +223,10 @@ func TestGPUSymEval_PolyEval(t *testing.T) { five.SetUint64(5) nodes := []symbolic.NodeOp{ - {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(two[0]), 0, 0, 0}}, // x=2 + {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(two[0]), 0, 0, 0}}, // x=2 {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(three_[0]), 0, 0, 0}}, // c₀=3 {Kind: symbolic.OpConst, ConstVal: [4]uint32{uint32(five[0]), 0, 0, 0}}, // c₁=5 - {Kind: symbolic.OpPolyEval, Children: []int{0, 1, 2}}, // P(x) = c₀ + c₁·x + {Kind: symbolic.OpPolyEval, Children: []int{0, 1, 2}}, // P(x) = c₀ + c₁·x } pgm := symbolic.CompileGPU(nodes) diff --git a/prover/gpu/vortex/gpu_test.go b/prover/gpu/vortex/gpu_test.go index 3c6f572df77..f6d951873b3 100644 --- a/prover/gpu/vortex/gpu_test.go +++ b/prover/gpu/vortex/gpu_test.go @@ -526,6 +526,6 @@ func TestGPUVortexLinCombRate16(t *testing.T) { } } -func BenchmarkCommit_64x16_rate16(b *testing.B) { benchCommit(b, 64, 16, 16) } +func BenchmarkCommit_64x16_rate16(b *testing.B) { benchCommit(b, 64, 16, 16) } func BenchmarkCommit_256x64_rate16(b *testing.B) { benchCommit(b, 256, 64, 16) } func BenchmarkCommit_1024x128_rate16(b *testing.B) { benchCommit(b, 1024, 128, 16) } diff --git a/prover/gpu/vortex/stub.go b/prover/gpu/vortex/stub.go index 6e45151da05..ad5ed7c2f4c 100644 --- a/prover/gpu/vortex/stub.go +++ b/prover/gpu/vortex/stub.go @@ -16,19 +16,19 @@ import ( type KBVector struct{} -func NewKBVector(_ *gpu.Device, _ int) (*KBVector, error) { panic("gpu: cuda required") } -func (v *KBVector) Free() {} -func (v *KBVector) Len() int { return 0 } -func (v *KBVector) CopyFromHost(_ []koalabear.Element) { panic("gpu: cuda required") } -func (v *KBVector) CopyToHost(_ []koalabear.Element) { panic("gpu: cuda required") } -func (v *KBVector) Add(_, _ *KBVector) { panic("gpu: cuda required") } -func (v *KBVector) Sub(_, _ *KBVector) { panic("gpu: cuda required") } -func (v *KBVector) Mul(_, _ *KBVector) { panic("gpu: cuda required") } -func (v *KBVector) Scale(_ koalabear.Element) { panic("gpu: cuda required") } -func (v *KBVector) ScaleByPowers(_ koalabear.Element) { panic("gpu: cuda required") } -func (v *KBVector) BitReverse() { panic("gpu: cuda required") } -func (v *KBVector) CopyFromDevice(_ *KBVector) { panic("gpu: cuda required") } -func (v *KBVector) DevicePtr() unsafe.Pointer { panic("gpu: cuda required") } +func NewKBVector(_ *gpu.Device, _ int) (*KBVector, error) { panic("gpu: cuda required") } +func (v *KBVector) Free() {} +func (v *KBVector) Len() int { return 0 } +func (v *KBVector) CopyFromHost(_ []koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) CopyToHost(_ []koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) Add(_, _ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) Sub(_, _ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) Mul(_, _ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) Scale(_ koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) ScaleByPowers(_ koalabear.Element) { panic("gpu: cuda required") } +func (v *KBVector) BitReverse() { panic("gpu: cuda required") } +func (v *KBVector) CopyFromDevice(_ *KBVector) { panic("gpu: cuda required") } +func (v *KBVector) DevicePtr() unsafe.Pointer { panic("gpu: cuda required") } // ─── GPUFFTDomain ──────────────────────────────────────────────────────────── From c3797565e1de0eb1491e148d1a33a6428577d613 Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 12 May 2026 14:42:58 +0000 Subject: [PATCH 3/4] chore(ci): revert mainnet-limitless dev paths; add cuda vet smoke check - config-mainnet-limitless.toml: restore relative paths (dev-host absolute paths leaked into the committed prod config). - prover-testing.yml: run `go vet -tags=cuda ./gpu/...` in the static check job so CPU refactors that break GPU compilation are caught. vet compiles but does not link, so no CUDA toolchain needed. Co-Authored-By: Claude Opus 4.7 (1M context) --- .github/workflows/prover-testing.yml | 6 ++++++ prover/config/config-mainnet-limitless.toml | 4 ++-- 2 files changed, 8 insertions(+), 2 deletions(-) diff --git a/.github/workflows/prover-testing.yml b/.github/workflows/prover-testing.yml index b15ecc23c62..619aac996f6 100644 --- a/.github/workflows/prover-testing.yml +++ b/.github/workflows/prover-testing.yml @@ -40,6 +40,12 @@ jobs: - name: gofmt working-directory: prover run: if [[ -n $(gofmt -l .) ]]; then echo "please run gofmt"; exit 1; fi + - name: go vet (cuda build tag, smoke) + working-directory: prover + # Type-checks GPU code under `-tags=cuda` so CPU-side refactors that + # break GPU compilation are caught here. `go vet` compiles but does + # not link, so libgnark_gpu.a / CUDA toolchain are not required. + run: go vet -tags=cuda ./gpu/... - name: golangci-lint uses: golangci/golangci-lint-action@4afd733a84b1f43292c63897423277bb7f4313a9 #v8 with: diff --git a/prover/config/config-mainnet-limitless.toml b/prover/config/config-mainnet-limitless.toml index 6d4f6d1c4d6..391be15f952 100644 --- a/prover/config/config-mainnet-limitless.toml +++ b/prover/config/config-mainnet-limitless.toml @@ -10,8 +10,8 @@ termination_grace_period_seconds = 1800 [execution] prover_mode = "limitless" -conflated_traces_dir = "/home/ubuntu/testdata" -requests_root_dir = "/home/ubuntu/testdata/execution" +conflated_traces_dir = "./" +requests_root_dir = "./" limitless_with_debug = false ignore_compatibility_check = false keep_traces_until_block = 0 From 066520e37c3c6de268bb31d5ce3d76ff2ea070cb Mon Sep 17 00:00:00 2001 From: Gautam Botrel Date: Tue, 12 May 2026 15:04:05 +0000 Subject: [PATCH 4/4] test(gpu/plonk2): add Fiat-Shamir transcript parity tests MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Adds deterministic byte-level parity checks between the GPU plonk prover's Fiat-Shamir helpers and the audited gnark CPU construction. * TestFiatShamirChallengeParity (+ NoBsb22 variant) — replays the four prover challenges (gamma, beta, alpha, zeta) through the GPU's bindPublicData/deriveRandomness helpers and compares each derived fr.Element against an inline reference built directly on gnark-crypto's public fiat-shamir API. The reference mirrors gnark CPU's exact bind order from backend/plonk/{curve}/{prove, verify}.go. * TestFiatShamirBatchOpenParity — exercises gpuBatchOpen's KZG-folding FS instance against gnark-crypto's kzg.BatchOpenSinglePoint on identical synthetic inputs (same polys, digests, claimed values, point, dataTranscript, SRS, and folding hash). When the gamma folding challenge matches byte-for-byte, the quotient commitment H is bit-identical; any FS drift yields a different H. Generated for bn254, bls12377, bw6761 via the existing template pipeline. All 9 tests pass locally on RTX PRO 6000 Blackwell. Co-Authored-By: Claude Opus 4.7 (1M context) --- .../gpu/internal/generator/plonk/generate.go | 1 + .../plonk/template/fs_parity_test.go.tmpl | 302 ++++++++++++++++++ .../generator/plonk/template/templates.go | 3 + prover/gpu/plonk2/bls12377/fs_parity_test.go | 302 ++++++++++++++++++ prover/gpu/plonk2/bn254/fs_parity_test.go | 302 ++++++++++++++++++ prover/gpu/plonk2/bw6761/fs_parity_test.go | 302 ++++++++++++++++++ 6 files changed, 1212 insertions(+) create mode 100644 prover/gpu/internal/generator/plonk/template/fs_parity_test.go.tmpl create mode 100644 prover/gpu/plonk2/bls12377/fs_parity_test.go create mode 100644 prover/gpu/plonk2/bn254/fs_parity_test.go create mode 100644 prover/gpu/plonk2/bw6761/fs_parity_test.go diff --git a/prover/gpu/internal/generator/plonk/generate.go b/prover/gpu/internal/generator/plonk/generate.go index fe7db1552cf..9120d3c685c 100644 --- a/prover/gpu/internal/generator/plonk/generate.go +++ b/prover/gpu/internal/generator/plonk/generate.go @@ -36,6 +36,7 @@ func Generate(c config.Curve, outputDir string, gen *common.Generator) error { {filepath.Join(outputDir, "prove.go"), tmpl.ProveTemplate}, {filepath.Join(outputDir, "prove_stub.go"), tmpl.ProveStubTemplate}, {filepath.Join(outputDir, "plonk_test.go"), tmpl.PlonkTestTemplate}, + {filepath.Join(outputDir, "fs_parity_test.go"), tmpl.FSParityTestTemplate}, } for _, e := range entries { diff --git a/prover/gpu/internal/generator/plonk/template/fs_parity_test.go.tmpl b/prover/gpu/internal/generator/plonk/template/fs_parity_test.go.tmpl new file mode 100644 index 00000000000..7173b5bf6c9 --- /dev/null +++ b/prover/gpu/internal/generator/plonk/template/fs_parity_test.go.tmpl @@ -0,0 +1,302 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package {{.Package}} + +// FS transcript parity tests — guard the GPU prover against silent drift +// from the audited gnark CPU Fiat-Shamir construction. +// +// Two layers: +// 1. TestFiatShamirChallengeParity — replays the four main challenges +// (gamma, beta, alpha, zeta) through the GPU's `bindPublicData` / +// `deriveRandomness` helpers and compares each derived fr.Element byte- +// for-byte against an inline reference built directly on gnark-crypto's +// public fiat-shamir API. The reference mirrors gnark CPU's exact bind +// order (see backend/plonk/{{.Package}}/verify.go's bindPublicData / +// deriveRandomness and prove.go's deriveGammaAndBeta / deriveAlpha / +// deriveZeta). Any change to the GPU helpers that deviates from the +// audited byte stream fails this test deterministically. +// +// 2. TestFiatShamirBatchOpenParity — exercises gpuBatchOpen's second FS +// instance (kzg folding) against gnark-crypto's kzg.BatchOpenSinglePoint +// on identical synthetic inputs (same polys, digests, claimed values, +// point, dataTranscript, and folding hash). When the gamma folding +// challenge matches, the quotient commitment H is bit-identical. + +import ( + "crypto/sha256" + "math/big" + "testing" + + curve "{{.GnarkCurve}}" + fr "{{.GnarkCryptoFr}}" + kzg "{{.GnarkCryptoKZG}}" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + curplonk "{{.GnarkPlonk}}" + "github.com/stretchr/testify/require" +) + +// genG1 returns [s+1]·G for a deterministic test point. s+1 avoids the +// identity element when s == 0. +func genG1(s uint64) curve.G1Affine { + var bi big.Int + bi.SetUint64(s + 1) + var p curve.G1Affine + p.ScalarMultiplicationBase(&bi) + return p +} + +// genFr returns a deterministic non-zero fr.Element. +func genFr(s uint64) fr.Element { + var x fr.Element + x.SetUint64(s + 1) + return x +} + +// referenceChallenges replays the gnark CPU FS bind sequence using only the +// public fiat-shamir API, returning (gamma, beta, alpha, zeta). Any deviation +// from gnark CPU's audited byte stream — see backend/plonk/{{.Package}}/{prove, +// verify}.go — is captured here. +func referenceChallenges( + t *testing.T, + vk *curplonk.VerifyingKey, + publicInputs []fr.Element, + lro [3]curve.G1Affine, + bsb22 []curve.G1Affine, + z curve.G1Affine, + h [3]curve.G1Affine, +) (gamma, beta, alpha, zeta fr.Element) { + t.Helper() + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + + bind := func(name string, b []byte) { + require.NoError(t, fs.Bind(name, b)) + } + bindPoint := func(name string, p *curve.G1Affine) { + buf := p.RawBytes() + bind(name, buf[:]) + } + + // ── gamma: bindPublicData order is S[0..2], Ql, Qr, Qm, Qo, Qk, Qcp..., PI... + bind("gamma", vk.S[0].Marshal()) + bind("gamma", vk.S[1].Marshal()) + bind("gamma", vk.S[2].Marshal()) + bind("gamma", vk.Ql.Marshal()) + bind("gamma", vk.Qr.Marshal()) + bind("gamma", vk.Qm.Marshal()) + bind("gamma", vk.Qo.Marshal()) + bind("gamma", vk.Qk.Marshal()) + for i := range vk.Qcp { + bind("gamma", vk.Qcp[i].Marshal()) + } + for i := range publicInputs { + bind("gamma", publicInputs[i].Marshal()) + } + // then deriveRandomness on LRO + bindPoint("gamma", &lro[0]) + bindPoint("gamma", &lro[1]) + bindPoint("gamma", &lro[2]) + bGamma, err := fs.ComputeChallenge("gamma") + require.NoError(t, err) + gamma.SetBytes(bGamma) + + // ── beta: no bind (gnark CPU calls fs.ComputeChallenge("beta") directly) + bBeta, err := fs.ComputeChallenge("beta") + require.NoError(t, err) + beta.SetBytes(bBeta) + + // ── alpha: bind Bsb22... then Z + for i := range bsb22 { + bindPoint("alpha", &bsb22[i]) + } + bindPoint("alpha", &z) + bAlpha, err := fs.ComputeChallenge("alpha") + require.NoError(t, err) + alpha.SetBytes(bAlpha) + + // ── zeta: bind H[0..2] + bindPoint("zeta", &h[0]) + bindPoint("zeta", &h[1]) + bindPoint("zeta", &h[2]) + bZeta, err := fs.ComputeChallenge("zeta") + require.NoError(t, err) + zeta.SetBytes(bZeta) + + return +} + +func TestFiatShamirChallengeParity(t *testing.T) { + // Synthetic but deterministic inputs. Distinct points so a bind-order + // regression cannot be masked by aliasing. + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + vk.Qcp = []kzg.Digest{genG1(9), genG1(10), genG1(11)} + + publicInputs := make([]fr.Element, 5) + for i := range publicInputs { + publicInputs[i] = genFr(uint64(100 + i)) + } + + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + bsb22 := []curve.G1Affine{genG1(30), genG1(31)} + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + // GPU path — drive the same transcript through the GPU helpers exactly + // as gpuProver does (see deriveGammaBeta / buildZAndCommit / + // computeQuotientAndCommit in prove.go). + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaDeps := make([]*curve.G1Affine, 0, len(bsb22)+1) + for i := range bsb22 { + alphaDeps = append(alphaDeps, &bsb22[i]) + } + alphaDeps = append(alphaDeps, &z) + alphaGPU, err := deriveRandomness(fs, "alpha", alphaDeps...) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), + "gamma diverged from gnark CPU FS pattern: ref=%s gpu=%s", gammaRef.String(), gammaGPU.String()) + require.True(t, betaRef.Equal(&betaGPU), + "beta diverged: ref=%s gpu=%s", betaRef.String(), betaGPU.String()) + require.True(t, alphaRef.Equal(&alphaGPU), + "alpha diverged: ref=%s gpu=%s", alphaRef.String(), alphaGPU.String()) + require.True(t, zetaRef.Equal(&zetaGPU), + "zeta diverged: ref=%s gpu=%s", zetaRef.String(), zetaGPU.String()) +} + +// TestFiatShamirChallengeParity_NoBsb22 covers the no-commitment branch: +// alpha is then bound to Z alone, which is the common case for circuits +// without custom Bsb22 commitments. +func TestFiatShamirChallengeParity_NoBsb22(t *testing.T) { + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + // vk.Qcp empty. + + publicInputs := []fr.Element{genFr(100)} + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + var bsb22 []curve.G1Affine + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaGPU, err := deriveRandomness(fs, "alpha", &z) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), "gamma diverged (no-Bsb22 path)") + require.True(t, betaRef.Equal(&betaGPU), "beta diverged (no-Bsb22 path)") + require.True(t, alphaRef.Equal(&alphaGPU), "alpha diverged (no-Bsb22 path)") + require.True(t, zetaRef.Equal(&zetaGPU), "zeta diverged (no-Bsb22 path)") +} + +// TestFiatShamirBatchOpenParity exercises gpuBatchOpen's KZG-folding FS +// instance against gnark-crypto's kzg.BatchOpenSinglePoint. The two are run +// on identical synthetic inputs sharing a synthetic SRS. When the folding +// gamma matches byte-for-byte, the quotient commitment H is identical; +// any FS drift produces a different H. +func TestFiatShamirBatchOpenParity(t *testing.T) { + // Synthetic KZG SRS: G1[i] = [i+1]·G. Not a valid powers-of-tau SRS, but + // kzg.Commit just performs an MSM over pk.G1[:len(poly)] — both + // BatchOpenSinglePoint and gpuBatchOpen consume the same SRS, so the + // shared bias cancels and H equality reduces to FS-gamma equality. + const srsSize = 64 + pkG1 := make([]curve.G1Affine, srsSize) + for i := range pkG1 { + pkG1[i] = genG1(uint64(i)) + } + srsPK := kzg.ProvingKey{G1: pkG1} + + // Three polynomials of varying length, deterministic coefficients. + polys := make([][]fr.Element, 3) + polys[0] = make([]fr.Element, 16) + polys[1] = make([]fr.Element, 12) + polys[2] = make([]fr.Element, 8) + for i := range polys { + for j := range polys[i] { + polys[i][j] = genFr(uint64(1000*i + j)) + } + } + + digests := make([]curve.G1Affine, len(polys)) + for i := range polys { + d, err := kzg.Commit(polys[i], srsPK) + require.NoError(t, err) + digests[i] = d + } + + point := genFr(7) + + // Evaluate each poly at `point` (Horner) — gpuBatchOpen consumes these + // as inputs; BatchOpenSinglePoint computes them internally from the + // same coefficients, so the two must agree. + claimedValues := make([]fr.Element, len(polys)) + for i, p := range polys { + var v fr.Element + for j := len(p) - 1; j >= 0; j-- { + v.Mul(&v, &point) + v.Add(&v, &p[j]) + } + claimedValues[i] = v + } + + dataTranscript := []byte{0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67} + + cpuProof, err := kzg.BatchOpenSinglePoint( + polys, digests, point, sha256.New(), srsPK, dataTranscript, + ) + require.NoError(t, err) + + gpuCommit := func(p []fr.Element) (curve.G1Affine, error) { + return kzg.Commit(p, srsPK) + } + gpuProof, err := gpuBatchOpen( + gpuCommit, polys, digests, claimedValues, point, + sha256.New(), dataTranscript, + ) + require.NoError(t, err) + + require.True(t, cpuProof.H.Equal(&gpuProof.H), + "batch-open H differs — FS gamma folding diverged from gnark-crypto deriveGamma") + require.Equal(t, len(cpuProof.ClaimedValues), len(gpuProof.ClaimedValues), + "claimed values length mismatch") + for i := range cpuProof.ClaimedValues { + require.True(t, cpuProof.ClaimedValues[i].Equal(&gpuProof.ClaimedValues[i]), + "claimed value %d mismatch", i) + } +} diff --git a/prover/gpu/internal/generator/plonk/template/templates.go b/prover/gpu/internal/generator/plonk/template/templates.go index 824d6126ccd..4fdd406854f 100644 --- a/prover/gpu/internal/generator/plonk/template/templates.go +++ b/prover/gpu/internal/generator/plonk/template/templates.go @@ -52,3 +52,6 @@ var ProveStubTemplate string //go:embed plonk_test.go.tmpl var PlonkTestTemplate string + +//go:embed fs_parity_test.go.tmpl +var FSParityTestTemplate string diff --git a/prover/gpu/plonk2/bls12377/fs_parity_test.go b/prover/gpu/plonk2/bls12377/fs_parity_test.go new file mode 100644 index 00000000000..80cd1124ca3 --- /dev/null +++ b/prover/gpu/plonk2/bls12377/fs_parity_test.go @@ -0,0 +1,302 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bls12377 + +// FS transcript parity tests — guard the GPU prover against silent drift +// from the audited gnark CPU Fiat-Shamir construction. +// +// Two layers: +// 1. TestFiatShamirChallengeParity — replays the four main challenges +// (gamma, beta, alpha, zeta) through the GPU's `bindPublicData` / +// `deriveRandomness` helpers and compares each derived fr.Element byte- +// for-byte against an inline reference built directly on gnark-crypto's +// public fiat-shamir API. The reference mirrors gnark CPU's exact bind +// order (see backend/plonk/bls12377/verify.go's bindPublicData / +// deriveRandomness and prove.go's deriveGammaAndBeta / deriveAlpha / +// deriveZeta). Any change to the GPU helpers that deviates from the +// audited byte stream fails this test deterministically. +// +// 2. TestFiatShamirBatchOpenParity — exercises gpuBatchOpen's second FS +// instance (kzg folding) against gnark-crypto's kzg.BatchOpenSinglePoint +// on identical synthetic inputs (same polys, digests, claimed values, +// point, dataTranscript, and folding hash). When the gamma folding +// challenge matches, the quotient commitment H is bit-identical. + +import ( + "crypto/sha256" + "math/big" + "testing" + + curve "github.com/consensys/gnark-crypto/ecc/bls12-377" + fr "github.com/consensys/gnark-crypto/ecc/bls12-377/fr" + kzg "github.com/consensys/gnark-crypto/ecc/bls12-377/kzg" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + curplonk "github.com/consensys/gnark/backend/plonk/bls12-377" + "github.com/stretchr/testify/require" +) + +// genG1 returns [s+1]·G for a deterministic test point. s+1 avoids the +// identity element when s == 0. +func genG1(s uint64) curve.G1Affine { + var bi big.Int + bi.SetUint64(s + 1) + var p curve.G1Affine + p.ScalarMultiplicationBase(&bi) + return p +} + +// genFr returns a deterministic non-zero fr.Element. +func genFr(s uint64) fr.Element { + var x fr.Element + x.SetUint64(s + 1) + return x +} + +// referenceChallenges replays the gnark CPU FS bind sequence using only the +// public fiat-shamir API, returning (gamma, beta, alpha, zeta). Any deviation +// from gnark CPU's audited byte stream — see backend/plonk/bls12377/{prove, +// verify}.go — is captured here. +func referenceChallenges( + t *testing.T, + vk *curplonk.VerifyingKey, + publicInputs []fr.Element, + lro [3]curve.G1Affine, + bsb22 []curve.G1Affine, + z curve.G1Affine, + h [3]curve.G1Affine, +) (gamma, beta, alpha, zeta fr.Element) { + t.Helper() + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + + bind := func(name string, b []byte) { + require.NoError(t, fs.Bind(name, b)) + } + bindPoint := func(name string, p *curve.G1Affine) { + buf := p.RawBytes() + bind(name, buf[:]) + } + + // ── gamma: bindPublicData order is S[0..2], Ql, Qr, Qm, Qo, Qk, Qcp..., PI... + bind("gamma", vk.S[0].Marshal()) + bind("gamma", vk.S[1].Marshal()) + bind("gamma", vk.S[2].Marshal()) + bind("gamma", vk.Ql.Marshal()) + bind("gamma", vk.Qr.Marshal()) + bind("gamma", vk.Qm.Marshal()) + bind("gamma", vk.Qo.Marshal()) + bind("gamma", vk.Qk.Marshal()) + for i := range vk.Qcp { + bind("gamma", vk.Qcp[i].Marshal()) + } + for i := range publicInputs { + bind("gamma", publicInputs[i].Marshal()) + } + // then deriveRandomness on LRO + bindPoint("gamma", &lro[0]) + bindPoint("gamma", &lro[1]) + bindPoint("gamma", &lro[2]) + bGamma, err := fs.ComputeChallenge("gamma") + require.NoError(t, err) + gamma.SetBytes(bGamma) + + // ── beta: no bind (gnark CPU calls fs.ComputeChallenge("beta") directly) + bBeta, err := fs.ComputeChallenge("beta") + require.NoError(t, err) + beta.SetBytes(bBeta) + + // ── alpha: bind Bsb22... then Z + for i := range bsb22 { + bindPoint("alpha", &bsb22[i]) + } + bindPoint("alpha", &z) + bAlpha, err := fs.ComputeChallenge("alpha") + require.NoError(t, err) + alpha.SetBytes(bAlpha) + + // ── zeta: bind H[0..2] + bindPoint("zeta", &h[0]) + bindPoint("zeta", &h[1]) + bindPoint("zeta", &h[2]) + bZeta, err := fs.ComputeChallenge("zeta") + require.NoError(t, err) + zeta.SetBytes(bZeta) + + return +} + +func TestFiatShamirChallengeParity(t *testing.T) { + // Synthetic but deterministic inputs. Distinct points so a bind-order + // regression cannot be masked by aliasing. + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + vk.Qcp = []kzg.Digest{genG1(9), genG1(10), genG1(11)} + + publicInputs := make([]fr.Element, 5) + for i := range publicInputs { + publicInputs[i] = genFr(uint64(100 + i)) + } + + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + bsb22 := []curve.G1Affine{genG1(30), genG1(31)} + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + // GPU path — drive the same transcript through the GPU helpers exactly + // as gpuProver does (see deriveGammaBeta / buildZAndCommit / + // computeQuotientAndCommit in prove.go). + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaDeps := make([]*curve.G1Affine, 0, len(bsb22)+1) + for i := range bsb22 { + alphaDeps = append(alphaDeps, &bsb22[i]) + } + alphaDeps = append(alphaDeps, &z) + alphaGPU, err := deriveRandomness(fs, "alpha", alphaDeps...) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), + "gamma diverged from gnark CPU FS pattern: ref=%s gpu=%s", gammaRef.String(), gammaGPU.String()) + require.True(t, betaRef.Equal(&betaGPU), + "beta diverged: ref=%s gpu=%s", betaRef.String(), betaGPU.String()) + require.True(t, alphaRef.Equal(&alphaGPU), + "alpha diverged: ref=%s gpu=%s", alphaRef.String(), alphaGPU.String()) + require.True(t, zetaRef.Equal(&zetaGPU), + "zeta diverged: ref=%s gpu=%s", zetaRef.String(), zetaGPU.String()) +} + +// TestFiatShamirChallengeParity_NoBsb22 covers the no-commitment branch: +// alpha is then bound to Z alone, which is the common case for circuits +// without custom Bsb22 commitments. +func TestFiatShamirChallengeParity_NoBsb22(t *testing.T) { + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + // vk.Qcp empty. + + publicInputs := []fr.Element{genFr(100)} + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + var bsb22 []curve.G1Affine + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaGPU, err := deriveRandomness(fs, "alpha", &z) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), "gamma diverged (no-Bsb22 path)") + require.True(t, betaRef.Equal(&betaGPU), "beta diverged (no-Bsb22 path)") + require.True(t, alphaRef.Equal(&alphaGPU), "alpha diverged (no-Bsb22 path)") + require.True(t, zetaRef.Equal(&zetaGPU), "zeta diverged (no-Bsb22 path)") +} + +// TestFiatShamirBatchOpenParity exercises gpuBatchOpen's KZG-folding FS +// instance against gnark-crypto's kzg.BatchOpenSinglePoint. The two are run +// on identical synthetic inputs sharing a synthetic SRS. When the folding +// gamma matches byte-for-byte, the quotient commitment H is identical; +// any FS drift produces a different H. +func TestFiatShamirBatchOpenParity(t *testing.T) { + // Synthetic KZG SRS: G1[i] = [i+1]·G. Not a valid powers-of-tau SRS, but + // kzg.Commit just performs an MSM over pk.G1[:len(poly)] — both + // BatchOpenSinglePoint and gpuBatchOpen consume the same SRS, so the + // shared bias cancels and H equality reduces to FS-gamma equality. + const srsSize = 64 + pkG1 := make([]curve.G1Affine, srsSize) + for i := range pkG1 { + pkG1[i] = genG1(uint64(i)) + } + srsPK := kzg.ProvingKey{G1: pkG1} + + // Three polynomials of varying length, deterministic coefficients. + polys := make([][]fr.Element, 3) + polys[0] = make([]fr.Element, 16) + polys[1] = make([]fr.Element, 12) + polys[2] = make([]fr.Element, 8) + for i := range polys { + for j := range polys[i] { + polys[i][j] = genFr(uint64(1000*i + j)) + } + } + + digests := make([]curve.G1Affine, len(polys)) + for i := range polys { + d, err := kzg.Commit(polys[i], srsPK) + require.NoError(t, err) + digests[i] = d + } + + point := genFr(7) + + // Evaluate each poly at `point` (Horner) — gpuBatchOpen consumes these + // as inputs; BatchOpenSinglePoint computes them internally from the + // same coefficients, so the two must agree. + claimedValues := make([]fr.Element, len(polys)) + for i, p := range polys { + var v fr.Element + for j := len(p) - 1; j >= 0; j-- { + v.Mul(&v, &point) + v.Add(&v, &p[j]) + } + claimedValues[i] = v + } + + dataTranscript := []byte{0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67} + + cpuProof, err := kzg.BatchOpenSinglePoint( + polys, digests, point, sha256.New(), srsPK, dataTranscript, + ) + require.NoError(t, err) + + gpuCommit := func(p []fr.Element) (curve.G1Affine, error) { + return kzg.Commit(p, srsPK) + } + gpuProof, err := gpuBatchOpen( + gpuCommit, polys, digests, claimedValues, point, + sha256.New(), dataTranscript, + ) + require.NoError(t, err) + + require.True(t, cpuProof.H.Equal(&gpuProof.H), + "batch-open H differs — FS gamma folding diverged from gnark-crypto deriveGamma") + require.Equal(t, len(cpuProof.ClaimedValues), len(gpuProof.ClaimedValues), + "claimed values length mismatch") + for i := range cpuProof.ClaimedValues { + require.True(t, cpuProof.ClaimedValues[i].Equal(&gpuProof.ClaimedValues[i]), + "claimed value %d mismatch", i) + } +} diff --git a/prover/gpu/plonk2/bn254/fs_parity_test.go b/prover/gpu/plonk2/bn254/fs_parity_test.go new file mode 100644 index 00000000000..e402c59235c --- /dev/null +++ b/prover/gpu/plonk2/bn254/fs_parity_test.go @@ -0,0 +1,302 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bn254 + +// FS transcript parity tests — guard the GPU prover against silent drift +// from the audited gnark CPU Fiat-Shamir construction. +// +// Two layers: +// 1. TestFiatShamirChallengeParity — replays the four main challenges +// (gamma, beta, alpha, zeta) through the GPU's `bindPublicData` / +// `deriveRandomness` helpers and compares each derived fr.Element byte- +// for-byte against an inline reference built directly on gnark-crypto's +// public fiat-shamir API. The reference mirrors gnark CPU's exact bind +// order (see backend/plonk/bn254/verify.go's bindPublicData / +// deriveRandomness and prove.go's deriveGammaAndBeta / deriveAlpha / +// deriveZeta). Any change to the GPU helpers that deviates from the +// audited byte stream fails this test deterministically. +// +// 2. TestFiatShamirBatchOpenParity — exercises gpuBatchOpen's second FS +// instance (kzg folding) against gnark-crypto's kzg.BatchOpenSinglePoint +// on identical synthetic inputs (same polys, digests, claimed values, +// point, dataTranscript, and folding hash). When the gamma folding +// challenge matches, the quotient commitment H is bit-identical. + +import ( + "crypto/sha256" + "math/big" + "testing" + + curve "github.com/consensys/gnark-crypto/ecc/bn254" + fr "github.com/consensys/gnark-crypto/ecc/bn254/fr" + kzg "github.com/consensys/gnark-crypto/ecc/bn254/kzg" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + curplonk "github.com/consensys/gnark/backend/plonk/bn254" + "github.com/stretchr/testify/require" +) + +// genG1 returns [s+1]·G for a deterministic test point. s+1 avoids the +// identity element when s == 0. +func genG1(s uint64) curve.G1Affine { + var bi big.Int + bi.SetUint64(s + 1) + var p curve.G1Affine + p.ScalarMultiplicationBase(&bi) + return p +} + +// genFr returns a deterministic non-zero fr.Element. +func genFr(s uint64) fr.Element { + var x fr.Element + x.SetUint64(s + 1) + return x +} + +// referenceChallenges replays the gnark CPU FS bind sequence using only the +// public fiat-shamir API, returning (gamma, beta, alpha, zeta). Any deviation +// from gnark CPU's audited byte stream — see backend/plonk/bn254/{prove, +// verify}.go — is captured here. +func referenceChallenges( + t *testing.T, + vk *curplonk.VerifyingKey, + publicInputs []fr.Element, + lro [3]curve.G1Affine, + bsb22 []curve.G1Affine, + z curve.G1Affine, + h [3]curve.G1Affine, +) (gamma, beta, alpha, zeta fr.Element) { + t.Helper() + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + + bind := func(name string, b []byte) { + require.NoError(t, fs.Bind(name, b)) + } + bindPoint := func(name string, p *curve.G1Affine) { + buf := p.RawBytes() + bind(name, buf[:]) + } + + // ── gamma: bindPublicData order is S[0..2], Ql, Qr, Qm, Qo, Qk, Qcp..., PI... + bind("gamma", vk.S[0].Marshal()) + bind("gamma", vk.S[1].Marshal()) + bind("gamma", vk.S[2].Marshal()) + bind("gamma", vk.Ql.Marshal()) + bind("gamma", vk.Qr.Marshal()) + bind("gamma", vk.Qm.Marshal()) + bind("gamma", vk.Qo.Marshal()) + bind("gamma", vk.Qk.Marshal()) + for i := range vk.Qcp { + bind("gamma", vk.Qcp[i].Marshal()) + } + for i := range publicInputs { + bind("gamma", publicInputs[i].Marshal()) + } + // then deriveRandomness on LRO + bindPoint("gamma", &lro[0]) + bindPoint("gamma", &lro[1]) + bindPoint("gamma", &lro[2]) + bGamma, err := fs.ComputeChallenge("gamma") + require.NoError(t, err) + gamma.SetBytes(bGamma) + + // ── beta: no bind (gnark CPU calls fs.ComputeChallenge("beta") directly) + bBeta, err := fs.ComputeChallenge("beta") + require.NoError(t, err) + beta.SetBytes(bBeta) + + // ── alpha: bind Bsb22... then Z + for i := range bsb22 { + bindPoint("alpha", &bsb22[i]) + } + bindPoint("alpha", &z) + bAlpha, err := fs.ComputeChallenge("alpha") + require.NoError(t, err) + alpha.SetBytes(bAlpha) + + // ── zeta: bind H[0..2] + bindPoint("zeta", &h[0]) + bindPoint("zeta", &h[1]) + bindPoint("zeta", &h[2]) + bZeta, err := fs.ComputeChallenge("zeta") + require.NoError(t, err) + zeta.SetBytes(bZeta) + + return +} + +func TestFiatShamirChallengeParity(t *testing.T) { + // Synthetic but deterministic inputs. Distinct points so a bind-order + // regression cannot be masked by aliasing. + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + vk.Qcp = []kzg.Digest{genG1(9), genG1(10), genG1(11)} + + publicInputs := make([]fr.Element, 5) + for i := range publicInputs { + publicInputs[i] = genFr(uint64(100 + i)) + } + + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + bsb22 := []curve.G1Affine{genG1(30), genG1(31)} + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + // GPU path — drive the same transcript through the GPU helpers exactly + // as gpuProver does (see deriveGammaBeta / buildZAndCommit / + // computeQuotientAndCommit in prove.go). + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaDeps := make([]*curve.G1Affine, 0, len(bsb22)+1) + for i := range bsb22 { + alphaDeps = append(alphaDeps, &bsb22[i]) + } + alphaDeps = append(alphaDeps, &z) + alphaGPU, err := deriveRandomness(fs, "alpha", alphaDeps...) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), + "gamma diverged from gnark CPU FS pattern: ref=%s gpu=%s", gammaRef.String(), gammaGPU.String()) + require.True(t, betaRef.Equal(&betaGPU), + "beta diverged: ref=%s gpu=%s", betaRef.String(), betaGPU.String()) + require.True(t, alphaRef.Equal(&alphaGPU), + "alpha diverged: ref=%s gpu=%s", alphaRef.String(), alphaGPU.String()) + require.True(t, zetaRef.Equal(&zetaGPU), + "zeta diverged: ref=%s gpu=%s", zetaRef.String(), zetaGPU.String()) +} + +// TestFiatShamirChallengeParity_NoBsb22 covers the no-commitment branch: +// alpha is then bound to Z alone, which is the common case for circuits +// without custom Bsb22 commitments. +func TestFiatShamirChallengeParity_NoBsb22(t *testing.T) { + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + // vk.Qcp empty. + + publicInputs := []fr.Element{genFr(100)} + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + var bsb22 []curve.G1Affine + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaGPU, err := deriveRandomness(fs, "alpha", &z) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), "gamma diverged (no-Bsb22 path)") + require.True(t, betaRef.Equal(&betaGPU), "beta diverged (no-Bsb22 path)") + require.True(t, alphaRef.Equal(&alphaGPU), "alpha diverged (no-Bsb22 path)") + require.True(t, zetaRef.Equal(&zetaGPU), "zeta diverged (no-Bsb22 path)") +} + +// TestFiatShamirBatchOpenParity exercises gpuBatchOpen's KZG-folding FS +// instance against gnark-crypto's kzg.BatchOpenSinglePoint. The two are run +// on identical synthetic inputs sharing a synthetic SRS. When the folding +// gamma matches byte-for-byte, the quotient commitment H is identical; +// any FS drift produces a different H. +func TestFiatShamirBatchOpenParity(t *testing.T) { + // Synthetic KZG SRS: G1[i] = [i+1]·G. Not a valid powers-of-tau SRS, but + // kzg.Commit just performs an MSM over pk.G1[:len(poly)] — both + // BatchOpenSinglePoint and gpuBatchOpen consume the same SRS, so the + // shared bias cancels and H equality reduces to FS-gamma equality. + const srsSize = 64 + pkG1 := make([]curve.G1Affine, srsSize) + for i := range pkG1 { + pkG1[i] = genG1(uint64(i)) + } + srsPK := kzg.ProvingKey{G1: pkG1} + + // Three polynomials of varying length, deterministic coefficients. + polys := make([][]fr.Element, 3) + polys[0] = make([]fr.Element, 16) + polys[1] = make([]fr.Element, 12) + polys[2] = make([]fr.Element, 8) + for i := range polys { + for j := range polys[i] { + polys[i][j] = genFr(uint64(1000*i + j)) + } + } + + digests := make([]curve.G1Affine, len(polys)) + for i := range polys { + d, err := kzg.Commit(polys[i], srsPK) + require.NoError(t, err) + digests[i] = d + } + + point := genFr(7) + + // Evaluate each poly at `point` (Horner) — gpuBatchOpen consumes these + // as inputs; BatchOpenSinglePoint computes them internally from the + // same coefficients, so the two must agree. + claimedValues := make([]fr.Element, len(polys)) + for i, p := range polys { + var v fr.Element + for j := len(p) - 1; j >= 0; j-- { + v.Mul(&v, &point) + v.Add(&v, &p[j]) + } + claimedValues[i] = v + } + + dataTranscript := []byte{0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67} + + cpuProof, err := kzg.BatchOpenSinglePoint( + polys, digests, point, sha256.New(), srsPK, dataTranscript, + ) + require.NoError(t, err) + + gpuCommit := func(p []fr.Element) (curve.G1Affine, error) { + return kzg.Commit(p, srsPK) + } + gpuProof, err := gpuBatchOpen( + gpuCommit, polys, digests, claimedValues, point, + sha256.New(), dataTranscript, + ) + require.NoError(t, err) + + require.True(t, cpuProof.H.Equal(&gpuProof.H), + "batch-open H differs — FS gamma folding diverged from gnark-crypto deriveGamma") + require.Equal(t, len(cpuProof.ClaimedValues), len(gpuProof.ClaimedValues), + "claimed values length mismatch") + for i := range cpuProof.ClaimedValues { + require.True(t, cpuProof.ClaimedValues[i].Equal(&gpuProof.ClaimedValues[i]), + "claimed value %d mismatch", i) + } +} diff --git a/prover/gpu/plonk2/bw6761/fs_parity_test.go b/prover/gpu/plonk2/bw6761/fs_parity_test.go new file mode 100644 index 00000000000..a2264c79100 --- /dev/null +++ b/prover/gpu/plonk2/bw6761/fs_parity_test.go @@ -0,0 +1,302 @@ +// Code generated by gpu/internal/generator DO NOT EDIT + +//go:build cuda + +package bw6761 + +// FS transcript parity tests — guard the GPU prover against silent drift +// from the audited gnark CPU Fiat-Shamir construction. +// +// Two layers: +// 1. TestFiatShamirChallengeParity — replays the four main challenges +// (gamma, beta, alpha, zeta) through the GPU's `bindPublicData` / +// `deriveRandomness` helpers and compares each derived fr.Element byte- +// for-byte against an inline reference built directly on gnark-crypto's +// public fiat-shamir API. The reference mirrors gnark CPU's exact bind +// order (see backend/plonk/bw6761/verify.go's bindPublicData / +// deriveRandomness and prove.go's deriveGammaAndBeta / deriveAlpha / +// deriveZeta). Any change to the GPU helpers that deviates from the +// audited byte stream fails this test deterministically. +// +// 2. TestFiatShamirBatchOpenParity — exercises gpuBatchOpen's second FS +// instance (kzg folding) against gnark-crypto's kzg.BatchOpenSinglePoint +// on identical synthetic inputs (same polys, digests, claimed values, +// point, dataTranscript, and folding hash). When the gamma folding +// challenge matches, the quotient commitment H is bit-identical. + +import ( + "crypto/sha256" + "math/big" + "testing" + + curve "github.com/consensys/gnark-crypto/ecc/bw6-761" + fr "github.com/consensys/gnark-crypto/ecc/bw6-761/fr" + kzg "github.com/consensys/gnark-crypto/ecc/bw6-761/kzg" + fiatshamir "github.com/consensys/gnark-crypto/fiat-shamir" + curplonk "github.com/consensys/gnark/backend/plonk/bw6-761" + "github.com/stretchr/testify/require" +) + +// genG1 returns [s+1]·G for a deterministic test point. s+1 avoids the +// identity element when s == 0. +func genG1(s uint64) curve.G1Affine { + var bi big.Int + bi.SetUint64(s + 1) + var p curve.G1Affine + p.ScalarMultiplicationBase(&bi) + return p +} + +// genFr returns a deterministic non-zero fr.Element. +func genFr(s uint64) fr.Element { + var x fr.Element + x.SetUint64(s + 1) + return x +} + +// referenceChallenges replays the gnark CPU FS bind sequence using only the +// public fiat-shamir API, returning (gamma, beta, alpha, zeta). Any deviation +// from gnark CPU's audited byte stream — see backend/plonk/bw6761/{prove, +// verify}.go — is captured here. +func referenceChallenges( + t *testing.T, + vk *curplonk.VerifyingKey, + publicInputs []fr.Element, + lro [3]curve.G1Affine, + bsb22 []curve.G1Affine, + z curve.G1Affine, + h [3]curve.G1Affine, +) (gamma, beta, alpha, zeta fr.Element) { + t.Helper() + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + + bind := func(name string, b []byte) { + require.NoError(t, fs.Bind(name, b)) + } + bindPoint := func(name string, p *curve.G1Affine) { + buf := p.RawBytes() + bind(name, buf[:]) + } + + // ── gamma: bindPublicData order is S[0..2], Ql, Qr, Qm, Qo, Qk, Qcp..., PI... + bind("gamma", vk.S[0].Marshal()) + bind("gamma", vk.S[1].Marshal()) + bind("gamma", vk.S[2].Marshal()) + bind("gamma", vk.Ql.Marshal()) + bind("gamma", vk.Qr.Marshal()) + bind("gamma", vk.Qm.Marshal()) + bind("gamma", vk.Qo.Marshal()) + bind("gamma", vk.Qk.Marshal()) + for i := range vk.Qcp { + bind("gamma", vk.Qcp[i].Marshal()) + } + for i := range publicInputs { + bind("gamma", publicInputs[i].Marshal()) + } + // then deriveRandomness on LRO + bindPoint("gamma", &lro[0]) + bindPoint("gamma", &lro[1]) + bindPoint("gamma", &lro[2]) + bGamma, err := fs.ComputeChallenge("gamma") + require.NoError(t, err) + gamma.SetBytes(bGamma) + + // ── beta: no bind (gnark CPU calls fs.ComputeChallenge("beta") directly) + bBeta, err := fs.ComputeChallenge("beta") + require.NoError(t, err) + beta.SetBytes(bBeta) + + // ── alpha: bind Bsb22... then Z + for i := range bsb22 { + bindPoint("alpha", &bsb22[i]) + } + bindPoint("alpha", &z) + bAlpha, err := fs.ComputeChallenge("alpha") + require.NoError(t, err) + alpha.SetBytes(bAlpha) + + // ── zeta: bind H[0..2] + bindPoint("zeta", &h[0]) + bindPoint("zeta", &h[1]) + bindPoint("zeta", &h[2]) + bZeta, err := fs.ComputeChallenge("zeta") + require.NoError(t, err) + zeta.SetBytes(bZeta) + + return +} + +func TestFiatShamirChallengeParity(t *testing.T) { + // Synthetic but deterministic inputs. Distinct points so a bind-order + // regression cannot be masked by aliasing. + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + vk.Qcp = []kzg.Digest{genG1(9), genG1(10), genG1(11)} + + publicInputs := make([]fr.Element, 5) + for i := range publicInputs { + publicInputs[i] = genFr(uint64(100 + i)) + } + + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + bsb22 := []curve.G1Affine{genG1(30), genG1(31)} + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + // GPU path — drive the same transcript through the GPU helpers exactly + // as gpuProver does (see deriveGammaBeta / buildZAndCommit / + // computeQuotientAndCommit in prove.go). + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaDeps := make([]*curve.G1Affine, 0, len(bsb22)+1) + for i := range bsb22 { + alphaDeps = append(alphaDeps, &bsb22[i]) + } + alphaDeps = append(alphaDeps, &z) + alphaGPU, err := deriveRandomness(fs, "alpha", alphaDeps...) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), + "gamma diverged from gnark CPU FS pattern: ref=%s gpu=%s", gammaRef.String(), gammaGPU.String()) + require.True(t, betaRef.Equal(&betaGPU), + "beta diverged: ref=%s gpu=%s", betaRef.String(), betaGPU.String()) + require.True(t, alphaRef.Equal(&alphaGPU), + "alpha diverged: ref=%s gpu=%s", alphaRef.String(), alphaGPU.String()) + require.True(t, zetaRef.Equal(&zetaGPU), + "zeta diverged: ref=%s gpu=%s", zetaRef.String(), zetaGPU.String()) +} + +// TestFiatShamirChallengeParity_NoBsb22 covers the no-commitment branch: +// alpha is then bound to Z alone, which is the common case for circuits +// without custom Bsb22 commitments. +func TestFiatShamirChallengeParity_NoBsb22(t *testing.T) { + var vk curplonk.VerifyingKey + vk.S[0] = genG1(1) + vk.S[1] = genG1(2) + vk.S[2] = genG1(3) + vk.Ql = genG1(4) + vk.Qr = genG1(5) + vk.Qm = genG1(6) + vk.Qo = genG1(7) + vk.Qk = genG1(8) + // vk.Qcp empty. + + publicInputs := []fr.Element{genFr(100)} + lro := [3]curve.G1Affine{genG1(20), genG1(21), genG1(22)} + var bsb22 []curve.G1Affine + z := genG1(40) + h := [3]curve.G1Affine{genG1(50), genG1(51), genG1(52)} + + gammaRef, betaRef, alphaRef, zetaRef := referenceChallenges( + t, &vk, publicInputs, lro, bsb22, z, h, + ) + + fs := fiatshamir.NewTranscript(sha256.New(), "gamma", "beta", "alpha", "zeta") + require.NoError(t, bindPublicData(fs, "gamma", &vk, publicInputs)) + gammaGPU, err := deriveRandomness(fs, "gamma", &lro[0], &lro[1], &lro[2]) + require.NoError(t, err) + betaGPU, err := deriveRandomness(fs, "beta") + require.NoError(t, err) + alphaGPU, err := deriveRandomness(fs, "alpha", &z) + require.NoError(t, err) + zetaGPU, err := deriveRandomness(fs, "zeta", &h[0], &h[1], &h[2]) + require.NoError(t, err) + + require.True(t, gammaRef.Equal(&gammaGPU), "gamma diverged (no-Bsb22 path)") + require.True(t, betaRef.Equal(&betaGPU), "beta diverged (no-Bsb22 path)") + require.True(t, alphaRef.Equal(&alphaGPU), "alpha diverged (no-Bsb22 path)") + require.True(t, zetaRef.Equal(&zetaGPU), "zeta diverged (no-Bsb22 path)") +} + +// TestFiatShamirBatchOpenParity exercises gpuBatchOpen's KZG-folding FS +// instance against gnark-crypto's kzg.BatchOpenSinglePoint. The two are run +// on identical synthetic inputs sharing a synthetic SRS. When the folding +// gamma matches byte-for-byte, the quotient commitment H is identical; +// any FS drift produces a different H. +func TestFiatShamirBatchOpenParity(t *testing.T) { + // Synthetic KZG SRS: G1[i] = [i+1]·G. Not a valid powers-of-tau SRS, but + // kzg.Commit just performs an MSM over pk.G1[:len(poly)] — both + // BatchOpenSinglePoint and gpuBatchOpen consume the same SRS, so the + // shared bias cancels and H equality reduces to FS-gamma equality. + const srsSize = 64 + pkG1 := make([]curve.G1Affine, srsSize) + for i := range pkG1 { + pkG1[i] = genG1(uint64(i)) + } + srsPK := kzg.ProvingKey{G1: pkG1} + + // Three polynomials of varying length, deterministic coefficients. + polys := make([][]fr.Element, 3) + polys[0] = make([]fr.Element, 16) + polys[1] = make([]fr.Element, 12) + polys[2] = make([]fr.Element, 8) + for i := range polys { + for j := range polys[i] { + polys[i][j] = genFr(uint64(1000*i + j)) + } + } + + digests := make([]curve.G1Affine, len(polys)) + for i := range polys { + d, err := kzg.Commit(polys[i], srsPK) + require.NoError(t, err) + digests[i] = d + } + + point := genFr(7) + + // Evaluate each poly at `point` (Horner) — gpuBatchOpen consumes these + // as inputs; BatchOpenSinglePoint computes them internally from the + // same coefficients, so the two must agree. + claimedValues := make([]fr.Element, len(polys)) + for i, p := range polys { + var v fr.Element + for j := len(p) - 1; j >= 0; j-- { + v.Mul(&v, &point) + v.Add(&v, &p[j]) + } + claimedValues[i] = v + } + + dataTranscript := []byte{0xde, 0xad, 0xbe, 0xef, 0x01, 0x23, 0x45, 0x67} + + cpuProof, err := kzg.BatchOpenSinglePoint( + polys, digests, point, sha256.New(), srsPK, dataTranscript, + ) + require.NoError(t, err) + + gpuCommit := func(p []fr.Element) (curve.G1Affine, error) { + return kzg.Commit(p, srsPK) + } + gpuProof, err := gpuBatchOpen( + gpuCommit, polys, digests, claimedValues, point, + sha256.New(), dataTranscript, + ) + require.NoError(t, err) + + require.True(t, cpuProof.H.Equal(&gpuProof.H), + "batch-open H differs — FS gamma folding diverged from gnark-crypto deriveGamma") + require.Equal(t, len(cpuProof.ClaimedValues), len(gpuProof.ClaimedValues), + "claimed values length mismatch") + for i := range cpuProof.ClaimedValues { + require.True(t, cpuProof.ClaimedValues[i].Equal(&gpuProof.ClaimedValues[i]), + "claimed value %d mismatch", i) + } +}