diff --git a/go.mod b/go.mod index 6541c95a7..cb0d9f927 100644 --- a/go.mod +++ b/go.mod @@ -65,7 +65,7 @@ require ( github.com/sylabs/sif/v2 v2.24.0 // indirect github.com/tchap/go-patricia/v2 v2.3.3 // indirect github.com/ulikunitz/xz v0.5.15 // indirect - github.com/vbatts/tar-split v0.12.2 // indirect + github.com/vbatts/tar-split v0.12.3 // indirect go.opentelemetry.io/auto/sdk v1.2.1 // indirect go.opentelemetry.io/contrib/instrumentation/net/http/otelhttp v0.67.0 // indirect go.opentelemetry.io/otel v1.42.0 // indirect diff --git a/go.sum b/go.sum index 9e55c8b59..b534b0296 100644 --- a/go.sum +++ b/go.sum @@ -147,8 +147,8 @@ github.com/tchap/go-patricia/v2 v2.3.3 h1:xfNEsODumaEcCcY3gI0hYPZ/PcpVv5ju6RMAhg github.com/tchap/go-patricia/v2 v2.3.3/go.mod h1:VZRHKAb53DLaG+nA9EaYYiaEx6YztwDlLElMsnSHD4k= github.com/ulikunitz/xz v0.5.15 h1:9DNdB5s+SgV3bQ2ApL10xRc35ck0DuIX/isZvIk+ubY= github.com/ulikunitz/xz v0.5.15/go.mod h1:nbz6k7qbPmH4IRqmfOplQw/tblSgqTqBwxkY0oWt/14= -github.com/vbatts/tar-split v0.12.2 h1:w/Y6tjxpeiFMR47yzZPlPj/FcPLpXbTUi/9H7d3CPa4= -github.com/vbatts/tar-split v0.12.2/go.mod h1:eF6B6i6ftWQcDqEn3/iGFRFRo8cBIMSJVOpnNdfTMFA= +github.com/vbatts/tar-split v0.12.3 h1:Cd46rkGXI3Td4yrVNwU8ripbxFaQbmesqhjBUUYAJSw= +github.com/vbatts/tar-split v0.12.3/go.mod h1:sQOc6OlqGCr7HkGx/IDBeKiTIvqhmj8KffNhEXG4Nq0= github.com/yuin/goldmark v1.1.27/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= github.com/yuin/goldmark v1.2.1/go.mod h1:3hX8gzYuyVAZsxl0MRgGTJEmQBFcNTphYh9decYSb74= go.opentelemetry.io/auto/sdk v1.2.1 h1:jXsnJ4Lmnqd11kwkBV2LgLoFMZKizbCi5fNZ/ipaZ64= diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/format.go b/vendor/github.com/vbatts/tar-split/archive/tar/format.go index 60977980c..6f31845e4 100644 --- a/vendor/github.com/vbatts/tar-split/archive/tar/format.go +++ b/vendor/github.com/vbatts/tar-split/archive/tar/format.go @@ -147,6 +147,12 @@ const ( // Max length of a special file (PAX header, GNU long name or link). // This matches the limit used by libarchive. maxSpecialFileSize = 1 << 20 + + // Maximum number of sparse file entries. + // We should never actually hit this limit + // (every sparse encoding will first be limited by maxSpecialFileSize), + // but this adds an additional layer of defense. + maxSparseFileEntries = 1 << 20 ) // blockPadding computes the number of bytes needed to pad offset up to the diff --git a/vendor/github.com/vbatts/tar-split/archive/tar/reader.go b/vendor/github.com/vbatts/tar-split/archive/tar/reader.go index a645c4160..ebe857977 100644 --- a/vendor/github.com/vbatts/tar-split/archive/tar/reader.go +++ b/vendor/github.com/vbatts/tar-split/archive/tar/reader.go @@ -537,7 +537,8 @@ func (tr *Reader) readOldGNUSparseMap(hdr *Header, blk *block) (sparseDatas, err } s := blk.GNU().Sparse() spd := make(sparseDatas, 0, s.MaxEntries()) - for { + totalSize := len(s) + for totalSize < maxSpecialFileSize { for i := 0; i < s.MaxEntries(); i++ { // This termination condition is identical to GNU and BSD tar. if s.Entry(i).Offset()[0] == 0x00 { @@ -548,7 +549,11 @@ func (tr *Reader) readOldGNUSparseMap(hdr *Header, blk *block) (sparseDatas, err if p.err != nil { return nil, p.err } - spd = append(spd, sparseEntry{Offset: offset, Length: length}) + var err error + spd, err = appendSparseEntry(spd, sparseEntry{Offset: offset, Length: length}) + if err != nil { + return nil, err + } } if s.IsExtended()[0] > 0 { @@ -560,10 +565,12 @@ func (tr *Reader) readOldGNUSparseMap(hdr *Header, blk *block) (sparseDatas, err tr.rawBytes.Write(blk[:]) } s = blk.Sparse() + totalSize += len(s) continue } return spd, nil // Done } + return nil, errSparseTooLong } // readGNUSparseMap1x0 reads the sparse map as stored in GNU's PAX sparse format @@ -636,7 +643,10 @@ func readGNUSparseMap1x0(r io.Reader) (sparseDatas, error) { if err1 != nil || err2 != nil { return nil, ErrHeader } - spd = append(spd, sparseEntry{Offset: offset, Length: length}) + spd, err = appendSparseEntry(spd, sparseEntry{Offset: offset, Length: length}) + if err != nil { + return nil, err + } } return spd, nil } @@ -670,12 +680,22 @@ func readGNUSparseMap0x1(paxHdrs map[string]string) (sparseDatas, error) { if err1 != nil || err2 != nil { return nil, ErrHeader } - spd = append(spd, sparseEntry{Offset: offset, Length: length}) + spd, err = appendSparseEntry(spd, sparseEntry{Offset: offset, Length: length}) + if err != nil { + return nil, err + } sparseMap = sparseMap[2:] } return spd, nil } +func appendSparseEntry(spd sparseDatas, ent sparseEntry) (sparseDatas, error) { + if len(spd) >= maxSparseFileEntries { + return nil, errSparseTooLong + } + return append(spd, ent), nil +} + // Read reads from the current file in the tar archive. // It returns (0, io.EOF) when it reaches the end of that file, // until Next is called to advance to the next file. diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go index 80c2522af..a17b6eac1 100644 --- a/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go +++ b/vendor/github.com/vbatts/tar-split/tar/asm/disassemble.go @@ -1,156 +1,237 @@ package asm import ( + "errors" "io" "github.com/vbatts/tar-split/archive/tar" "github.com/vbatts/tar-split/tar/storage" ) -// NewInputTarStream wraps the Reader stream of a tar archive and provides a -// Reader stream of the same. +// runInputTarStreamGoroutine is the goroutine entrypoint. // -// In the middle it will pack the segments and file metadata to storage.Packer -// `p`. +// It centralizes the goroutine protocol so the core parsing logic can be +// written as ordinary Go code that just "returns an error". // -// The the storage.FilePutter is where payload of files in the stream are -// stashed. If this stashing is not needed, you can provide a nil -// storage.FilePutter. Since the checksumming is still needed, then a default -// of NewDiscardFilePutter will be used internally -func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) { - // What to do here... folks will want their own access to the Reader that is - // their tar archive stream, but we'll need that same stream to use our - // forked 'archive/tar'. - // Perhaps do an io.TeeReader that hands back an io.Reader for them to read - // from, and we'll MITM the stream to store metadata. - // We'll need a storage.FilePutter too ... +// Protocol guarantees: +// - pW is always closed exactly once (CloseWithError(nil) == Close()). +// - if done != nil, exactly one value is sent (nil on success, non-nil on failure). +// - panics are converted into a non-nil error (and the panic is rethrown). +func runInputTarStreamGoroutine(outputRdr io.Reader, pW *io.PipeWriter, p storage.Packer, fp storage.FilePutter, done chan<- error) { + // Default to a non-nil error so a panic can't accidentally look like success. + err := errors.New("panic in runInputTarStream") + defer func() { + // CloseWithError(nil) is equivalent to Close(). + pW.CloseWithError(err) - // Another concern, whether to do any storage.FilePutter operations, such that we - // don't extract any amount of the archive. But then again, we're not making - // files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter. - // Perhaps we have a DiscardFilePutter that is a bit bucket. + if done != nil { + done <- err + } - // we'll return the pipe reader, since TeeReader does not buffer and will - // only read what the outputRdr Read's. Since Tar archives have padding on - // the end, we want to be the one reading the padding, even if the user's - // `archive/tar` doesn't care. - pR, pW := io.Pipe() - outputRdr := io.TeeReader(r, pW) + // Preserve panic semantics while still ensuring the protocol above runs. + if r := recover(); r != nil { + panic(r) + } + }() - // we need a putter that will generate the crc64 sums of file payloads - if fp == nil { - fp = storage.NewDiscardFilePutter() - } + err = runInputTarStream(outputRdr, p, fp) +} - go func() { - tr := tar.NewReader(outputRdr) - tr.RawAccounting = true - for { - hdr, err := tr.Next() - if err != nil { - if err != io.EOF { - pW.CloseWithError(err) - return - } - // even when an EOF is reached, there is often 1024 null bytes on - // the end of an archive. Collect them too. - if b := tr.RawBytes(); len(b) > 0 { - _, err := p.AddEntry(storage.Entry{ - Type: storage.SegmentType, - Payload: b, - }) - if err != nil { - pW.CloseWithError(err) - return - } - } - break // not return. We need the end of the reader. - } - if hdr == nil { - break // not return. We need the end of the reader. - } +// runInputTarStream drives tar-split parsing. +// +// It reads a tar stream from outputRdr and records tar-split metadata into the +// provided storage.Packer. +// +// Abort behavior: if the consumer closes the read end early, the tee reader will +// stop producing bytes (due to pipe write failure) and tar parsing will return +// an error. We propagate that error so the goroutine terminates promptly rather +// than draining the input stream for no benefit. +func runInputTarStream(outputRdr io.Reader, p storage.Packer, fp storage.FilePutter) error { + tr := tar.NewReader(outputRdr) + tr.RawAccounting = true + for { + hdr, err := tr.Next() + if err != nil { + if err != io.EOF { + return err + } + // Even when EOF is reached, there is often 1024 null bytes at the end + // of an archive. Collect them too. if b := tr.RawBytes(); len(b) > 0 { - _, err := p.AddEntry(storage.Entry{ + if _, err := p.AddEntry(storage.Entry{ Type: storage.SegmentType, Payload: b, - }) - if err != nil { - pW.CloseWithError(err) - return - } - } - - var csum []byte - if hdr.Size > 0 { - var err error - _, csum, err = fp.Put(hdr.Name, tr) - if err != nil { - pW.CloseWithError(err) - return + }); err != nil { + return err } } + break // Not return: we still need to drain any additional padding. + } + if hdr == nil { + break // Not return: we still need to drain any additional padding. + } - entry := storage.Entry{ - Type: storage.FileType, - Size: hdr.Size, - Payload: csum, + if b := tr.RawBytes(); len(b) > 0 { + if _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }); err != nil { + return err } - // For proper marshalling of non-utf8 characters - entry.SetName(hdr.Name) + } - // File entries added, regardless of size - _, err = p.AddEntry(entry) + var csum []byte + if hdr.Size > 0 { + _, csum, err = fp.Put(hdr.Name, tr) if err != nil { - pW.CloseWithError(err) - return + return err } + } - if b := tr.RawBytes(); len(b) > 0 { - _, err = p.AddEntry(storage.Entry{ - Type: storage.SegmentType, - Payload: b, - }) - if err != nil { - pW.CloseWithError(err) - return - } - } + entry := storage.Entry{ + Type: storage.FileType, + Size: hdr.Size, + Payload: csum, } + // For proper marshalling of non-utf8 characters + entry.SetName(hdr.Name) - // It is allowable, and not uncommon that there is further padding on - // the end of an archive, apart from the expected 1024 null bytes. We - // do this in chunks rather than in one go to avoid cases where a - // maliciously crafted tar file tries to trick us into reading many GBs - // into memory. - const paddingChunkSize = 1024 * 1024 - var paddingChunk [paddingChunkSize]byte - for { - var isEOF bool - n, err := outputRdr.Read(paddingChunk[:]) - if err != nil { - if err != io.EOF { - pW.CloseWithError(err) - return - } - isEOF = true + // File entries added, regardless of size + if _, err := p.AddEntry(entry); err != nil { + return err + } + + if b := tr.RawBytes(); len(b) > 0 { + if _, err := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: b, + }); err != nil { + return err } - if n != 0 { - _, err = p.AddEntry(storage.Entry{ - Type: storage.SegmentType, - Payload: paddingChunk[:n], - }) - if err != nil { - pW.CloseWithError(err) - return - } + } + } + + // It is allowable, and not uncommon that there is further padding on + // the end of an archive, apart from the expected 1024 null bytes. We + // do this in chunks rather than in one go to avoid cases where a + // maliciously crafted tar file tries to trick us into reading many GBs + // into memory. + const paddingChunkSize = 1024 * 1024 + var paddingChunk [paddingChunkSize]byte + for { + n, err := outputRdr.Read(paddingChunk[:]) + if n != 0 { + if _, aerr := p.AddEntry(storage.Entry{ + Type: storage.SegmentType, + Payload: paddingChunk[:n], + }); aerr != nil { + return aerr } - if isEOF { + } + if err != nil { + if err == io.EOF { break } + return err } - pW.Close() - }() + } + + return nil +} + +// newInputTarStreamCommon sets up the shared plumbing for NewInputTarStream and +// NewInputTarStreamWithDone. +// +// It constructs an io.Pipe and an io.TeeReader such that: +// +// - The caller reads tar bytes from the returned *io.PipeReader. +// - The background goroutine simultaneously reads the same stream from the +// TeeReader to perform tar-split parsing and metadata packing. +// +// Abort and synchronization semantics: +// +// - Closing the returned PipeReader causes the TeeReader to fail its write to +// the pipe, which in turn causes the background goroutine to exit promptly. +// - If withDone is true, a done channel is returned that receives exactly one +// error value (nil on success) once the background goroutine has fully +// terminated. This allows callers to safely wait until the input reader `r` +// is no longer in use. +func newInputTarStreamCommon( + r io.Reader, + p storage.Packer, + fp storage.FilePutter, + done chan<- error, +) (pr *io.PipeReader) { + // What to do here... folks will want their own access to the Reader that is + // their tar archive stream, but we'll need that same stream to use our + // forked 'archive/tar'. + // Perhaps do an io.TeeReader that hands back an io.Reader for them to read + // from, and we'll MITM the stream to store metadata. + // We'll need a storage.FilePutter too ... + + // Another concern, whether to do any storage.FilePutter operations, such that we + // don't extract any amount of the archive. But then again, we're not making + // files/directories, hardlinks, etc. Just writing the io to the storage.FilePutter. + // Perhaps we have a DiscardFilePutter that is a bit bucket. - return pR, nil + // we'll return the pipe reader, since TeeReader does not buffer and will + // only read what the outputRdr Read's. Since Tar archives have padding on + // the end, we want to be the one reading the padding, even if the user's + // `archive/tar` doesn't care. + pr, pw := io.Pipe() + + if fp == nil { + fp = storage.NewDiscardFilePutter() + } + + outputRdr := io.TeeReader(r, pw) + go runInputTarStreamGoroutine(outputRdr, pw, p, fp, done) + + return pr +} + +// NewInputTarStream wraps the Reader stream of a tar archive and provides a +// Reader stream of the same. +// +// In the middle it will pack the segments and file metadata to storage.Packer +// `p`. +// +// The storage.FilePutter is where payload of files in the stream are +// stashed. If this stashing is not needed, you can provide a nil +// storage.FilePutter. Since the checksumming is still needed, then a default +// of NewDiscardFilePutter will be used internally +// +// If callers need to be able to abort early and/or wait for goroutine termination, +// prefer NewInputTarStreamWithDone. +// +// Deprecated: This leaves a goroutine around if the consumer aborts without consuming +// the whole stream, and does not allow the caller to know when r is safe to deallocate +// or when p has written everything. Use NewInputTarStreamWithDone instead. +func NewInputTarStream(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.Reader, error) { + pr := newInputTarStreamCommon(r, p, fp, nil) + return pr, nil +} + +// NewInputTarStreamWithDone wraps the Reader stream of a tar archive and provides a +// Reader stream of the same. +// +// In the middle it will pack the segments and file metadata to storage.Packer `p`. +// +// It also returns a done channel that will receive exactly one error value +// (nil on success) when the internal goroutine has fully completed parsing +// the tar stream (including the final paddingChunk draining loop) and has +// finished writing all entries to `p`. +// +// The returned reader is an io.ReadCloser so callers can stop early; closing it +// aborts the pipe so the internal goroutine can terminate promptly (rather than +// hanging on a blocked pipe write). +// +// The caller is expected to consume the returned reader fully until EOF +// (not just the tar EOF marker); closing the returned reader earlier will +// cause the done channel to return a failure. +func NewInputTarStreamWithDone(r io.Reader, p storage.Packer, fp storage.FilePutter) (io.ReadCloser, <-chan error, error) { + done := make(chan error, 1) + pr := newInputTarStreamCommon(r, p, fp, done) + return pr, done, nil } diff --git a/vendor/github.com/vbatts/tar-split/tar/asm/iterate.go b/vendor/github.com/vbatts/tar-split/tar/asm/iterate.go index 8a65887cf..9db3ab509 100644 --- a/vendor/github.com/vbatts/tar-split/tar/asm/iterate.go +++ b/vendor/github.com/vbatts/tar-split/tar/asm/iterate.go @@ -11,7 +11,7 @@ import ( // IterateHeaders calls handler for each tar header provided by Unpacker func IterateHeaders(unpacker storage.Unpacker, handler func(hdr *tar.Header) error) error { - // We assume about NewInputTarStream: + // We assume about NewInputTarStreamWithDone: // - There is a separate SegmentType entry for every tar header, but only one SegmentType entry for the full header incl. any extensions // - (There is a FileType entry for every tar header, we ignore it) // - Trailing padding of a file, if any, is included in the next SegmentType entry diff --git a/vendor/modules.txt b/vendor/modules.txt index 079650e69..7c2dbbbeb 100644 --- a/vendor/modules.txt +++ b/vendor/modules.txt @@ -243,8 +243,8 @@ github.com/ulikunitz/xz github.com/ulikunitz/xz/internal/hash github.com/ulikunitz/xz/internal/xlog github.com/ulikunitz/xz/lzma -# github.com/vbatts/tar-split v0.12.2 -## explicit; go 1.17 +# github.com/vbatts/tar-split v0.12.3 +## explicit; go 1.22.0 github.com/vbatts/tar-split/archive/tar github.com/vbatts/tar-split/tar/asm github.com/vbatts/tar-split/tar/storage