diff --git a/docs/monitoring/metrics.md b/docs/monitoring/metrics.md index 2c8a0c944a..1accd98788 100644 --- a/docs/monitoring/metrics.md +++ b/docs/monitoring/metrics.md @@ -38,6 +38,7 @@ curl https://localhost:7979/metrics | endpoints_total | Gauge | registry | | Number of Endpoints in the registry | | errors_total | Counter | registry | | Number of Registry errors. | | records | Gauge | registry | record_type | Number of registry records partitioned by label name (vector). | +| skipped_records_label_too_long_total | Counter | registry | record_type, domain | Total number of records skipped because the projected TXT registry name has a DNS label exceeding RFC 1035's 63-char limit (vector). | | skipped_records_owner_mismatch_per_sync | Gauge | registry | record_type, owner, foreign_owner, domain | Number of records skipped with owner mismatch for each record type, owner mismatch ID and domain (vector). | | deduplicated_endpoints | Gauge | source | record_type, source_type | Number of endpoints currently removed as duplicates, partitioned by record type and source. | | endpoints_total | Gauge | source | | Number of Endpoints in all sources | diff --git a/docs/registry/txt.md b/docs/registry/txt.md index a05f51acd9..1f129a3154 100644 --- a/docs/registry/txt.md +++ b/docs/registry/txt.md @@ -103,6 +103,25 @@ the record type of the DNS record for which it is storing metadata. The prefix is specified using the `--txt-prefix` flag and the suffix is specified using the `--txt-suffix` flag. The two flags are mutually exclusive. +## Label length limits + +DNS labels are capped at 63 octets per RFC 1035. The TXT registry stores ownership at a +sibling name with a record-type prefix on the first label (e.g., `cname-foo.example.com` +for a CNAME `foo.example.com`). Any configured `--txt-prefix` or `--txt-suffix` further +extends that label. + +If any label in the projected TXT name exceeds 63 characters, external-dns skips both +the parent record and its ownership TXT. Creating the parent without a TXT would leave +an unmanageable record in the zone, since later reconciles could not identify it as +managed. + +Each skip is logged at error level and increments the +`external_dns_registry_skipped_records_label_too_long_total` counter, labeled by +`record_type` and apex `domain`. + +To resolve, shorten the source hostname's first label or trim `--txt-prefix`/`--txt-suffix`. +The inline record-type prefix alone consumes 2–6 characters; the longest is `cname-`. + ## Wildcard Replacement The `--txt-wildcard-replacement` flag specifies a string to use to replace the "\*" in diff --git a/endpoint/endpoint.go b/endpoint/endpoint.go index 438e53c956..df64b2f2c2 100644 --- a/endpoint/endpoint.go +++ b/endpoint/endpoint.go @@ -504,6 +504,37 @@ func FilterEndpointsByOwnerID(ownerID string, eps []*Endpoint) []*Endpoint { return filtered } +// FilterEndpointsByDNSCompliance drops endpoints whose projected registry +// name (from toRegistryName) has any DNS label over RFC 1035's 63-char limit. +// Such records cannot be owned by the registry and would orphan in the zone. +// onSkip, if non-nil, is invoked per drop with the offending label so callers +// can attach metrics or events without importing this package. +func FilterEndpointsByDNSCompliance(toRegistryName func(*Endpoint) string, eps []*Endpoint, onSkip func(skipped *Endpoint, badLabel string)) []*Endpoint { + filtered := make([]*Endpoint, 0, len(eps)) + for _, ep := range eps { + registryName := toRegistryName(ep) + if badLabel, ok := overflowingLabel(registryName); ok { + log.Errorf(`Skipping endpoint %s %s: projected registry name %q has label %q exceeding RFC 1035's 63-char limit; record would be unmanageable`, ep.RecordType, ep.DNSName, registryName, badLabel) + if onSkip != nil { + onSkip(ep, badLabel) + } + continue + } + filtered = append(filtered, ep) + } + return filtered +} + +// overflowingLabel returns the first label in name longer than 63 chars. +func overflowingLabel(name string) (string, bool) { + for label := range strings.SplitSeq(name, ".") { + if len(label) > 63 { + return label, true + } + } + return "", false +} + // RemoveDuplicates returns a slice holding the unique endpoints. // This function doesn't contemplate the Targets of an Endpoint // as part of the primary Key diff --git a/internal/gen/docs/metrics/main_test.go b/internal/gen/docs/metrics/main_test.go index e9a132e516..e5fb67222d 100644 --- a/internal/gen/docs/metrics/main_test.go +++ b/internal/gen/docs/metrics/main_test.go @@ -31,7 +31,7 @@ import ( const ( pathToDocs = "%s/../../../../docs/monitoring" - knownMetricsCount = 24 + knownMetricsCount = 25 ) func TestComputeMetrics(t *testing.T) { diff --git a/registry/txt/metrics.go b/registry/txt/metrics.go new file mode 100644 index 0000000000..fe9949a557 --- /dev/null +++ b/registry/txt/metrics.go @@ -0,0 +1,44 @@ +/* +Copyright 2026 The Kubernetes Authors. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. +*/ + +package txt + +import ( + "github.com/prometheus/client_golang/prometheus" + + "sigs.k8s.io/external-dns/endpoint" + "sigs.k8s.io/external-dns/pkg/metrics" +) + +// registrySkippedLabelTooLongTotal counts records dropped because the projected +// TXT name has a label exceeding the 63-char RFC 1035 limit. +var registrySkippedLabelTooLongTotal = metrics.NewCounterVecWithOpts( + prometheus.CounterOpts{ + Subsystem: "registry", + Name: "skipped_records_label_too_long_total", + Help: "Total number of records skipped because the projected TXT registry name has a DNS label exceeding RFC 1035's 63-char limit (vector).", + }, + []string{"record_type", "domain"}, +) + +func init() { + metrics.RegisterMetric.MustRegister(registrySkippedLabelTooLongTotal) +} + +// recordSkippedLabelTooLong is the FilterEndpointsByDNSCompliance skip callback. +func recordSkippedLabelTooLong(skipped *endpoint.Endpoint, _ string) { + registrySkippedLabelTooLongTotal.CounterVec.WithLabelValues(skipped.RecordType, skipped.GetNakedDomain()).Inc() +} diff --git a/registry/txt/registry.go b/registry/txt/registry.go index a52ec2940e..249001d76a 100644 --- a/registry/txt/registry.go +++ b/registry/txt/registry.go @@ -303,6 +303,16 @@ func (im *TXTRegistry) generateTXTRecord(r *endpoint.Endpoint) []*endpoint.Endpo return im.generateTXTRecordWithFilter(r, func(_ *endpoint.Endpoint) bool { return true }) } +// txtNameFor returns the projected TXT registry name for r, applying the +// alias-A → CNAME rewrite that generateTXTRecordWithFilter would also apply. +func (im *TXTRegistry) txtNameFor(r *endpoint.Endpoint) string { + recordType := r.RecordType + if isAlias, found := r.GetBoolProviderSpecificProperty(endpoint.ProviderSpecificAlias); found && isAlias && recordType == endpoint.RecordTypeA { + recordType = endpoint.RecordTypeCNAME + } + return im.mapper.ToTXTName(r.DNSName, recordType) +} + func (im *TXTRegistry) generateTXTRecordWithFilter(r *endpoint.Endpoint, filter func(*endpoint.Endpoint) bool) []*endpoint.Endpoint { endpoints := make([]*endpoint.Endpoint, 0) @@ -333,7 +343,7 @@ func (im *TXTRegistry) generateTXTRecordWithFilter(r *endpoint.Endpoint, filter // for each created/deleted record it will also take into account TXT records for creation/deletion func (im *TXTRegistry) ApplyChanges(ctx context.Context, changes *plan.Changes) error { filteredChanges := &plan.Changes{ - Create: changes.Create, + Create: endpoint.FilterEndpointsByDNSCompliance(im.txtNameFor, changes.Create, recordSkippedLabelTooLong), UpdateNew: endpoint.FilterEndpointsByOwnerID(im.ownerID, changes.UpdateNew), UpdateOld: endpoint.FilterEndpointsByOwnerID(im.ownerID, changes.UpdateOld), Delete: endpoint.FilterEndpointsByOwnerID(im.ownerID, changes.Delete), diff --git a/registry/txt/registry_test.go b/registry/txt/registry_test.go index 9a81f9ade8..2e528a66e3 100644 --- a/registry/txt/registry_test.go +++ b/registry/txt/registry_test.go @@ -24,6 +24,7 @@ import ( "testing" "time" + promtestutil "github.com/prometheus/client_golang/prometheus/testutil" "github.com/stretchr/testify/assert" "github.com/stretchr/testify/require" @@ -1138,6 +1139,119 @@ func testTXTRegistryApplyChangesNoPrefix(t *testing.T) { require.NoError(t, err) } +// TestTXTRegistryApplyChangesSkipsRecordWithoutOwnership verifies that the +// registry refuses to create a parent record whenever its ownership TXT cannot +// be produced. Such a parent would be unreclaimable by future reconciles and +// would leak in the zone. +// +// The subtests below exercise the only failure mode that exists today — the +// projected TXT name exceeding the RFC 1035 63-char label limit — across the +// record types and registry configurations whose differing TXT prefixes shift +// the overflow threshold. Any future failure path in TXT generation should +// hit the same guard. +func TestTXTRegistryApplyChangesSkipsRecordWithoutOwnership(t *testing.T) { + tests := []struct { + name string + txtPrefix string + newParent func(dnsName string) *endpoint.Endpoint + labelLen int + txtPrefx string // the prefix the mapper prepends, used to assert the TXT name is not in the change set + }{ + { + name: "CNAME", + newParent: func(dn string) *endpoint.Endpoint { + return endpoint.NewEndpoint(dn, endpoint.RecordTypeCNAME, "lb.example.com") + }, + labelLen: 60, // cname- (6) + 60 = 66 -> overflow + txtPrefx: "cname-", + }, + { + name: "AAAA", + newParent: func(dn string) *endpoint.Endpoint { + return endpoint.NewEndpoint(dn, endpoint.RecordTypeAAAA, "fe80::1") + }, + labelLen: 60, // aaaa- (5) + 60 = 65 -> overflow + txtPrefx: "aaaa-", + }, + { + name: "A", + newParent: func(dn string) *endpoint.Endpoint { return endpoint.NewEndpoint(dn, endpoint.RecordTypeA, "1.2.3.4") }, + labelLen: 62, // a- (2) + 62 = 64 -> overflow + txtPrefx: "a-", + }, + { + name: "AliasA", + newParent: func(dn string) *endpoint.Endpoint { + return endpoint.NewEndpoint(dn, endpoint.RecordTypeA, "lb.example.com").WithProviderSpecific(endpoint.ProviderSpecificAlias, "true") + }, + labelLen: 60, // alias-A is encoded as cname- (6) + 60 = 66 -> overflow + txtPrefx: "cname-", + }, + { + name: "WithTxtPrefix", + txtPrefix: "ext-", + newParent: func(dn string) *endpoint.Endpoint { + return endpoint.NewEndpoint(dn, endpoint.RecordTypeCNAME, "lb.example.com") + }, + labelLen: 56, // ext- (4) + cname- (6) + 56 = 66 -> overflow + txtPrefx: "ext-cname-", + }, + } + for _, tc := range tests { + t.Run(tc.name, func(t *testing.T) { + registrySkippedLabelTooLongTotal.CounterVec.Reset() + + ctx := t.Context() + p := inmemory.NewInMemoryProvider() + require.NoError(t, p.CreateZone(testZone)) + + r, err := newRegistry(p, tc.txtPrefix, "", "owner", time.Hour, "", []string{}, []string{}, false, nil, "") + require.NoError(t, err) + + overflowLabel := strings.Repeat("a", tc.labelLen) + overflow := tc.newParent(overflowLabel + "." + testZone) + require.NotNil(t, overflow, "test setup: parent record at %d-char label must itself pass RFC 1035", tc.labelLen) + fits := tc.newParent("ok." + testZone) + + hook := logtest.LogsUnderTestWithLogLevel(log.ErrorLevel, t) + + var got *plan.Changes + p.OnApplyChanges = func(_ context.Context, ch *plan.Changes) { + got = ch + } + require.NoError(t, r.ApplyChanges(ctx, &plan.Changes{ + Create: []*endpoint.Endpoint{overflow, fits}, + })) + + require.NotNil(t, got) + overflowTXT := tc.txtPrefx + overflow.DNSName + for _, ep := range got.Create { + assert.NotEqual(t, overflow.DNSName, ep.DNSName, "parent without ownership TXT must be dropped") + assert.NotEqual(t, overflowTXT, ep.DNSName, "TXT companion for dropped parent must not be created") + } + + fitsParent := false + fitsTXT := false + for _, ep := range got.Create { + if ep.DNSName == fits.DNSName && ep.RecordType == fits.RecordType { + fitsParent = true + } + if ep.DNSName == tc.txtPrefx+fits.DNSName && ep.RecordType == endpoint.RecordTypeTXT { + fitsTXT = true + } + } + assert.True(t, fitsParent, "fits-fine parent must still be created") + assert.True(t, fitsTXT, "fits-fine TXT companion must still be created") + + logtest.TestHelperLogContainsWithLogLevel("exceeding RFC 1035's 63-char limit", log.ErrorLevel, hook, t) + assert.InDelta(t, 1.0, + promtestutil.ToFloat64(registrySkippedLabelTooLongTotal.CounterVec.WithLabelValues(overflow.RecordType, overflow.GetNakedDomain())), + 0, + "counter must record one skip for the dropped parent") + }) + } +} + func testTXTRegistryMissingRecords(t *testing.T) { t.Run("No prefix", testTXTRegistryMissingRecordsNoPrefix) t.Run("With Prefix", testTXTRegistryMissingRecordsWithPrefix)