@@ -0,0 +1,45 @@
# REQUIRED
# Kind can be one of:
# - breaking-change: a change to previously-documented behavior
# - deprecation: functionality that is being removed in a later release
# - bug-fix: fixes a problem in a previous version
# - enhancement: extends functionality but does not break or fix existing behavior
# - feature: new functionality
# - known-issue: problems that we are aware of in a given version
# - security: impacts on the security of a product or a user’s deployment.
# - upgrade: important information for someone upgrading from a prior version
# - other: does not fit into any of the other categories
kind: bug-fix

# REQUIRED for all kinds
# Change summary; an ~80-character description of the change.
summary: report crashing otel process cleanly with proper status reporting

# REQUIRED for breaking-change, deprecation, known-issue
# Long description; in case the summary is not enough to describe the change
# this field accommodates a description without length limits.
# description:

# REQUIRED for breaking-change, deprecation, known-issue
# impact:

# REQUIRED for breaking-change, deprecation, known-issue
# action:

# REQUIRED for all kinds
# Affected component; usually one of "elastic-agent", "fleet-server", "filebeat", "metricbeat", "auditbeat", "all", etc.
component:

# AUTOMATED
# OPTIONAL to manually add other PR URLs
# PR URL: a link to the PR that added the changeset.
# If not present, it is automatically filled by the tooling, which finds the PR where this changelog fragment was added.
# NOTE: the tooling supports backports, so it's able to fill in the original PR number instead of the backport PR number.
# Please provide it if you are adding a fragment for a different PR.
# pr: https://github.com/owner/repo/1234

# AUTOMATED
# OPTIONAL to manually add other issue URLs
# Issue URL; optional; the GitHub issue related to this changeset (one that it closes or is part of).
# If not present, it is automatically filled by the tooling with the issue linked to the PR number.
# issue: https://github.com/owner/repo/1234
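
For reference, a fully populated fragment of this shape might look like the sketch below; the `component` value is an illustrative assumption (this PR's fragment leaves it blank), and the URLs reuse the template's placeholders:

```yaml
# a hypothetical, fully populated changelog fragment (values are illustrative)
kind: bug-fix
summary: report crashing otel process cleanly with proper status reporting
component: elastic-agent  # assumed value, not taken from this PR
pr: https://github.com/owner/repo/1234     # normally filled by tooling
issue: https://github.com/owner/repo/1234  # normally filled by tooling
```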
134 changes: 134 additions & 0 deletions internal/pkg/otel/manager/common.go
@@ -9,8 +9,14 @@ import (
"errors"
"fmt"
"net"
"strings"

"github.com/open-telemetry/opentelemetry-collector-contrib/pkg/status"
"go.opentelemetry.io/collector/component"
"go.opentelemetry.io/collector/component/componentstatus"
"go.opentelemetry.io/collector/confmap"
"go.opentelemetry.io/collector/otelcol"
"go.opentelemetry.io/collector/pipeline"
)

// for testing purposes
@@ -80,3 +86,131 @@ func findRandomTCPPorts(count int) (ports []int, err error) {

return ports, err
}

// otelConfigToStatus converts `cfg` into a `status.AggregateStatus` using the reported error.
//
// The flow of this function comes from https://github.com/open-telemetry/opentelemetry-collector/blob/main/service/internal/graph/graph.go
// It is a much simpler version, but it follows the same for-loop ordering and connector construction as the internal
// graph system that the OTel Collector uses to build its component graph.
func otelConfigToStatus(cfg *confmap.Conf, err error) (*status.AggregateStatus, error) {
// unmarshal the raw confmap into an otelcol.Config
var c otelcol.Config
if unmarshalErr := cfg.Unmarshal(&c); unmarshalErr != nil {
return nil, fmt.Errorf("could not unmarshal config: %w", unmarshalErr)
}

// should at least define a single pipeline
if len(c.Service.Pipelines) == 0 {
return nil, fmt.Errorf("no pipelines defined")
}

// aggregators are used to create the overall status structure
// aggGeneric is used for a generic aggregate status where all instances get the same error
// aggSpecific is used to attribute the error to the specific instance that caused it
// aggSpecific is only used if matchOccurred is true
aggGeneric := status.NewAggregator(status.PriorityPermanent)
aggSpecific := status.NewAggregator(status.PriorityPermanent)
matchOccurred := false

// extensions
for _, id := range c.Service.Extensions {
instanceID := componentstatus.NewInstanceID(id, component.KindExtension)
aggGeneric.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
if recordSpecificErr(aggSpecific, instanceID, err) {
matchOccurred = true
}
}

// track connectors
connectors := make(map[component.ID]struct{})
connectorsAsReceiver := make(map[component.ID][]pipeline.ID)
connectorsAsExporter := make(map[component.ID][]pipeline.ID)

// pipelines
for pipelineID, pipelineCfg := range c.Service.Pipelines {
for _, recvID := range pipelineCfg.Receivers {
// the upstream graph creates a single component instance for a set of pipelines, then status reporting
// copies that instance for each pipeline; creating a unique instance per pipeline provides the same
// behavior.
instanceID := componentstatus.NewInstanceID(recvID, component.KindReceiver, pipelineID)
_, isConnector := c.Connectors[recvID]
if isConnector {
connectors[recvID] = struct{}{}
connectorsAsReceiver[recvID] = append(connectorsAsReceiver[recvID], pipelineID)
}
aggGeneric.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
if recordSpecificErr(aggSpecific, instanceID, err) {
matchOccurred = true
}
}
for _, procID := range pipelineCfg.Processors {
instanceID := componentstatus.NewInstanceID(procID, component.KindProcessor, pipelineID)
aggGeneric.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
if recordSpecificErr(aggSpecific, instanceID, err) {
matchOccurred = true
}
}
for _, exporterID := range pipelineCfg.Exporters {
instanceID := componentstatus.NewInstanceID(exporterID, component.KindExporter, pipelineID)
_, isConnector := c.Connectors[exporterID]
if isConnector {
connectors[exporterID] = struct{}{}
connectorsAsExporter[exporterID] = append(connectorsAsExporter[exporterID], pipelineID)
}
aggGeneric.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
if recordSpecificErr(aggSpecific, instanceID, err) {
matchOccurred = true
}
}
}

// connectors
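// the upstream graph expands each connector into one instance per (exporter-pipeline, receiver-pipeline)
// pair that it bridges, so the nested loops below mirror that n×m expansion when recording status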
for connID := range connectors {
extraMatchStr := fmt.Sprintf("connector %q used as", connID)
for _, eID := range connectorsAsExporter[connID] {
for _, rID := range connectorsAsReceiver[connID] {
instanceID := componentstatus.NewInstanceID(
connID, component.KindConnector, eID, rID,
)
aggGeneric.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
if recordSpecificErr(aggSpecific, instanceID, err, extraMatchStr) {
matchOccurred = true
}
}
}
}

if matchOccurred {
// specific for the matched error
aggStatus, _ := aggSpecific.AggregateStatus(status.ScopeAll, status.Verbose)
return aggStatus, nil
}
// no match found, so return the generic aggregate where every instance gets the same error
aggStatus, _ := aggGeneric.AggregateStatus(status.ScopeAll, status.Verbose)
return aggStatus, nil
}

func recordSpecificErr(agg *status.Aggregator, instanceID *componentstatus.InstanceID, err error, extraMatchStrs ...string) bool {
// matches configuration errors for a specific component
forIDStr := fmt.Sprintf("for id: %q", instanceID.ComponentID().String())
// occurs when a specific component fails to start
failedMatchStr := fmt.Sprintf("failed to start %q %s:", instanceID.ComponentID().String(), strings.ToLower(instanceID.Kind().String()))
// occurs when a component factory is not available (unknown component type)
factoryNotAvailableStr := fmt.Sprintf("factory not available for: %q", instanceID.ComponentID().String())
if strings.Contains(err.Error(), forIDStr) || strings.Contains(err.Error(), failedMatchStr) || strings.Contains(err.Error(), factoryNotAvailableStr) {
// specific so this instance gets the reported error
agg.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
return true
}
// extra matchers
for _, matchStr := range extraMatchStrs {
if strings.Contains(err.Error(), matchStr) {
// specific so this instance gets the reported error
agg.RecordStatus(instanceID, componentstatus.NewFatalErrorEvent(err))
return true
}
}
// not specific to this instance, so we record this one as starting
agg.RecordStatus(instanceID, componentstatus.NewEvent(componentstatus.StatusStarting))
return false
}
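
To make the flow above concrete, here is a minimal, test-style sketch of how `otelConfigToStatus` might be exercised from inside this package. The config map, the error text, and the `sketchStatusConversion` helper are illustrative assumptions, not part of this PR; the error string is shaped to contain the `failed to start "otlp" receiver:` pattern that `recordSpecificErr` matches, so the `otlp` receiver instance should carry the fatal error while every other instance is recorded as merely starting.

```go
package manager

import (
	"errors"
	"fmt"

	"go.opentelemetry.io/collector/confmap"
)

// sketchStatusConversion is a hypothetical walkthrough of otelConfigToStatus.
func sketchStatusConversion() error {
	// a minimal collector config with a single logs pipeline
	cfg := confmap.NewFromStringMap(map[string]any{
		"receivers": map[string]any{"otlp": map[string]any{}},
		"exporters": map[string]any{"debug": map[string]any{}},
		"service": map[string]any{
			"pipelines": map[string]any{
				"logs": map[string]any{
					"receivers": []any{"otlp"},
					"exporters": []any{"debug"},
				},
			},
		},
	})

	// an error shaped like a collector start failure for the otlp receiver;
	// it contains `failed to start "otlp" receiver:`, so recordSpecificErr
	// attributes it to that instance and matchOccurred becomes true
	startErr := errors.New(`failed to start "otlp" receiver: listen tcp :4317: address already in use`)

	aggStatus, err := otelConfigToStatus(cfg, startErr)
	if err != nil {
		return err
	}

	// walk the aggregate tree: the otlp receiver instance should report the
	// fatal error, while the debug exporter is only StatusStarting
	fmt.Println("overall:", aggStatus.Status())
	for scope, scopeStatus := range aggStatus.ComponentStatusMap {
		fmt.Println(scope, "->", scopeStatus.Status())
		for id, compStatus := range scopeStatus.ComponentStatusMap {
			fmt.Printf("  %s -> %s (err: %v)\n", id, compStatus.Status(), compStatus.Err())
		}
	}
	return nil
}
```

Because a match occurred, the function returns the `aggSpecific` tree, which pins the error on the single failing component instead of fanning the same fatal error out to every instance.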