Source file src/cmd/vendor/golang.org/x/telemetry/internal/crashmonitor/monitor.go

     1  // Copyright 2024 The Go Authors. All rights reserved.
     2  // Use of this source code is governed by a BSD-style
     3  // license that can be found in the LICENSE file.
     4  
     5  package crashmonitor
     6  
     7  // This file defines a monitor that reports arbitrary Go runtime
     8  // crashes to telemetry.
     9  
    10  import (
    11  	"bytes"
    12  	"fmt"
    13  	"io"
    14  	"log"
    15  	"os"
    16  	"reflect"
    17  	"runtime/debug"
    18  	"strconv"
    19  	"strings"
    20  
    21  	"golang.org/x/telemetry/internal/counter"
    22  )
    23  
    24  // Parent sets up the parent side of the crashmonitor. It requires
    25  // exclusive use of a writable pipe connected to the child process's stdin.
    26  func Parent(pipe *os.File) {
    27  	writeSentinel(pipe)
    28  	// Ensure that we get pc=0x%x values in the traceback.
    29  	debug.SetTraceback("system")
    30  	debug.SetCrashOutput(pipe, debug.CrashOptions{}) // ignore error
    31  }
    32  
    33  // Child runs the part of the crashmonitor that runs in the child process.
    34  // It expects its stdin to be connected via a pipe to the parent which has
    35  // run Parent.
    36  func Child() {
    37  	// Wait for parent process's dying gasp.
    38  	// If the parent dies for any reason this read will return.
    39  	data, err := io.ReadAll(os.Stdin)
    40  	if err != nil {
    41  		log.Fatalf("failed to read from input pipe: %v", err)
    42  	}
    43  
    44  	// If the only line is the sentinel, it wasn't a crash.
    45  	if bytes.Count(data, []byte("\n")) < 2 {
    46  		childExitHook()
    47  		os.Exit(0) // parent exited without crash report
    48  	}
    49  
    50  	log.Printf("parent reported crash:\n%s", data)
    51  
    52  	// Parse the stack out of the crash report
    53  	// and record a telemetry count for it.
    54  	name, err := telemetryCounterName(data)
    55  	if err != nil {
    56  		// Keep count of how often this happens
    57  		// so that we can investigate if necessary.
    58  		incrementCounter("crash/malformed")
    59  
    60  		// Something went wrong.
    61  		// Save the crash securely in the file system.
    62  		f, err := os.CreateTemp(os.TempDir(), "*.crash")
    63  		if err != nil {
    64  			log.Fatal(err)
    65  		}
    66  		if _, err := f.Write(data); err != nil {
    67  			log.Fatal(err)
    68  		}
    69  		if err := f.Close(); err != nil {
    70  			log.Fatal(err)
    71  		}
    72  		log.Printf("failed to report crash to telemetry: %v", err)
    73  		log.Fatalf("crash report saved at %s", f.Name())
    74  	}
    75  
    76  	incrementCounter(name)
    77  
    78  	childExitHook()
    79  	log.Fatalf("telemetry crash recorded")
    80  }
    81  
    82  // (stubbed by test)
    83  var (
    84  	incrementCounter = func(name string) { counter.New(name).Inc() }
    85  	childExitHook    = func() {}
    86  )
    87  
    88  // The sentinel function returns its address. The difference between
    89  // this value as observed by calls in two different processes of the
    90  // same executable tells us the relative offset of their text segments.
    91  //
    92  // It would be nice if SetCrashOutput took care of this as it's fiddly
    93  // and likely to confuse every user at first.
    94  func sentinel() uint64 {
    95  	return uint64(reflect.ValueOf(sentinel).Pointer())
    96  }
    97  
    98  func writeSentinel(out io.Writer) {
    99  	fmt.Fprintf(out, "sentinel %x\n", sentinel())
   100  }
   101  
   102  // telemetryCounterName parses a crash report produced by the Go
   103  // runtime, extracts the stack of the first runnable goroutine,
   104  // converts each line into telemetry form ("symbol:relative-line"),
   105  // and returns this as the name of a counter.
   106  func telemetryCounterName(crash []byte) (string, error) {
   107  	pcs, err := parseStackPCs(string(crash))
   108  	if err != nil {
   109  		return "", err
   110  	}
   111  
   112  	// Limit the number of frames we request.
   113  	pcs = pcs[:min(len(pcs), 16)]
   114  
   115  	if len(pcs) == 0 {
   116  		// This can occur if all goroutines are idle, as when
   117  		// caught in a deadlock, or killed by an async signal
   118  		// while blocked.
   119  		//
   120  		// TODO(adonovan): consider how to report such
   121  		// situations. Reporting a goroutine in [sleep] or
   122  		// [select] state could be quite confusing without
   123  		// further information about the nature of the crash,
   124  		// as the problem is not local to the code location.
   125  		//
   126  		// For now, we keep count of this situation so that we
   127  		// can access whether it needs a more involved solution.
   128  		return "crash/no-running-goroutine", nil
   129  	}
   130  
   131  	// This string appears at the start of all
   132  	// crashmonitor-generated counter names.
   133  	//
   134  	// It is tempting to expose this as a parameter of Start, but
   135  	// it is not without risk. What value should most programs
   136  	// provide? There's no point giving the name of the executable
   137  	// as this is already recorded by telemetry. What if the
   138  	// application runs in multiple modes? Then it might be useful
   139  	// to record the mode. The problem is that an application with
   140  	// multiple modes probably doesn't know its mode by line 1 of
   141  	// main.main: it might require flag or argument parsing, or
   142  	// even validation of an environment variable, and we really
   143  	// want to steer users aware from any logic before Start. The
   144  	// flags and arguments will be wrong in the child process, and
   145  	// every extra conditional branch creates a risk that the
   146  	// recursively executed child program will behave not like the
   147  	// monitor but like the application. If the child process
   148  	// exits before calling Start, then the parent application
   149  	// will not have a monitor, and its crash reports will be
   150  	// discarded (written in to a pipe that is never read).
   151  	//
   152  	// So for now, we use this constant string.
   153  	const prefix = "crash/crash"
   154  	return counter.EncodeStack(pcs, prefix), nil
   155  }
   156  
   157  // parseStackPCs parses the parent process's program counters for the
   158  // first running goroutine out of a GOTRACEBACK=system traceback,
   159  // adjusting them so that they are valid for the child process's text
   160  // segment.
   161  //
   162  // This function returns only program counter values, ensuring that
   163  // there is no possibility of strings from the crash report (which may
   164  // contain PII) leaking into the telemetry system.
   165  func parseStackPCs(crash string) ([]uintptr, error) {
   166  	// getSymbol parses the symbol name out of a line of the form:
   167  	// SYMBOL(ARGS)
   168  	//
   169  	// Note: SYMBOL may contain parens "pkg.(*T).method". However, type
   170  	// parameters are always replaced with ..., so they cannot introduce
   171  	// more parens. e.g., "pkg.(*T[...]).method".
   172  	//
   173  	// ARGS can contain parens. We want the first paren that is not
   174  	// immediately preceded by a ".".
   175  	//
   176  	// TODO(prattmic): This is mildly complicated and is only used to find
   177  	// runtime.sigpanic, so perhaps simplify this by checking explicitly
   178  	// for sigpanic.
   179  	getSymbol := func(line string) (string, error) {
   180  		var prev rune
   181  		for i, c := range line {
   182  			if line[i] != '(' {
   183  				prev = c
   184  				continue
   185  			}
   186  			if prev == '.' {
   187  				prev = c
   188  				continue
   189  			}
   190  			return line[:i], nil
   191  		}
   192  		return "", fmt.Errorf("no symbol for stack frame: %s", line)
   193  	}
   194  
   195  	// getPC parses the PC out of a line of the form:
   196  	//     \tFILE:LINE +0xRELPC sp=... fp=... pc=...
   197  	getPC := func(line string) (uint64, error) {
   198  		_, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x
   199  		if !ok {
   200  			return 0, fmt.Errorf("no pc= for stack frame: %s", line)
   201  		}
   202  		return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix
   203  	}
   204  
   205  	var (
   206  		pcs            []uintptr
   207  		parentSentinel uint64
   208  		childSentinel  = sentinel()
   209  		on             = false // are we in the first running goroutine?
   210  		lines          = strings.Split(crash, "\n")
   211  		symLine        = true // within a goroutine, every other line is a symbol or file/line/pc location, starting with symbol.
   212  		currSymbol     string
   213  		prevSymbol     string // symbol of the most recent previous frame with a PC.
   214  	)
   215  	for i := 0; i < len(lines); i++ {
   216  		line := lines[i]
   217  
   218  		// Read sentinel value.
   219  		if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") {
   220  			_, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel)
   221  			if err != nil {
   222  				return nil, fmt.Errorf("can't read sentinel line")
   223  			}
   224  			continue
   225  		}
   226  
   227  		// Search for "goroutine GID [STATUS]"
   228  		if !on {
   229  			if strings.HasPrefix(line, "goroutine ") &&
   230  				strings.Contains(line, " [running]:") {
   231  				on = true
   232  
   233  				if parentSentinel == 0 {
   234  					return nil, fmt.Errorf("no sentinel value in crash report")
   235  				}
   236  			}
   237  			continue
   238  		}
   239  
   240  		// A blank line marks end of a goroutine stack.
   241  		if line == "" {
   242  			break
   243  		}
   244  
   245  		// Skip the final "created by SYMBOL in goroutine GID" part.
   246  		if strings.HasPrefix(line, "created by ") {
   247  			break
   248  		}
   249  
   250  		// Expect a pair of lines:
   251  		//   SYMBOL(ARGS)
   252  		//   \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x
   253  		// Note: SYMBOL may contain parens "pkg.(*T).method"
   254  		// The RELPC is sometimes missing.
   255  
   256  		if symLine {
   257  			var err error
   258  			currSymbol, err = getSymbol(line)
   259  			if err != nil {
   260  				return nil, fmt.Errorf("error extracting symbol: %v", err)
   261  			}
   262  
   263  			symLine = false // Next line is FILE:LINE.
   264  		} else {
   265  			// Parse the PC, and correct for the parent and child's
   266  			// different mappings of the text section.
   267  			pc, err := getPC(line)
   268  			if err != nil {
   269  				// Inlined frame, perhaps; skip it.
   270  
   271  				// Done with this frame. Next line is a new frame.
   272  				//
   273  				// Don't update prevSymbol; we only want to
   274  				// track frames with a PC.
   275  				currSymbol = ""
   276  				symLine = true
   277  				continue
   278  			}
   279  
   280  			pc = pc - parentSentinel + childSentinel
   281  
   282  			// If the previous frame was sigpanic, then this frame
   283  			// was a trap (e.g., SIGSEGV).
   284  			//
   285  			// Typically all middle frames are calls, and report
   286  			// the "return PC". That is, the instruction following
   287  			// the CALL where the callee will eventually return to.
   288  			//
   289  			// runtime.CallersFrames is aware of this property and
   290  			// will decrement each PC by 1 to "back up" to the
   291  			// location of the CALL, which is the actual line
   292  			// number the user expects.
   293  			//
   294  			// This does not work for traps, as a trap is not a
   295  			// call, so the reported PC is not the return PC, but
   296  			// the actual PC of the trap.
   297  			//
   298  			// runtime.Callers is aware of this and will
   299  			// intentionally increment trap PCs in order to correct
   300  			// for the decrement performed by
   301  			// runtime.CallersFrames. See runtime.tracebackPCs and
   302  			// runtume.(*unwinder).symPC.
   303  			//
   304  			// We must emulate the same behavior, otherwise we will
   305  			// report the location of the instruction immediately
   306  			// prior to the trap, which may be on a different line,
   307  			// or even a different inlined functions.
   308  			//
   309  			// TODO(prattmic): The runtime applies the same trap
   310  			// behavior for other "injected calls", see injectCall
   311  			// in runtime.(*unwinder).next. Do we want to handle
   312  			// those as well? I don't believe we'd ever see
   313  			// runtime.asyncPreempt or runtime.debugCallV2 in a
   314  			// typical crash.
   315  			if prevSymbol == "runtime.sigpanic" {
   316  				pc++
   317  			}
   318  
   319  			pcs = append(pcs, uintptr(pc))
   320  
   321  			// Done with this frame. Next line is a new frame.
   322  			prevSymbol = currSymbol
   323  			currSymbol = ""
   324  			symLine = true
   325  		}
   326  	}
   327  	return pcs, nil
   328  }
   329  
   330  func min(x, y int) int {
   331  	if x < y {
   332  		return x
   333  	} else {
   334  		return y
   335  	}
   336  }
   337  

View as plain text