Source file src/cmd/vendor/golang.org/x/telemetry/internal/crashmonitor/monitor.go
1 // Copyright 2024 The Go Authors. All rights reserved. 2 // Use of this source code is governed by a BSD-style 3 // license that can be found in the LICENSE file. 4 5 package crashmonitor 6 7 // This file defines a monitor that reports arbitrary Go runtime 8 // crashes to telemetry. 9 10 import ( 11 "bytes" 12 "fmt" 13 "io" 14 "log" 15 "os" 16 "reflect" 17 "runtime/debug" 18 "strconv" 19 "strings" 20 21 "golang.org/x/telemetry/internal/counter" 22 ) 23 24 // Parent sets up the parent side of the crashmonitor. It requires 25 // exclusive use of a writable pipe connected to the child process's stdin. 26 func Parent(pipe *os.File) { 27 writeSentinel(pipe) 28 // Ensure that we get pc=0x%x values in the traceback. 29 debug.SetTraceback("system") 30 debug.SetCrashOutput(pipe, debug.CrashOptions{}) // ignore error 31 } 32 33 // Child runs the part of the crashmonitor that runs in the child process. 34 // It expects its stdin to be connected via a pipe to the parent which has 35 // run Parent. 36 func Child() { 37 // Wait for parent process's dying gasp. 38 // If the parent dies for any reason this read will return. 39 data, err := io.ReadAll(os.Stdin) 40 if err != nil { 41 log.Fatalf("failed to read from input pipe: %v", err) 42 } 43 44 // If the only line is the sentinel, it wasn't a crash. 45 if bytes.Count(data, []byte("\n")) < 2 { 46 childExitHook() 47 os.Exit(0) // parent exited without crash report 48 } 49 50 log.Printf("parent reported crash:\n%s", data) 51 52 // Parse the stack out of the crash report 53 // and record a telemetry count for it. 54 name, err := telemetryCounterName(data) 55 if err != nil { 56 // Keep count of how often this happens 57 // so that we can investigate if necessary. 58 incrementCounter("crash/malformed") 59 60 // Something went wrong. 61 // Save the crash securely in the file system. 62 f, err := os.CreateTemp(os.TempDir(), "*.crash") 63 if err != nil { 64 log.Fatal(err) 65 } 66 if _, err := f.Write(data); err != nil { 67 log.Fatal(err) 68 } 69 if err := f.Close(); err != nil { 70 log.Fatal(err) 71 } 72 log.Printf("failed to report crash to telemetry: %v", err) 73 log.Fatalf("crash report saved at %s", f.Name()) 74 } 75 76 incrementCounter(name) 77 78 childExitHook() 79 log.Fatalf("telemetry crash recorded") 80 } 81 82 // (stubbed by test) 83 var ( 84 incrementCounter = func(name string) { counter.New(name).Inc() } 85 childExitHook = func() {} 86 ) 87 88 // The sentinel function returns its address. The difference between 89 // this value as observed by calls in two different processes of the 90 // same executable tells us the relative offset of their text segments. 91 // 92 // It would be nice if SetCrashOutput took care of this as it's fiddly 93 // and likely to confuse every user at first. 94 func sentinel() uint64 { 95 return uint64(reflect.ValueOf(sentinel).Pointer()) 96 } 97 98 func writeSentinel(out io.Writer) { 99 fmt.Fprintf(out, "sentinel %x\n", sentinel()) 100 } 101 102 // telemetryCounterName parses a crash report produced by the Go 103 // runtime, extracts the stack of the first runnable goroutine, 104 // converts each line into telemetry form ("symbol:relative-line"), 105 // and returns this as the name of a counter. 106 func telemetryCounterName(crash []byte) (string, error) { 107 pcs, err := parseStackPCs(string(crash)) 108 if err != nil { 109 return "", err 110 } 111 112 // Limit the number of frames we request. 113 pcs = pcs[:min(len(pcs), 16)] 114 115 if len(pcs) == 0 { 116 // This can occur if all goroutines are idle, as when 117 // caught in a deadlock, or killed by an async signal 118 // while blocked. 119 // 120 // TODO(adonovan): consider how to report such 121 // situations. Reporting a goroutine in [sleep] or 122 // [select] state could be quite confusing without 123 // further information about the nature of the crash, 124 // as the problem is not local to the code location. 125 // 126 // For now, we keep count of this situation so that we 127 // can access whether it needs a more involved solution. 128 return "crash/no-running-goroutine", nil 129 } 130 131 // This string appears at the start of all 132 // crashmonitor-generated counter names. 133 // 134 // It is tempting to expose this as a parameter of Start, but 135 // it is not without risk. What value should most programs 136 // provide? There's no point giving the name of the executable 137 // as this is already recorded by telemetry. What if the 138 // application runs in multiple modes? Then it might be useful 139 // to record the mode. The problem is that an application with 140 // multiple modes probably doesn't know its mode by line 1 of 141 // main.main: it might require flag or argument parsing, or 142 // even validation of an environment variable, and we really 143 // want to steer users aware from any logic before Start. The 144 // flags and arguments will be wrong in the child process, and 145 // every extra conditional branch creates a risk that the 146 // recursively executed child program will behave not like the 147 // monitor but like the application. If the child process 148 // exits before calling Start, then the parent application 149 // will not have a monitor, and its crash reports will be 150 // discarded (written in to a pipe that is never read). 151 // 152 // So for now, we use this constant string. 153 const prefix = "crash/crash" 154 return counter.EncodeStack(pcs, prefix), nil 155 } 156 157 // parseStackPCs parses the parent process's program counters for the 158 // first running goroutine out of a GOTRACEBACK=system traceback, 159 // adjusting them so that they are valid for the child process's text 160 // segment. 161 // 162 // This function returns only program counter values, ensuring that 163 // there is no possibility of strings from the crash report (which may 164 // contain PII) leaking into the telemetry system. 165 func parseStackPCs(crash string) ([]uintptr, error) { 166 // getSymbol parses the symbol name out of a line of the form: 167 // SYMBOL(ARGS) 168 // 169 // Note: SYMBOL may contain parens "pkg.(*T).method". However, type 170 // parameters are always replaced with ..., so they cannot introduce 171 // more parens. e.g., "pkg.(*T[...]).method". 172 // 173 // ARGS can contain parens. We want the first paren that is not 174 // immediately preceded by a ".". 175 // 176 // TODO(prattmic): This is mildly complicated and is only used to find 177 // runtime.sigpanic, so perhaps simplify this by checking explicitly 178 // for sigpanic. 179 getSymbol := func(line string) (string, error) { 180 var prev rune 181 for i, c := range line { 182 if line[i] != '(' { 183 prev = c 184 continue 185 } 186 if prev == '.' { 187 prev = c 188 continue 189 } 190 return line[:i], nil 191 } 192 return "", fmt.Errorf("no symbol for stack frame: %s", line) 193 } 194 195 // getPC parses the PC out of a line of the form: 196 // \tFILE:LINE +0xRELPC sp=... fp=... pc=... 197 getPC := func(line string) (uint64, error) { 198 _, pcstr, ok := strings.Cut(line, " pc=") // e.g. pc=0x%x 199 if !ok { 200 return 0, fmt.Errorf("no pc= for stack frame: %s", line) 201 } 202 return strconv.ParseUint(pcstr, 0, 64) // 0 => allow 0x prefix 203 } 204 205 var ( 206 pcs []uintptr 207 parentSentinel uint64 208 childSentinel = sentinel() 209 on = false // are we in the first running goroutine? 210 lines = strings.Split(crash, "\n") 211 symLine = true // within a goroutine, every other line is a symbol or file/line/pc location, starting with symbol. 212 currSymbol string 213 prevSymbol string // symbol of the most recent previous frame with a PC. 214 ) 215 for i := 0; i < len(lines); i++ { 216 line := lines[i] 217 218 // Read sentinel value. 219 if parentSentinel == 0 && strings.HasPrefix(line, "sentinel ") { 220 _, err := fmt.Sscanf(line, "sentinel %x", &parentSentinel) 221 if err != nil { 222 return nil, fmt.Errorf("can't read sentinel line") 223 } 224 continue 225 } 226 227 // Search for "goroutine GID [STATUS]" 228 if !on { 229 if strings.HasPrefix(line, "goroutine ") && 230 strings.Contains(line, " [running]:") { 231 on = true 232 233 if parentSentinel == 0 { 234 return nil, fmt.Errorf("no sentinel value in crash report") 235 } 236 } 237 continue 238 } 239 240 // A blank line marks end of a goroutine stack. 241 if line == "" { 242 break 243 } 244 245 // Skip the final "created by SYMBOL in goroutine GID" part. 246 if strings.HasPrefix(line, "created by ") { 247 break 248 } 249 250 // Expect a pair of lines: 251 // SYMBOL(ARGS) 252 // \tFILE:LINE +0xRELPC sp=0x%x fp=0x%x pc=0x%x 253 // Note: SYMBOL may contain parens "pkg.(*T).method" 254 // The RELPC is sometimes missing. 255 256 if symLine { 257 var err error 258 currSymbol, err = getSymbol(line) 259 if err != nil { 260 return nil, fmt.Errorf("error extracting symbol: %v", err) 261 } 262 263 symLine = false // Next line is FILE:LINE. 264 } else { 265 // Parse the PC, and correct for the parent and child's 266 // different mappings of the text section. 267 pc, err := getPC(line) 268 if err != nil { 269 // Inlined frame, perhaps; skip it. 270 271 // Done with this frame. Next line is a new frame. 272 // 273 // Don't update prevSymbol; we only want to 274 // track frames with a PC. 275 currSymbol = "" 276 symLine = true 277 continue 278 } 279 280 pc = pc - parentSentinel + childSentinel 281 282 // If the previous frame was sigpanic, then this frame 283 // was a trap (e.g., SIGSEGV). 284 // 285 // Typically all middle frames are calls, and report 286 // the "return PC". That is, the instruction following 287 // the CALL where the callee will eventually return to. 288 // 289 // runtime.CallersFrames is aware of this property and 290 // will decrement each PC by 1 to "back up" to the 291 // location of the CALL, which is the actual line 292 // number the user expects. 293 // 294 // This does not work for traps, as a trap is not a 295 // call, so the reported PC is not the return PC, but 296 // the actual PC of the trap. 297 // 298 // runtime.Callers is aware of this and will 299 // intentionally increment trap PCs in order to correct 300 // for the decrement performed by 301 // runtime.CallersFrames. See runtime.tracebackPCs and 302 // runtume.(*unwinder).symPC. 303 // 304 // We must emulate the same behavior, otherwise we will 305 // report the location of the instruction immediately 306 // prior to the trap, which may be on a different line, 307 // or even a different inlined functions. 308 // 309 // TODO(prattmic): The runtime applies the same trap 310 // behavior for other "injected calls", see injectCall 311 // in runtime.(*unwinder).next. Do we want to handle 312 // those as well? I don't believe we'd ever see 313 // runtime.asyncPreempt or runtime.debugCallV2 in a 314 // typical crash. 315 if prevSymbol == "runtime.sigpanic" { 316 pc++ 317 } 318 319 pcs = append(pcs, uintptr(pc)) 320 321 // Done with this frame. Next line is a new frame. 322 prevSymbol = currSymbol 323 currSymbol = "" 324 symLine = true 325 } 326 } 327 return pcs, nil 328 } 329 330 func min(x, y int) int { 331 if x < y { 332 return x 333 } else { 334 return y 335 } 336 } 337