1
0
Fork 0
mirror of https://codeberg.org/forgejo/forgejo.git synced 2024-12-23 12:59:11 -05:00
forgejo/modules/web/routing/logger_manager.go
zeripath c88547ce71
Add Goroutine stack inspector to admin/monitor (#19207)
Continues on from #19202.

Following the addition of pprof labels we can now more easily understand the relationship between a goroutine and the requests that spawn them. 

This PR takes advantage of the labels and adds a few others, then provides a mechanism for the monitoring page to query the pprof goroutine profile.

The binary profile that results from this profile is immediately piped in to the google library for parsing this and then stack traces are formed for the goroutines.

If the goroutine is within a context or has been created from a goroutine within a process context it will acquire the process description labels for that process. 

The goroutines are mapped with there associate pids and any that do not have an associated pid are placed in a group at the bottom as unbound.

In this way we should be able to more easily examine goroutines that have been stuck.

A manager command `gitea manager processes` is also provided that can export the processes (with or without stacktraces) to the command line.

Signed-off-by: Andrew Thornton <art27@cantab.net>
2022-03-31 19:01:43 +02:00

125 lines
3.4 KiB
Go

// Copyright 2021 The Gitea Authors. All rights reserved.
// Use of this source code is governed by a MIT-style
// license that can be found in the LICENSE file.
package routing
import (
"context"
"net/http"
"sync"
"time"
"code.gitea.io/gitea/modules/graceful"
"code.gitea.io/gitea/modules/process"
)
// Event indicates when the printer is triggered
type Event int
const (
// StartEvent at the beginning of a request
StartEvent Event = iota
// StillExecutingEvent the request is still executing
StillExecutingEvent
// EndEvent the request has ended (either completed or failed)
EndEvent
)
// Printer is used to output the log for a request
type Printer func(trigger Event, record *requestRecord)
type requestRecordsManager struct {
print Printer
lock sync.Mutex
requestRecords map[uint64]*requestRecord
count uint64
}
func (manager *requestRecordsManager) startSlowQueryDetector(threshold time.Duration) {
go graceful.GetManager().RunWithShutdownContext(func(ctx context.Context) {
ctx, _, finished := process.GetManager().AddTypedContext(ctx, "Service: SlowQueryDetector", process.SystemProcessType, true)
defer finished()
// This go-routine checks all active requests every second.
// If a request has been running for a long time (eg: /user/events), we also print a log with "still-executing" message
// After the "still-executing" log is printed, the record will be removed from the map to prevent from duplicated logs in future
// We do not care about accurate duration here. It just does the check periodically, 0.5s or 1.5s are all OK.
t := time.NewTicker(time.Second)
for {
select {
case <-ctx.Done():
return
case <-t.C:
now := time.Now()
var slowRequests []*requestRecord
// find all slow requests with lock
manager.lock.Lock()
for index, record := range manager.requestRecords {
if now.Sub(record.startTime) < threshold {
continue
}
slowRequests = append(slowRequests, record)
delete(manager.requestRecords, index)
}
manager.lock.Unlock()
// print logs for slow requests
for _, record := range slowRequests {
manager.print(StillExecutingEvent, record)
}
}
}
})
}
func (manager *requestRecordsManager) handler(next http.Handler) http.Handler {
return http.HandlerFunc(func(w http.ResponseWriter, req *http.Request) {
record := &requestRecord{
startTime: time.Now(),
request: req,
responseWriter: w,
}
// generate a record index an insert into the map
manager.lock.Lock()
record.index = manager.count
manager.count++
manager.requestRecords[record.index] = record
manager.lock.Unlock()
defer func() {
// just in case there is a panic. now the panics are all recovered in middleware.go
localPanicErr := recover()
if localPanicErr != nil {
record.lock.Lock()
record.panicError = localPanicErr
record.lock.Unlock()
}
// remove from the record map
manager.lock.Lock()
delete(manager.requestRecords, record.index)
manager.lock.Unlock()
// log the end of the request
manager.print(EndEvent, record)
if localPanicErr != nil {
// the panic wasn't recovered before us, so we should pass it up, and let the framework handle the panic error
panic(localPanicErr)
}
}()
req = req.WithContext(context.WithValue(req.Context(), contextKey, record))
manager.print(StartEvent, record)
next.ServeHTTP(w, req)
})
}