|
5 | 5 | package cmd
|
6 | 6 |
|
7 | 7 | import (
|
| 8 | + "context" |
| 9 | + "fmt" |
| 10 | + "io" |
8 | 11 | "os"
|
9 | 12 | "os/exec"
|
10 | 13 | "os/signal"
|
11 | 14 | "strings"
|
| 15 | + "sync" |
12 | 16 | "syscall"
|
| 17 | + "time" |
13 | 18 |
|
14 | 19 | "github.com/gitpod-io/gitpod/common-go/log"
|
| 20 | + "github.com/gitpod-io/gitpod/common-go/process" |
| 21 | + "github.com/gitpod-io/gitpod/supervisor/pkg/supervisor" |
| 22 | + "github.com/prometheus/procfs" |
| 23 | + reaper "github.com/ramr/go-reaper" |
15 | 24 | "github.com/spf13/cobra"
|
16 |
| - "golang.org/x/sys/unix" |
17 | 25 | )
|
18 | 26 |
|
19 | 27 | var initCmd = &cobra.Command{
|
20 | 28 | Use: "init",
|
21 | 29 | Short: "init the supervisor",
|
22 | 30 | Run: func(cmd *cobra.Command, args []string) {
|
23 | 31 | log.Init(ServiceName, Version, true, false)
|
24 |
| - // Because we're reaping with PID -1, we'll catch the child process for |
25 |
| - // which we've missed the notification anyways. |
| 32 | + cfg, err := supervisor.GetConfig() |
| 33 | + if err != nil { |
| 34 | + log.WithError(err).Info("cannnot load config") |
| 35 | + } |
26 | 36 | var (
|
27 |
| - sigInput = make(chan os.Signal, 1) |
28 |
| - sigReaper = make(chan os.Signal, 1) |
29 |
| - sigSupervisor = make(chan os.Signal, 1) |
| 37 | + sigInput = make(chan os.Signal, 1) |
30 | 38 | )
|
31 |
| - signal.Notify(sigInput, syscall.SIGCHLD, os.Interrupt, syscall.SIGTERM) |
32 |
| - go func() { |
33 |
| - for s := range sigInput { |
34 |
| - switch s { |
35 |
| - default: |
36 |
| - sigSupervisor <- s |
37 |
| - // the reaper needs all signals so that it can turn into |
38 |
| - // a terminating reaper if need be. |
39 |
| - fallthrough |
40 |
| - case syscall.SIGCHLD: |
41 |
| - // we don't want to blob the SIGINT/SIGTERM behaviour because |
42 |
| - // the reaper is still busy. |
43 |
| - select { |
44 |
| - case sigReaper <- s: |
45 |
| - default: |
46 |
| - } |
47 |
| - } |
48 |
| - } |
49 |
| - }() |
50 |
| - |
51 |
| - go reaper(sigReaper) |
| 39 | + signal.Notify(sigInput, os.Interrupt, syscall.SIGTERM) |
52 | 40 |
|
53 | 41 | supervisorPath, err := os.Executable()
|
54 | 42 | if err != nil {
|
@@ -76,66 +64,132 @@ var initCmd = &cobra.Command{
|
76 | 64 | return
|
77 | 65 | }
|
78 | 66 | }()
|
| 67 | + // start the reaper to clean up zombie processes |
| 68 | + reaper.Reap() |
79 | 69 |
|
80 | 70 | select {
|
81 | 71 | case <-supervisorDone:
|
82 | 72 | // supervisor has ended - we're all done here
|
83 | 73 | return
|
84 |
| - case s := <-sigSupervisor: |
| 74 | + case <-sigInput: |
85 | 75 | // we received a terminating signal - pass on to supervisor and wait for it to finish
|
86 |
| - _ = runCommand.Process.Signal(s) |
87 |
| - <-supervisorDone |
| 76 | + ctx, cancel := context.WithTimeout(context.Background(), cfg.GetTerminationGracePeriod()) |
| 77 | + defer cancel() |
| 78 | + slog := newShutdownLogger() |
| 79 | + defer slog.Close() |
| 80 | + slog.write("Shutting down all processes") |
| 81 | + |
| 82 | + terminationDone := make(chan struct{}) |
| 83 | + go func() { |
| 84 | + defer close(terminationDone) |
| 85 | + slog.TerminateSync(ctx, runCommand.Process.Pid) |
| 86 | + terminateAllProcesses(ctx, slog) |
| 87 | + close(supervisorDone) |
| 88 | + }() |
| 89 | + // wait for either successful termination or the timeout |
| 90 | + select { |
| 91 | + case <-ctx.Done(): |
| 92 | + // Time is up, but we give all the goroutines a bit more time to react to this. |
| 93 | + time.Sleep(time.Millisecond * 500) |
| 94 | + case <-terminationDone: |
| 95 | + } |
| 96 | + slog.write("Finished shutting down all processes.") |
88 | 97 | }
|
89 | 98 | },
|
90 | 99 | }
|
91 | 100 |
|
| 101 | +// terminateAllProcesses terminates all processes but ours until there are none anymore or the context is cancelled |
| 102 | +// on context cancellation any still running processes receive a SIGKILL |
| 103 | +func terminateAllProcesses(ctx context.Context, slog shutdownLogger) { |
| 104 | + for { |
| 105 | + processes, err := procfs.AllProcs() |
| 106 | + if err != nil { |
| 107 | + log.WithError(err).Error("Cannot list processes") |
| 108 | + slog.write(fmt.Sprintf("Cannot list processes: %s", err)) |
| 109 | + return |
| 110 | + } |
| 111 | + // only one process (must be us) |
| 112 | + if len(processes) == 1 { |
| 113 | + return |
| 114 | + } |
| 115 | + // terminate all processes but ourself |
| 116 | + var wg sync.WaitGroup |
| 117 | + for _, proc := range processes { |
| 118 | + if proc.PID == os.Getpid() { |
| 119 | + continue |
| 120 | + } |
| 121 | + p := proc |
| 122 | + wg.Add(1) |
| 123 | + go func() { |
| 124 | + defer wg.Done() |
| 125 | + slog.TerminateSync(ctx, p.PID) |
| 126 | + }() |
| 127 | + } |
| 128 | + wg.Wait() |
| 129 | + } |
| 130 | +} |
| 131 | + |
92 | 132 | func init() {
|
93 | 133 | rootCmd.AddCommand(initCmd)
|
94 | 134 | }
|
95 | 135 |
|
96 |
| -func reaper(sigs <-chan os.Signal) { |
97 |
| - // The reaper can be turned into a terminating reaper by writing true to this channel. |
98 |
| - // When in terminating mode, the reaper will send SIGTERM to each child that gets reparented |
99 |
| - // to us and is still running. We use this mechanism to send SIGTERM to a shell child processes |
100 |
| - // that get reparented once their parent shell terminates during shutdown. |
101 |
| - var terminating bool |
// shutdownLogger records what happens while the workspace processes are being
// shut down and terminates individual processes on behalf of the caller.
type shutdownLogger interface {
	// Close releases whatever the logger holds open.
	io.Closer

	// write appends the message s to the shutdown log.
	write(s string)

	// TerminateSync terminates the process with the given pid, logging the
	// progress; ctx bounds how long the termination may take.
	TerminateSync(ctx context.Context, pid int)
}
102 | 141 |
|
103 |
| - for s := range sigs { |
104 |
| - if s != syscall.SIGCHLD { |
105 |
| - terminating = true |
106 |
| - continue |
107 |
| - } |
| 142 | +func newShutdownLogger() shutdownLogger { |
| 143 | + file := "/workspace/.gitpod/supervisor-termination.log" |
| 144 | + f, err := os.Create(file) |
| 145 | + if err != nil { |
| 146 | + log.WithError(err).WithField("file", file).Error("Couldn't create shutdown log file") |
| 147 | + } |
| 148 | + result := shutdownLoggerImpl{ |
| 149 | + file: f, |
| 150 | + startTime: time.Now(), |
| 151 | + } |
| 152 | + return &result |
| 153 | +} |
108 | 154 |
|
109 |
| - for { |
110 |
| - // wait on the process, hence remove it from the process table |
111 |
| - pid, err := unix.Wait4(-1, nil, 0, nil) |
112 |
| - // if we've been interrupted, try again until we're done |
113 |
| - for err == syscall.EINTR { |
114 |
| - pid, err = unix.Wait4(-1, nil, 0, nil) |
115 |
| - } |
116 |
| - // The calling process does not have any unwaited-for children. Let's wait for a SIGCHLD notification. |
117 |
| - if err == unix.ECHILD { |
118 |
| - break |
119 |
| - } |
120 |
| - if err != nil { |
121 |
| - log.WithField("pid", pid).WithError(err).Debug("cannot call waitpid() for re-parented child") |
122 |
| - } |
123 |
| - if !terminating { |
124 |
| - continue |
125 |
| - } |
126 |
| - proc, err := os.FindProcess(pid) |
127 |
| - if err != nil { |
128 |
| - log.WithField("pid", pid).WithError(err).Debug("cannot find re-parented process") |
129 |
| - continue |
130 |
| - } |
131 |
| - err = proc.Signal(syscall.SIGTERM) |
132 |
| - if err != nil { |
133 |
| - if !strings.Contains(err.Error(), "os: process already finished") { |
134 |
| - log.WithField("pid", pid).WithError(err).Debug("cannot send SIGTERM to re-parented process") |
135 |
| - } |
136 |
| - continue |
137 |
| - } |
138 |
| - log.WithField("pid", pid).Debug("SIGTERM'ed reparented child process") |
// shutdownLoggerImpl is the file-backed shutdownLogger.
type shutdownLoggerImpl struct {
	// file receives the log entries; it is nil when the log file could not
	// be created.
	file *os.File
	// startTime is the reference point for the elapsed-time prefix that
	// write puts in front of every entry.
	startTime time.Time
}
| 159 | + |
| 160 | +func (l *shutdownLoggerImpl) write(s string) { |
| 161 | + if l.file != nil { |
| 162 | + _, err := l.file.WriteString(fmt.Sprintf("[%s] %s \n", time.Since(l.startTime), s)) |
| 163 | + if err != nil { |
| 164 | + log.WithError(err).Error("couldn't write to log file") |
| 165 | + } |
| 166 | + } else { |
| 167 | + log.Debug(s) |
| 168 | + } |
| 169 | +} |
| 170 | +func (l *shutdownLoggerImpl) Close() error { |
| 171 | + return l.file.Close() |
| 172 | +} |
| 173 | +func (l *shutdownLoggerImpl) TerminateSync(ctx context.Context, pid int) { |
| 174 | + proc, err := procfs.NewProc(pid) |
| 175 | + if err != nil { |
| 176 | + l.write(fmt.Sprintf("Couldn't obtain process information for PID %d.", pid)) |
| 177 | + return |
| 178 | + } |
| 179 | + stat, err := proc.Stat() |
| 180 | + if err != nil { |
| 181 | + l.write(fmt.Sprintf("Couldn't obtain process information for PID %d.", pid)) |
| 182 | + } else if stat.State == "Z" { |
| 183 | + return |
| 184 | + } else { |
| 185 | + l.write(fmt.Sprintf("Terminating process %s with PID %d (state: %s, cmdlind: %s).", stat.Comm, pid, stat.State, fmt.Sprint(proc.CmdLine()))) |
| 186 | + } |
| 187 | + err = process.TerminateSync(ctx, pid) |
| 188 | + if err != nil { |
| 189 | + if err == process.ErrForceKilled { |
| 190 | + l.write("Terminating process didn't finish, but had to be force killed") |
| 191 | + } else { |
| 192 | + l.write(fmt.Sprintf("Terminating main process errored: %s", err)) |
139 | 193 | }
|
140 | 194 | }
|
141 | 195 | }
|
0 commit comments