Source code

Revision control

Other Tools

1
/* -*- Mode: C++; tab-width: 8; indent-tabs-mode: nil; c-basic-offset: 2 -*-
2
* vim: sw=2 ts=2 et lcs=trail\:.,tab\:>~ :
3
* This Source Code Form is subject to the terms of the Mozilla Public
4
* License, v. 2.0. If a copy of the MPL was not distributed with this
5
* file, You can obtain one at http://mozilla.org/MPL/2.0/. */
6
7
/**
8
* A watchdog designed to terminate shutdown if it lasts too long.
9
*
10
* This watchdog is designed as a worst-case problem container for the
11
* common case in which Firefox just won't shutdown.
12
*
13
* We spawn a thread during quit-application. If any of the shutdown
14
* steps takes more than n milliseconds (63000 by default), kill the
15
* process as fast as possible, without any cleanup.
16
*/
17
18
#include "nsTerminator.h"
19
20
#include "prthread.h"
21
#include "prmon.h"
22
#include "plstr.h"
23
#include "prio.h"
24
25
#include "nsString.h"
26
#include "nsServiceManagerUtils.h"
27
#include "nsDirectoryServiceUtils.h"
28
#include "nsAppDirectoryServiceDefs.h"
29
30
#include "nsIObserverService.h"
31
#include "nsIPrefService.h"
32
#include "nsExceptionHandler.h"
33
#include "GeckoProfiler.h"
34
#include "nsThreadUtils.h"
35
36
#if defined(XP_WIN)
37
# include <windows.h>
38
#else
39
# include <unistd.h>
40
#endif
41
42
#include "mozilla/ArrayUtils.h"
43
#include "mozilla/Atomics.h"
44
#include "mozilla/Attributes.h"
45
#include "mozilla/DebugOnly.h"
46
#include "mozilla/MemoryChecking.h"
47
#include "mozilla/Preferences.h"
48
#include "mozilla/Services.h"
49
#include "mozilla/UniquePtr.h"
50
#include "mozilla/Unused.h"
51
#include "mozilla/Telemetry.h"
52
53
#include "mozilla/dom/workerinternals/RuntimeService.h"
54
55
// Normally, the number of milliseconds that AsyncShutdown waits until
56
// it decides to crash is specified as a preference. We use the
57
// following value as a fallback if for some reason the preference is
58
// absent.
59
#define FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS 60000
60
61
// Additional number of milliseconds to wait until we decide to exit
62
// forcefully.
63
#define ADDITIONAL_WAIT_BEFORE_CRASH_MS 3000
64
65
namespace mozilla {
66
67
namespace {
68
69
/**
70
* A step during shutdown.
71
*
72
* Shutdown is divided in steps, which all map to an observer
73
* notification. The duration of a step is defined as the number of
74
* ticks between the time we receive a notification and the next one.
75
*/
76
struct ShutdownStep {
77
char const* const mTopic;
78
int mTicks;
79
80
constexpr explicit ShutdownStep(const char* const topic)
81
: mTopic(topic), mTicks(-1) {}
82
};
83
84
static ShutdownStep sShutdownSteps[] = {
85
ShutdownStep("quit-application"),
86
ShutdownStep("profile-change-teardown"),
87
ShutdownStep("profile-before-change"),
88
ShutdownStep("xpcom-will-shutdown"),
89
ShutdownStep("xpcom-shutdown"),
90
};
91
92
Atomic<bool> sShutdownNotified;
93
94
// Utility function: create a thread that is non-joinable,
95
// does not prevent the process from terminating, is never
96
// cooperatively scheduled, and uses a default stack size.
97
PRThread* CreateSystemThread(void (*start)(void* arg), void* arg) {
98
PRThread* thread =
99
PR_CreateThread(PR_SYSTEM_THREAD, /* This thread will not prevent the
100
process from terminating */
101
start, arg, PR_PRIORITY_LOW,
102
PR_GLOBAL_THREAD, /* Make sure that the thread is never
103
cooperatively scheduled */
104
PR_UNJOINABLE_THREAD, 0 /* Use default stack size */
105
);
106
MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(
107
thread); // This pointer will never be deallocated.
108
return thread;
109
}
110
111
////////////////////////////////////////////
112
//
113
// The watchdog
114
//
115
// This nspr thread is in charge of crashing the process if any stage of
116
// shutdown lasts more than some predefined duration. As a side-effect, it
117
// measures the duration of each stage of shutdown.
118
//
119
120
// The heartbeat of the operation.
121
//
122
// Main thread:
123
//
124
// * Whenever a shutdown step has been completed, the main thread
125
// swaps gHeartbeat to 0 to mark that the shutdown process is still
126
// progressing. The value swapped away indicates the number of ticks
127
// it took for the shutdown step to advance.
128
//
129
// Watchdog thread:
130
//
131
// * Every tick, the watchdog thread increments gHearbeat atomically.
132
//
133
// A note about precision:
134
// Since gHeartbeat is generally reset to 0 between two ticks, this means
135
// that gHeartbeat stays at 0 less than one tick. Consequently, values
136
// extracted from gHeartbeat must be considered rounded up.
137
Atomic<uint32_t> gHeartbeat(0);
138
139
struct Options {
140
/**
141
* How many ticks before we should crash the process.
142
*/
143
uint32_t crashAfterTicks;
144
};
145
146
/**
147
* Entry point for the watchdog thread
148
*/
149
void RunWatchdog(void* arg) {
150
NS_SetCurrentThreadName("Shutdown Hang Terminator");
151
152
// Let's copy and deallocate options, that's one less leak to worry
153
// about.
154
UniquePtr<Options> options((Options*)arg);
155
uint32_t crashAfterTicks = options->crashAfterTicks;
156
options = nullptr;
157
158
const uint32_t timeToLive = crashAfterTicks;
159
while (true) {
160
//
161
// We do not want to sleep for the entire duration,
162
// as putting the computer to sleep would suddenly
163
// cause us to timeout on wakeup.
164
//
165
// Rather, we prefer sleeping for at most 1 second
166
// at a time. If the computer sleeps then wakes up,
167
// we have lost at most one second, which is much
168
// more reasonable.
169
//
170
#if defined(XP_WIN)
171
Sleep(1000 /* ms */);
172
#else
173
usleep(1000000 /* usec */);
174
#endif
175
176
if (gHeartbeat++ < timeToLive) {
177
continue;
178
}
179
180
// The shutdown steps are not completed yet. Let's report the last one.
181
if (!sShutdownNotified) {
182
const char* lastStep = nullptr;
183
for (size_t i = 0; i < ArrayLength(sShutdownSteps); ++i) {
184
if (sShutdownSteps[i].mTicks == -1) {
185
break;
186
}
187
lastStep = sShutdownSteps[i].mTopic;
188
}
189
190
if (lastStep) {
191
nsCString msg;
192
msg.AppendPrintf(
193
"Shutdown hanging at step %s. "
194
"Something is blocking the main-thread.",
195
lastStep);
196
// This string will be leaked.
197
MOZ_CRASH_UNSAFE(strdup(msg.BeginReading()));
198
}
199
200
MOZ_CRASH("Shutdown hanging before starting.");
201
}
202
203
// Maybe some workers are blocking the shutdown.
204
mozilla::dom::workerinternals::RuntimeService* runtimeService =
205
mozilla::dom::workerinternals::RuntimeService::GetService();
206
if (runtimeService) {
207
runtimeService->CrashIfHanging();
208
}
209
210
// Shutdown is apparently dead. Crash the process.
211
CrashReporter::SetMinidumpAnalysisAllThreads();
212
213
MOZ_CRASH("Shutdown too long, probably frozen, causing a crash.");
214
}
215
}
216
217
////////////////////////////////////////////
218
//
219
// Writer thread
220
//
221
// This nspr thread is in charge of writing to disk statistics produced by the
222
// watchdog thread and collected by the main thread. Note that we use a nspr
223
// thread rather than usual XPCOM I/O simply because we outlive XPCOM and its
224
// threads.
225
//
226
227
// Utility class, used by UniquePtr<> to close nspr files.
228
class PR_CloseDelete {
229
public:
230
constexpr PR_CloseDelete() = default;
231
232
PR_CloseDelete(const PR_CloseDelete& aOther) = default;
233
234
void operator()(PRFileDesc* aPtr) const { PR_Close(aPtr); }
235
};
236
237
//
238
// Communication between the main thread and the writer thread.
239
//
240
// Main thread:
241
//
242
// * Whenever a shutdown step has been completed, the main thread
243
// obtains the number of ticks from the watchdog threads, builds
244
// a string representing all the data gathered so far, places
245
// this string in `gWriteData`, and wakes up the writer thread
246
// using `gWriteReady`. If `gWriteData` already contained a non-null
247
// pointer, this means that the writer thread is lagging behind the
248
// main thread, and the main thread cleans up the memory.
249
//
250
// Writer thread:
251
//
252
// * When awake, the writer thread swaps `gWriteData` to nullptr. If
253
// `gWriteData` contained data to write, the . If so, the writer
254
// thread writes the data to a file named "ShutdownDuration.json.tmp",
255
// then moves that file to "ShutdownDuration.json" and cleans up the
256
// data. If `gWriteData` contains a nullptr, the writer goes to sleep
257
// until it is awkened using `gWriteReady`.
258
//
259
//
260
// The data written by the writer thread will be read by another
261
// module upon the next restart and fed to Telemetry.
262
//
263
Atomic<nsCString*> gWriteData(nullptr);
264
PRMonitor* gWriteReady = nullptr;
265
266
void RunWriter(void* arg) {
267
AUTO_PROFILER_REGISTER_THREAD("Shutdown Statistics Writer");
268
NS_SetCurrentThreadName("Shutdown Statistics Writer");
269
270
MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(arg);
271
// Shutdown will generally complete before we have a chance to
272
// deallocate. This is not a leak.
273
274
// Setup destinationPath and tmpFilePath
275
276
nsCString destinationPath;
277
destinationPath.Adopt(static_cast<char*>(arg));
278
nsAutoCString tmpFilePath;
279
tmpFilePath.Append(destinationPath);
280
tmpFilePath.AppendLiteral(".tmp");
281
282
// Cleanup any file leftover from a previous run
283
Unused << PR_Delete(tmpFilePath.get());
284
Unused << PR_Delete(destinationPath.get());
285
286
while (true) {
287
//
288
// Check whether we have received data from the main thread.
289
//
290
// We perform the check before waiting on `gWriteReady` as we may
291
// have received data while we were busy writing.
292
//
293
// Also note that gWriteData may have been modified several times
294
// since we last checked. That's ok, we are not losing any important
295
// data (since we keep adding data), and we are not leaking memory
296
// (since the main thread deallocates any data that hasn't been
297
// consumed by the writer thread).
298
//
299
UniquePtr<nsCString> data(gWriteData.exchange(nullptr));
300
if (!data) {
301
// Data is not available yet.
302
// Wait until the main thread provides it.
303
PR_EnterMonitor(gWriteReady);
304
PR_Wait(gWriteReady, PR_INTERVAL_NO_TIMEOUT);
305
PR_ExitMonitor(gWriteReady);
306
continue;
307
}
308
309
MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(data.get());
310
// Shutdown may complete before we have a chance to deallocate.
311
// This is not a leak.
312
313
//
314
// Write to a temporary file
315
//
316
// In case of any error, we simply give up. Since the data is
317
// hardly critical, we don't want to spend too much effort
318
// salvaging it.
319
//
320
UniquePtr<PRFileDesc, PR_CloseDelete> tmpFileDesc(PR_Open(
321
tmpFilePath.get(), PR_WRONLY | PR_TRUNCATE | PR_CREATE_FILE, 00600));
322
323
// Shutdown may complete before we have a chance to close the file.
324
// This is not a leak.
325
MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(tmpFileDesc.get());
326
327
if (tmpFileDesc == nullptr) {
328
break;
329
}
330
if (PR_Write(tmpFileDesc.get(), data->get(), data->Length()) == -1) {
331
break;
332
}
333
tmpFileDesc.reset();
334
335
//
336
// Rename on top of destination file.
337
//
338
// This is not sufficient to guarantee that the destination file
339
// will be written correctly, but, again, we don't care enough
340
// about the data to make more efforts.
341
//
342
if (PR_Rename(tmpFilePath.get(), destinationPath.get()) != PR_SUCCESS) {
343
break;
344
}
345
}
346
}
347
348
} // namespace
349
350
NS_IMPL_ISUPPORTS(nsTerminator, nsIObserver)
351
352
nsTerminator::nsTerminator() : mInitialized(false), mCurrentStep(-1) {}
353
354
// During startup, register as an observer for all interesting topics.
355
nsresult nsTerminator::SelfInit() {
356
nsCOMPtr<nsIObserverService> os = mozilla::services::GetObserverService();
357
if (!os) {
358
return NS_ERROR_UNEXPECTED;
359
}
360
361
for (auto& shutdownStep : sShutdownSteps) {
362
DebugOnly<nsresult> rv = os->AddObserver(this, shutdownStep.mTopic, false);
363
NS_WARNING_ASSERTION(NS_SUCCEEDED(rv), "AddObserver failed");
364
}
365
366
return NS_OK;
367
}
368
369
// Actually launch these threads. This takes place at the first sign of
370
// shutdown.
371
void nsTerminator::Start() {
372
MOZ_ASSERT(!mInitialized);
373
StartWatchdog();
374
#if !defined(NS_FREE_PERMANENT_DATA)
375
// Only allow nsTerminator to write on non-leak-checked builds so we don't
376
// get leak warnings on shutdown for intentional leaks (see bug 1242084).
377
// This will be enabled again by bug 1255484 when 1255478 lands.
378
StartWriter();
379
#endif // !defined(NS_FREE_PERMANENT_DATA)
380
mInitialized = true;
381
sShutdownNotified = false;
382
}
383
384
// Prepare, allocate and start the watchdog thread.
385
// By design, it will never finish, nor be deallocated.
386
void nsTerminator::StartWatchdog() {
387
int32_t crashAfterMS =
388
Preferences::GetInt("toolkit.asyncshutdown.crash_timeout",
389
FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS);
390
// Ignore negative values
391
if (crashAfterMS <= 0) {
392
crashAfterMS = FALLBACK_ASYNCSHUTDOWN_CRASH_AFTER_MS;
393
}
394
395
// Add a little padding, to ensure that we do not crash before
396
// AsyncShutdown.
397
if (crashAfterMS > INT32_MAX - ADDITIONAL_WAIT_BEFORE_CRASH_MS) {
398
// Defend against overflow
399
crashAfterMS = INT32_MAX;
400
} else {
401
crashAfterMS += ADDITIONAL_WAIT_BEFORE_CRASH_MS;
402
}
403
404
#ifdef MOZ_VALGRIND
405
// If we're running on Valgrind, we'll be making forward progress at a
406
// rate of somewhere between 1/25th and 1/50th of normal. This can cause
407
// timeouts frequently enough to be a problem for the Valgrind runs on
408
// automation: see bug 1296819. As an attempt to avoid the worst of this,
409
// scale up the presented timeout by a factor of three. For a
410
// non-Valgrind-enabled build, or for an enabled build which isn't running
411
// on Valgrind, the timeout is unchanged.
412
if (RUNNING_ON_VALGRIND) {
413
const int32_t scaleUp = 3;
414
if (crashAfterMS >= (INT32_MAX / scaleUp) - 1) {
415
// Defend against overflow
416
crashAfterMS = INT32_MAX;
417
} else {
418
crashAfterMS *= scaleUp;
419
}
420
}
421
#endif
422
423
UniquePtr<Options> options(new Options());
424
const PRIntervalTime ticksDuration = PR_MillisecondsToInterval(1000);
425
options->crashAfterTicks = crashAfterMS / ticksDuration;
426
// Handle systems where ticksDuration is greater than crashAfterMS.
427
if (options->crashAfterTicks == 0) {
428
options->crashAfterTicks = crashAfterMS / 1000;
429
}
430
431
DebugOnly<PRThread*> watchdogThread =
432
CreateSystemThread(RunWatchdog, options.release());
433
MOZ_ASSERT(watchdogThread);
434
}
435
436
// Prepare, allocate and start the writer thread. By design, it will never
437
// finish, nor be deallocated. In case of error, we degrade
438
// gracefully to not writing Telemetry data.
439
void nsTerminator::StartWriter() {
440
if (!Telemetry::CanRecordExtended()) {
441
return;
442
}
443
nsCOMPtr<nsIFile> profLD;
444
nsresult rv = NS_GetSpecialDirectory(NS_APP_USER_PROFILE_LOCAL_50_DIR,
445
getter_AddRefs(profLD));
446
if (NS_FAILED(rv)) {
447
return;
448
}
449
450
rv = profLD->Append(NS_LITERAL_STRING("ShutdownDuration.json"));
451
if (NS_FAILED(rv)) {
452
return;
453
}
454
455
nsAutoString path;
456
rv = profLD->GetPath(path);
457
if (NS_FAILED(rv)) {
458
return;
459
}
460
461
gWriteReady = PR_NewMonitor();
462
MOZ_LSAN_INTENTIONALLY_LEAK_OBJECT(
463
gWriteReady); // We will never deallocate this object
464
PRThread* writerThread = CreateSystemThread(RunWriter, ToNewUTF8String(path));
465
466
if (!writerThread) {
467
return;
468
}
469
}
470
471
NS_IMETHODIMP
472
nsTerminator::Observe(nsISupports*, const char* aTopic, const char16_t*) {
473
if (strcmp(aTopic, "profile-after-change") == 0) {
474
return SelfInit();
475
}
476
477
// Other notifications are shutdown-related.
478
479
// As we have seen examples in the wild of shutdown notifications
480
// not being sent (or not being sent in the expected order), we do
481
// not assume a specific order.
482
if (!mInitialized) {
483
Start();
484
}
485
486
UpdateHeartbeat(aTopic);
487
#if !defined(NS_FREE_PERMANENT_DATA)
488
// Only allow nsTerminator to write on non-leak checked builds so we don't get
489
// leak warnings on shutdown for intentional leaks (see bug 1242084). This
490
// will be enabled again by bug 1255484 when 1255478 lands.
491
UpdateTelemetry();
492
#endif // !defined(NS_FREE_PERMANENT_DATA)
493
UpdateCrashReport(aTopic);
494
495
// Perform a little cleanup
496
nsCOMPtr<nsIObserverService> os = mozilla::services::GetObserverService();
497
MOZ_RELEASE_ASSERT(os);
498
(void)os->RemoveObserver(this, aTopic);
499
500
return NS_OK;
501
}
502
503
void nsTerminator::UpdateHeartbeat(const char* aTopic) {
504
// Reset the clock, find out how long the current phase has lasted.
505
uint32_t ticks = gHeartbeat.exchange(0);
506
if (mCurrentStep > 0) {
507
sShutdownSteps[mCurrentStep].mTicks = ticks;
508
}
509
510
// Find out where we now are in the current shutdown.
511
// Don't assume that shutdown takes place in the expected order.
512
int nextStep = -1;
513
for (size_t i = 0; i < ArrayLength(sShutdownSteps); ++i) {
514
if (strcmp(sShutdownSteps[i].mTopic, aTopic) == 0) {
515
nextStep = i;
516
break;
517
}
518
}
519
MOZ_ASSERT(nextStep != -1);
520
mCurrentStep = nextStep;
521
}
522
523
void nsTerminator::UpdateTelemetry() {
524
if (!Telemetry::CanRecordExtended() || !gWriteReady) {
525
return;
526
}
527
528
//
529
// We need Telemetry data on the effective duration of each step,
530
// to be able to tune the time-to-crash of each of both the
531
// Terminator and AsyncShutdown. However, at this stage, it is too
532
// late to record such data into Telemetry, so we write it to disk
533
// and read it upon the next startup.
534
//
535
536
// Build JSON.
537
UniquePtr<nsCString> telemetryData(new nsCString());
538
telemetryData->AppendLiteral("{");
539
size_t fields = 0;
540
for (auto& shutdownStep : sShutdownSteps) {
541
if (shutdownStep.mTicks < 0) {
542
// Ignore this field.
543
continue;
544
}
545
if (fields++ > 0) {
546
telemetryData->AppendLiteral(", ");
547
}
548
telemetryData->AppendLiteral(R"(")");
549
telemetryData->Append(shutdownStep.mTopic);
550
telemetryData->AppendLiteral(R"(": )");
551
telemetryData->AppendInt(shutdownStep.mTicks);
552
}
553
telemetryData->AppendLiteral("}");
554
555
if (fields == 0) {
556
// Nothing to write
557
return;
558
}
559
560
//
561
// Send data to the worker thread.
562
//
563
delete gWriteData.exchange(
564
telemetryData.release()); // Clear any data that hasn't been written yet
565
566
// In case the worker thread was sleeping, wake it up.
567
PR_EnterMonitor(gWriteReady);
568
PR_Notify(gWriteReady);
569
PR_ExitMonitor(gWriteReady);
570
}
571
572
void nsTerminator::UpdateCrashReport(const char* aTopic) {
573
// In case of crash, we wish to know where in shutdown we are
574
nsAutoCString report(aTopic);
575
576
Unused << CrashReporter::AnnotateCrashReport(
577
CrashReporter::Annotation::ShutdownProgress, report);
578
}
579
580
void XPCOMShutdownNotified() {
581
MOZ_DIAGNOSTIC_ASSERT(sShutdownNotified == false);
582
sShutdownNotified = true;
583
}
584
585
} // namespace mozilla