WatchDog
初始化
startOtherServices
private void startOtherServices() {
---------------------------------------------------------
traceBeginAndSlog("InitWatchdog");
final Watchdog watchdog = Watchdog.getInstance();
watchdog.init(context, mActivityManagerService);
---------------------------------------------------------
mActivityManagerService.systemReady(new Runnable() {
@Override
public void run() {
Slog.i(TAG, "Making services ready");
------------------------------------------------------------------
Watchdog.getInstance().start();
// It is now okay to let the various system services start their
// third party code...
Trace.traceEnd(Trace.TRACE_TAG_SYSTEM_SERVER);
}
}
WatchDog
public class Watchdog extends Thread {
static final String TAG = "Watchdog";
static final String BINDERTRACKER = "binderTracker";
// Set this to true to use debug default values.
static final boolean DB = false;
// Set this to true to have the watchdog record kernel thread stacks when it fires
static final boolean RECORD_KERNEL_THREADS = true;
static final long DEFAULT_TIMEOUT = DB ? 10*1000 : 60*1000;
static final long CHECK_INTERVAL = DEFAULT_TIMEOUT / 2;
static Watchdog sWatchdog;
// Looper Checker,用于检查线程的消息队列是否长时间处于工作状态。Watchdog自身的消息队列,ui, Io, display这些全局的消息队列都是被检查的对象
/* This handler will be used to post message back onto the main thread */
final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
// Monitor Checker,用于检查是Monitor对象可能发生的死锁, AMS, PKMS, WMS等核心的系统服务都是Monitor对象。
final HandlerChecker mMonitorChecker;
public static Watchdog getInstance() {
if (sWatchdog == null) {
sWatchdog = new Watchdog();
}
return sWatchdog;
}
private Watchdog() {
super("watchdog");
// Initialize handler checkers for each common thread we want to check. Note
// that we are not currently checking the background thread, since it can
// potentially hold longer running operations with no guarantees about the timeliness
// of operations there.
// 将前台线程加入队列
// The shared foreground thread is the main checker. It is where we
// will also dispatch monitor checks and do other work.
mMonitorChecker = new HandlerChecker(FgThread.getHandler(),
"foreground thread", DEFAULT_TIMEOUT);
mHandlerCheckers.add(mMonitorChecker);
// 将主线程加入队列
// Add checker for main thread. We only do a quick check since there
// can be UI running on the thread.
mHandlerCheckers.add(new HandlerChecker(new Handler(Looper.getMainLooper()),
"main thread", DEFAULT_TIMEOUT));
// 将ui线程加入队列
// Add checker for shared UI thread.
mHandlerCheckers.add(new HandlerChecker(UiThread.getHandler(),
"ui thread", DEFAULT_TIMEOUT));
// 将i/o线程加入队列
// And also check IO thread.
mHandlerCheckers.add(new HandlerChecker(IoThread.getHandler(),
"i/o thread", DEFAULT_TIMEOUT));
// 将display线程加入队列
// And the display thread.
mHandlerCheckers.add(new HandlerChecker(DisplayThread.getHandler(),
"display thread", DEFAULT_TIMEOUT));
// 监听Binder线程
// Initialize monitor for Binder threads.
addMonitor(new BinderThreadMonitor());
}
public void addMonitor(Monitor monitor) {
synchronized (this) {
if (isAlive()) {
throw new RuntimeException("Monitors can't be added once the Watchdog is running");
}
mMonitorChecker.addMonitor(monitor);
}
}
public void addThread(Handler thread) {
addThread(thread, DEFAULT_TIMEOUT);
}
public void addThread(Handler thread, long timeoutMillis) {
synchronized (this) {
if (isAlive()) {
throw new RuntimeException("Threads can't be added once the Watchdog is running");
}
final String name = thread.getLooper().getThread().getName();
mHandlerCheckers.add(new HandlerChecker(thread, name, timeoutMillis));
}
}
public void init(Context context, ActivityManagerService activity) {
mResolver = context.getContentResolver();
mActivity = activity;
// 监听重启广播
context.registerReceiver(new RebootRequestReceiver(),
new IntentFilter(Intent.ACTION_REBOOT),
android.Manifest.permission.REBOOT, null);
}
public interface Monitor {
void monitor();
}
/**
* Used for checking status of handle threads and scheduling monitor callbacks.
*/
public final class HandlerChecker implements Runnable {
public void scheduleCheckLocked() {
if (mMonitors.size() == 0 && mHandler.getLooper().getQueue().isPolling()) {
// If the target looper has recently been polling, then
// there is no reason to enqueue our checker on it since that
// is as good as it not being deadlocked. This avoid having
// to do a context switch to check the thread. Note that we
// only do this if mCheckReboot is false and we have no
// monitors, since those would need to be executed at this point.
mCompleted = true;
return;
}
if (!mCompleted) {
// we already have a check in flight, so no need
return;
}
mCompleted = false;
mCurrentMonitor = null;
mStartTime = SystemClock.uptimeMillis();
// 插入消息到消息队列头部
mHandler.postAtFrontOfQueue(this);
}
@Override
public void run() {
// 处理插入消息队列头部的消息
final int size = mMonitors.size();
for (int i = 0 ; i < size ; i++) {
synchronized (Watchdog.this) {
mCurrentMonitor = mMonitors.get(i);
}
// 检查服务对象锁
mCurrentMonitor.monitor();
}
synchronized (Watchdog.this) {
mCompleted = true;
mCurrentMonitor = null;
}
}
}
/**
* Monitor for checking the availability of binder threads. The monitor will block until
* there is a binder thread available to process in coming IPCs to make sure other processes
* can still communicate with the service.
*/
private static final class BinderThreadMonitor implements Watchdog.Monitor {
@Override
public void monitor() {
Binder.blockUntilThreadAvailable();
}
}
}
UI Thread
public class ServiceThread extends HandlerThread {
}
public final class UiThread extends ServiceThread {
private static UiThread sInstance;
private static Handler sHandler;
private UiThread() {
super("android.ui", Process.THREAD_PRIORITY_FOREGROUND, false /*allowIo*/);
// Make sure UiThread is in the fg stune boost group
Process.setThreadGroup(Process.myTid(), Process.THREAD_GROUP_TOP_APP);
}
private static void ensureThreadLocked() {
if (sInstance == null) {
sInstance = new UiThread();
sInstance.start();
sInstance.getLooper().setTraceTag(Trace.TRACE_TAG_ACTIVITY_MANAGER);
sHandler = new Handler(sInstance.getLooper());
}
}
public static UiThread get() {
synchronized (UiThread.class) {
ensureThreadLocked();
return sInstance;
}
}
public static Handler getHandler() {
synchronized (UiThread.class) {
ensureThreadLocked();
return sHandler;
}
}
}
添加监听
Monitor Checker
final HandlerChecker mMonitorChecker;
用于检查是Monitor对象可能发生的死锁, AMS, PKMS, WMS等核心的系统服务都是Monitor对象。
预警我们不能长时间持有核心系统服务的对象锁,否则会阻塞很多函数的运行;
Looper Checker
final ArrayList<HandlerChecker> mHandlerCheckers = new ArrayList<>();
用于检查线程的消息队列是否长时间处于工作状态。Watchdog自身的消息队列,ui, Io, display这些全局的消息队列都是被检查的对象, 此外,一些重要的线程的消息队列,也会加入到Looper Checker中,譬如AMS, PKMS,这些是在对应的对象初始化时加入的
预警我们不能长时间的霸占消息队列,否则其他消息将得不到处理
AMS
public final class ActivityManagerService extends ActivityManagerNative
implements Watchdog.Monitor, BatteryStatsImpl.BatteryCallback {
public ActivityManagerService(Context systemContext) {
// 检查 AMS 对象锁
Watchdog.getInstance().addMonitor(this);
// 重要线程消息队列
Watchdog.getInstance().addThread(mHandler);
}
public void monitor() {
synchronized (this) { }
}
}
核心原理
在添加Checker之后,该如何使用这些Checker呢?因为Watchdog继承Thread,直接看run方法
@Override
public void run() {
boolean waitedHalf = false;
// 循环检查/每30秒
while (true) {
final ArrayList<HandlerChecker> blockedCheckers;
final String subject;
final boolean allowRestart;
int debuggerWasConnected = 0;
synchronized (this) {
long timeout = CHECK_INTERVAL;
//1、处理所有的HandlerChecker(插入消息到消息队列头部)
// Make sure we (re)spin the checkers that have become idle within
// this wait-and-check interval
for (int i=0; i<mHandlerCheckers.size(); i++) {
HandlerChecker hc = mHandlerCheckers.get(i);
hc.scheduleCheckLocked();
}
if (debuggerWasConnected > 0) {
debuggerWasConnected--;
}
// 2. 开始定期检查(等待30秒后检查checker状态,即HandlerChecker的run方法执行情况)
// NOTE: We use uptimeMillis() here because we do not want to increment the time we
// wait while asleep. If the device is asleep then the thing that we are waiting
// to timeout on is asleep as well and won't have a chance to run, causing a false
// positive on when to kill things.
long start = SystemClock.uptimeMillis();
while (timeout > 0) {
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
try {
wait(timeout);
} catch (InterruptedException e) {
Log.wtf(TAG, e);
}
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
timeout = CHECK_INTERVAL - (SystemClock.uptimeMillis() - start);
}
// 3. 获取状态,状态有如下三种,
final int waitState = evaluateCheckerCompletionLocked();
if (waitState == COMPLETED) {
// The monitors have returned; reset
waitedHalf = false;
continue;
} else if (waitState == WAITING) {
// 等待时间小于30秒
// still waiting but within their configured intervals; back off and recheck
continue;
} else if (waitState == WAITED_HALF) {
// 等待时间在30秒到60秒之间
if (!waitedHalf) {
// We've waited half the deadlock-detection interval. Pull a stack
// trace and wait another half.
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
ActivityManagerService.dumpStackTraces(true, pids, null, null,
NATIVE_STACKS_OF_INTEREST);
waitedHalf = true;
}
continue;
}
// 走到这里,说明存在超时的HandlerChecker
// something is overdue!
blockedCheckers = getBlockedCheckersLocked();
subject = describeCheckersLocked(blockedCheckers);
allowRestart = mAllowRestart;
}
// Event log打印发生了watchdog
// If we got here, that means that the system is most likely hung.
// First collect stack traces from all threads of the system process.
// Then kill this process so that the system will restart.
EventLog.writeEvent(EventLogTags.WATCHDOG, subject);
ArrayList<Integer> pids = new ArrayList<Integer>();
pids.add(Process.myPid());
//Add processes to <firstPids> which are communicating with <app.pid>.
boolean enableTrackBinder = SystemProperties.getBoolean("persist.sys.enableTrackBinder",false);
if(enableTrackBinder){
int zygotePid = Process.myPpid();
Log.i(BINDERTRACKER, "zygotePid: "+ zygotePid);
BinderTracker binderTracker = new BinderTracker(Process.myPid());
ArrayList<Integer> binderPids = null;
binderPids = binderTracker.getBinderTransaction();
if (binderPids != null && binderPids.size() != 0) {
int binderPidsSize = binderPids.size();
for (int mPerBinderPids = 0; mPerBinderPids < binderPidsSize; mPerBinderPids++) {
if (!pids.contains(binderPids.get(mPerBinderPids))) {
try {
int parentPid=Process.getParentPid(binderPids.get(mPerBinderPids));
if (zygotePid == parentPid){
pids.add(binderPids.get(mPerBinderPids));
} else {
Log.i(BINDERTRACKER, "binder communication with native process : "+ binderPids.get(mPerBinderPids));
}
} finally {
}
}
}
}
}
//开始dumpStackTraces,包含pids中的进程和getInterestingNativePids中的进程
if (mPhonePid > 0) pids.add(mPhonePid);
// Pass !waitedHalf so that just in case we somehow wind up here without having
// dumped the halfway stacks, we properly re-initialize the trace file.
final File stack = ActivityManagerService.dumpStackTraces(
!waitedHalf, pids, null, null, NATIVE_STACKS_OF_INTEREST);
// 等待2s确保dumpStackTraces输出完成
// Give some extra time to make sure the stack traces get written.
// The system's been hanging for a minute, another second or two won't hurt much.
SystemClock.sleep(2000);
// 开始dumpKernelStackTraces
// Pull our own kernel thread stacks as well if we're configured for that
if (RECORD_KERNEL_THREADS) {
// 输出kernel trace信息
dumpKernelStackTraces();
}
String tracesPath = SystemProperties.get("dalvik.vm.stack-trace-file", null);
String traceFileNameAmendment = "_SystemServer_WDT" + mTraceDateFormat.format(new Date());
if (tracesPath != null && tracesPath.length() != 0) {
File traceRenameFile = new File(tracesPath);
String newTracesPath;
int lpos = tracesPath.lastIndexOf (".");
if (-1 != lpos)
newTracesPath = tracesPath.substring (0, lpos) + traceFileNameAmendment + tracesPath.substring (lpos);
else
newTracesPath = tracesPath + traceFileNameAmendment;
traceRenameFile.renameTo(new File(newTracesPath));
tracesPath = newTracesPath;
}
final File newFd = new File(tracesPath);
// 输出dropbox
// Try to add the error to the dropbox, but assuming that the ActivityManager
// itself may be deadlocked. (which has happened, causing this statement to
// deadlock and the watchdog as a whole to be ineffective)
Thread dropboxThread = new Thread("watchdogWriteToDropbox") {
public void run() {
mActivity.addErrorToDropBox(
"watchdog", null, "system_server", null, null,
subject, null, newFd, null);
}
};
dropboxThread.start();
try {
dropboxThread.join(2000); // wait up to 2 seconds for it to return.
} catch (InterruptedException ignored) {}
// At times, when user space watchdog traces don't give an indication on
// which component held a lock, because of which other threads are blocked,
// (thereby causing Watchdog), crash the device to analyze RAM dumps
boolean crashOnWatchdog = SystemProperties
.getBoolean("persist.sys.crashOnWatchdog", false);
if (crashOnWatchdog) {
// Trigger the kernel to dump all blocked threads, and backtraces
// on all CPUs to the kernel log
Slog.e(TAG, "Triggering SysRq for system_server watchdog");
doSysRq('w');
doSysRq('l');
// wait until the above blocked threads be dumped into kernel log
SystemClock.sleep(3000);
// now try to crash the target
doSysRq('c');
}
IActivityController controller;
synchronized (this) {
controller = mController;
}
if (controller != null) {
Slog.i(TAG, "Reporting stuck state to activity controller");
// 将阻塞状态报告给 activity controller
try {
Binder.setDumpDisabled("Service dumps disabled due to hung system process.");
// 1 = keep waiting, -1 = kill system
// 1 = 继续等待 -1 = 杀死系统
int res = controller.systemNotResponding(subject);
if (res >= 0) {
Slog.i(TAG, "Activity controller requested to coninue to wait");
waitedHalf = false;
continue;
}
} catch (RemoteException e) {
}
}
// Only kill the process if the debugger is not attached.
if (Debug.isDebuggerConnected()) {
debuggerWasConnected = 2;
}
if (debuggerWasConnected >= 2) {
Slog.w(TAG, "Debugger connected: Watchdog is *not* killing the system process");
} else if (debuggerWasConnected > 0) {
Slog.w(TAG, "Debugger was connected: Watchdog is *not* killing the system process");
} else if (!allowRestart) {
Slog.w(TAG, "Restart not allowed: Watchdog is *not* killing the system process");
} else {
Slog.w(TAG, "*** WATCHDOG KILLING SYSTEM PROCESS: " + subject);
// 遍历输入阻塞线程的trace信息
for (int i=0; i<blockedCheckers.size(); i++) {
Slog.w(TAG, blockedCheckers.get(i).getName() + " stack trace:");
StackTraceElement[] stackTrace
= blockedCheckers.get(i).getThread().getStackTrace();
for (StackTraceElement element: stackTrace) {
Slog.w(TAG, " at " + element);
}
}
Slog.w(TAG, "*** GOODBYE!");
// 最终杀死System进程
Process.killProcess(Process.myPid());
System.exit(10);
}
waitedHalf = false;
}
}
总结
Watchdog是一个运行在system_server进程的名为”watchdog”的线程::
- Watchdog运作过程,当阻塞时间超过1分钟则触发一次watchdog,会杀死system_server,触发上层重启;
mHandlerCheckers
记录所有的HandlerChecker对象的列表,包括foreground, main, ui, i/o, display线程的handler;mHandlerChecker.mMonitors
记录所有Watchdog目前正在监控Monitor,所有的这些monitors都运行在foreground线程。- 有两种方式加入
Watchdog监控
:- addThread():用于监测Handler线程,默认超时时长为60s.这种超时往往是所对应的handler线程消息处理得慢;
- addMonitor(): 用于监控实现了Watchdog.Monitor接口的服务.这种超时可能是”android.fg”线程消息处理得慢,也可能是monitor迟迟拿不到锁;
以下情况,即使触发了Watchdog,也不会杀掉system_server进程:
- monkey: 设置IActivityController,拦截systemNotResponding事件, 比如monkey.
- hang: 执行am hang命令,不重启;
- debugger: 连接debugger的情况, 不重启;
监控Handler线程
Watchdog监控的线程有:默认地DEFAULT_TIMEOUT=60s
,调试时才为10s方便找出潜在的ANR问题。
线程名 | 对应handler | 说明 | Timeout |
---|---|---|---|
main | new Handler(Looper.getMainLooper()) | 当前主线程 | 1min |
android.fg | FgThread.getHandler | 前台线程 | 1min |
android.ui | UiThread.getHandler | UI线程 | 1min |
android.io | IoThread.getHandler | I/O线程 | 1min |
android.display | DisplayThread.getHandler | display线程 | 1min |
ActivityManager | AMS.MainHandler | AMS线程 | 1min |
PowerManagerService | PMS.PowerManagerHandler | PMS线程 | 1min |
PackageManager | PKMS.PackageHandler | PKMS线程 | 10min |
目前watchdog会监控system_server进程中的以上8个线程:
- 前7个线程的Looper消息处理时间不得超过1分钟;
- PackageManager线程的处理时间不得超过10分钟;
监控同步锁
能够被Watchdog监控的系统服务都实现了Watchdog.Monitor接口,并实现其中的monitor()方法。运行在android.fg
线程,
系统中实现该接口类主要有:
- ActivityManagerService
- WindowManagerService
- InputManagerService
- PowerManagerService
- NetworkManagementService
- MountService
- NativeDaemonConnector
- BinderThreadMonitor
- MediaProjectionManagerService
- MediaRouterService
- MediaSessionService
- BinderThreadMonitor
输出信息
watchdog在check过程中出现阻塞1分钟的情况,则会输出:
- AMS.dumpStackTraces:输出system_server和3个native进程的traces
- 该方法会输出两次,第一次在超时30s的地方;第二次在超时1min;
- WD.dumpKernelStackTraces,输出system_server进程中所有线程的kernel stack;
- 节点/proc/%d/task获取进程内所有的线程列表
- 节点/proc/%d/stack获取kernel的栈
- doSysRq, 触发kernel来dump所有阻塞线程,输出所有CPU的backtrace到kernel log;
- 节点/proc/sysrq-trigger
- dropBox,输出文件到/data/system/dropbox,内容是trace + blocked信息
- 杀掉system_server,进而触发zygote进程自杀,从而重启上层framework。