Linux : task work 机制

task work机制可以在内核中向指定的进程添加一些任务函数,这些任务函数会在进程返回用户态时执行,使用的是该进程的上下文。包括下面的这些API:

  • task_work_add
  • task_work_cancel
  • task_work_run

进程对象task_struct中有个字段task_works,用来存储这些待执行任务的链表头。该字段的类型是struct callback_head指针,这个结构体包含一个next指针和需要执行的函数指针。

205 /**
206  * struct callback_head - callback structure for use with RCU and task_work
207  * @next: next update requests in a list
208  * @func: actual update function to call after the grace period.
209  */
210 struct callback_head {
211         struct callback_head *next;
212         void (*func)(struct callback_head *head);
213 };
  4 
  5 static struct callback_head work_exited; /* all we need is ->next == NULL */
  6 
  7 /**
  8  * task_work_add - ask the @task to execute @work->func()
  9  * @task: the task which should run the callback
 10  * @work: the callback to run
 11  * @notify: send the notification if true
 12  *
 13  * Queue @work for task_work_run() below and notify the @task if @notify.
 14  * Fails if the @task is exiting/exited and thus it can't process this @work.
 15  * Otherwise @work->func() will be called when the @task returns from kernel
 16  * mode or exits.
 17  *
 18  * This is like the signal handler which runs in kernel mode, but it doesn't
 19  * try to wake up the @task.
 20  *
 21  * RETURNS:
 22  * 0 if succeeds or -ESRCH.
 23  */
 24 int
 25 task_work_add(struct task_struct *task, struct callback_head *work, bool notify)
 26 {
 27         struct callback_head *head;
 28 
 29         do {
 30                 head = ACCESS_ONCE(task->task_works);
 31                 if (unlikely(head == &work_exited))
 32                         return -ESRCH;
 33                 work->next = head;
 34         } while (cmpxchg(&task->task_works, head, work) != head);
 35 
 36         if (notify)
 37                 set_notify_resume(task);
 38         return 0;
 39 }

主要工作:

1. 通过CAS以无锁的形式添加了一个链表元素。(新元素排在原有链表头部)

2. 如果参数notify为真,则调用set_notify_resume函数,向指定的进程设置TIF_NOTIFY_RESUME标记。

task_work_run执行时机

在返回用户态之前,会对当前进程的标记进行检查,如果相关标记置位,则会调用do_notify_resume

595 int_signal:
596         testl $_TIF_DO_NOTIFY_MASK,%edx
597         jz 1f
598         movq %rsp,%rdi          # &ptregs -> arg1
599         xorl %esi,%esi          # oldset -> arg2
600         call do_notify_resume
601 1:      movl $_TIF_WORK_MASK,%edi
602 int_restore_rest:
603         RESTORE_REST
604         DISABLE_INTERRUPTS(CLBR_NONE)
605         TRACE_IRQS_OFF
606         jmp int_with_check
607         CFI_ENDPROC
608 END(system_call)

以上文件为entry_64.S,而标记定义在头文件thread_info.h中

130 /* work to do on interrupt/exception return */
131 #define _TIF_WORK_MASK                                                  \
132         (0x0000FFFF &                                                   \
133          ~(_TIF_SYSCALL_TRACE|_TIF_SYSCALL_AUDIT|                       \
134            _TIF_SINGLESTEP|_TIF_SECCOMP|_TIF_SYSCALL_EMU))
 70 #define TIF_SYSCALL_TRACE       0       /* syscall trace active */
 71 #define TIF_NOTIFY_RESUME       1       /* callback before returning to user */
 72 #define TIF_SIGPENDING          2       /* signal pending */
 73 #define TIF_NEED_RESCHED        3       /* rescheduling necessary */
 74 #define TIF_SINGLESTEP          4       /* reenable singlestep on user return*/
 75 #define TIF_SYSCALL_EMU         6       /* syscall emulation active */
 76 #define TIF_SYSCALL_AUDIT       7       /* syscall auditing active */
 77 #define TIF_SECCOMP             8       /* secure computing */
 78 #define TIF_MCE_NOTIFY          10      /* notify userspace of an MCE */
 79 #define TIF_USER_RETURN_NOTIFY  11      /* notify kernel of userspace return */
 80 #define TIF_UPROBE              12      /* breakpointed or singlestepping */
 81 #define TIF_NOTSC               16      /* TSC is not accessible in userland */
 82 #define TIF_IA32                17      /* IA32 compatibility process */
 83 #define TIF_FORK                18      /* ret_from_fork */
 84 #define TIF_NOHZ                19      /* in adaptive nohz mode */
 85 #define TIF_MEMDIE              20      /* is terminating due to OOM killer */
 86 #define TIF_POLLING_NRFLAG      21      /* idle is polling for TIF_NEED_RESCHED */
 87 #define TIF_IO_BITMAP           22      /* uses I/O bitmap */
 88 #define TIF_FORCED_TF           24      /* true if TF in eflags artificially */
 89 #define TIF_BLOCKSTEP           25      /* set when we want DEBUGCTLMSR_BTF */
 90 #define TIF_LAZY_MMU_UPDATES    27      /* task is updating the mmu lazily */
 91 #define TIF_SYSCALL_TRACEPOINT  28      /* syscall tracepoint instrumentation */
 92 #define TIF_ADDR32              29      /* 32-bit address space on 64 bits */
 93 #define TIF_X32                 30      /* 32-bit native x86-64 binary */
 94 

即_TIF_WORK_MASK表示低16位(0x0000FFFF)中除开(_TIF_SYSCALL_TRACE, _TIF_SYSCALL_AUDIT, _TIF_SINGLESTEP, _TIF_SECCOMP, _TIF_SYSCALL_EMU)之外的所有标记。TIF_NOTIFY_RESUME位于第1位,自然包括在内。另外注意,上面int_signal处实际检测的是_TIF_DO_NOTIFY_MASK,它同样包含_TIF_NOTIFY_RESUME标记。

do_notify_resume函数

729 /*
730  * notification of userspace execution resumption
731  * - triggered by the TIF_WORK_MASK flags
732  */
733 __visible void
734 do_notify_resume(struct pt_regs *regs, void *unused, __u32 thread_info_flags)
735 {
736         user_exit();
737 
738 #ifdef CONFIG_X86_MCE
739         /* notify userspace of pending MCEs */
740         if (thread_info_flags & _TIF_MCE_NOTIFY)
741                 mce_notify_process();
742 #endif /* CONFIG_X86_64 && CONFIG_X86_MCE */
743 
744         if (thread_info_flags & _TIF_UPROBE)
745                 uprobe_notify_resume(regs);
746 
747         /* deal with pending signal delivery */
748         if (thread_info_flags & _TIF_SIGPENDING)
749                 do_signal(regs);
750 
751         if (thread_info_flags & _TIF_NOTIFY_RESUME) {
752                 clear_thread_flag(TIF_NOTIFY_RESUME);
753                 tracehook_notify_resume(regs);
754         }
755         if (thread_info_flags & _TIF_USER_RETURN_NOTIFY)
756                 fire_user_return_notifiers();
757 
758         user_enter();
759 }

可以看到在其中调用tracehook_notify_resume函数,也包括其他一些如信号处理相关的函数。

tracehook_notify_resume

174 /**
175  * tracehook_notify_resume - report when about to return to user mode
176  * @regs:               user-mode registers of @current task
177  *
178  * This is called when %TIF_NOTIFY_RESUME has been set.  Now we are
179  * about to return to user mode, and the user state in @regs can be
180  * inspected or adjusted.  The caller in arch code has cleared
181  * %TIF_NOTIFY_RESUME before the call.  If the flag gets set again
182  * asynchronously, this will be called again before we return to
183  * user mode.
184  *
185  * Called without locks.
186  */
187 static inline void tracehook_notify_resume(struct pt_regs *regs)
188 {
189         /*
190          * The caller just cleared TIF_NOTIFY_RESUME. This barrier
191          * pairs with task_work_add()->set_notify_resume() after
192          * hlist_add_head(task->task_works);
193          */
194         smp_mb__after_atomic();
195         if (unlikely(current->task_works))
196                 task_work_run();
197 }

在进程对象的task_works不为null的情况下才有任务需要执行。

task_work_run

 77 /**
 78  * task_work_run - execute the works added by task_work_add()
 79  *
 80  * Flush the pending works. Should be used by the core kernel code.
 81  * Called before the task returns to the user-mode or stops, or when
 82  * it exits. In the latter case task_work_add() can no longer add the
 83  * new work after task_work_run() returns.
 84  */
 85 void task_work_run(void)
 86 {
 87         struct task_struct *task = current;
 88         struct callback_head *work, *head, *next;
 89 
 90         for (;;) {
 91                 /*
 92                  * work->func() can do task_work_add(), do not set
 93                  * work_exited unless the list is empty.
 94                  */
 95                 do {
 96                         work = ACCESS_ONCE(task->task_works);
 97                         head = !work && (task->flags & PF_EXITING) ?
 98                                 &work_exited : NULL;
 99                 } while (cmpxchg(&task->task_works, work, head) != work);
100 
101                 if (!work)
102                         break;
103                 /*
104                  * Synchronize with task_work_cancel(). It can't remove
105                  * the first entry == work, cmpxchg(task_works) should
106                  * fail, but it can play with *work and other entries.
107                  */
108                 raw_spin_unlock_wait(&task->pi_lock);
109                 smp_mb();
110 
111                 /* Reverse the list to run the works in fifo order */
112                 head = NULL;
113                 do {
114                         next = work->next;
115                         work->next = head;
116                         head = work;
117                         work = next;
118                 } while (work);
119 
120                 work = head;
121                 do {
122                         next = work->next;
123                         work->func(work);
124                         work = next;
125                         cond_resched();
126                 } while (work);
127         }
128 }

1. 通过CAS,以无锁的方式取得task_works链表

2. 因为原链表是按元素添加到链表的时间逆序排列的(见task_work_add),先把链表反转一遍

3. 反转链表后,遍历链表,执行各个元素的任务函数即work->func(work) 

郑重声明:本站内容如果来自互联网及其他传播媒体,其版权均属原媒体及文章作者所有。转载目的在于传递更多信息及用于网络分享,并不代表本站赞同其观点和对其真实性负责,也不构成任何其他建议。