softirq原理以及源碼分析－成功运行的部落格

Linux 的softirq機制是與SMP緊密不可分的。為此，整個softirq機制的設計與實現中自始至終都貫徹了一個思想：“誰觸發，誰執行”（Who marks，Who runs），也即觸發軟中斷的那個CPU負責執行它所觸發的軟中斷，而且每個CPU都有它自己的軟中斷觸發與控制機制。這個設計思想也使得softirq 機制充分利用了SMP系統的性能和特點。多個softirq可以並行執行，甚至同一個softirq可以在多個processor上同時執行。

一、softirq的實現
     每個softirq在內核中通過struct softirq_action來表示，另外，通過全局數組softirq_vec標識當前內核支持的所有的softirq。
/* softirq mask and active fields moved to irq_cpustat_t in
* asm/hardirq.h to get better cache usage. KAO
*/

/*
 * One softirq handler slot. It carries nothing but the callback that is
 * invoked when the corresponding softirq is run.
 */
struct softirq_action
{
    /* Handler; called with a pointer to its own softirq_action entry. */
    void    (*action)(struct softirq_action *);
};

/* Table of handler slots, NR_SOFTIRQS entries, indexed by softirq number. */
static struct softirq_action softirq_vec[NR_SOFTIRQS] __cacheline_aligned_in_smp;
Linux內核最多可以支持32個softirq（思考：為什麽是32個？），但當前只實現了10個，如下：
/*
 * Softirq numbers. Each value is used as the index of the softirq's slot
 * in softirq_vec[]; __do_softirq() walks the pending mask from bit 0
 * upward, so a smaller number is serviced earlier within one pass.
 */
enum {
    HI_SOFTIRQ = 0,         /* 0 */
    TIMER_SOFTIRQ,          /* 1 */
    NET_TX_SOFTIRQ,         /* 2 */
    NET_RX_SOFTIRQ,         /* 3 */
    BLOCK_SOFTIRQ,          /* 4 */
    BLOCK_IOPOLL_SOFTIRQ,   /* 5 */
    TASKLET_SOFTIRQ,        /* 6 */
    SCHED_SOFTIRQ,          /* 7 */
    HRTIMER_SOFTIRQ,        /* 8 */
    RCU_SOFTIRQ,            /* 9 - preferably RCU stays the last softirq */

    NR_SOFTIRQS             /* number of softirqs defined above (10) */
};
二、softirq處理函數

    struct softirq_action結構體中，只有一個函數指針成員action，即指向用戶定義的softirq處理函數。當執行時，可以通過如下代碼：
                     softirq_vec[i].action(&softirq_vec[i]);
    一個註冊的softirq在執行之前必須被激活，術語稱為"raise the softirq"。被激活的softirq通常並不會立即執行，一般會在之後的某個時刻檢查當前系統中是否有被pending的softirq，如果有就去執行，Linux內核中檢查是否有softirq掛起的檢查點主要有以下三類：
（1）硬件中斷代碼返回的時候
/*
 * Exit an interrupt context. Process softirqs if needed and possible:
 * after IRQ_EXIT_OFFSET has been subtracted from preempt_count, pending
 * softirqs are run only when in_interrupt() is no longer true, i.e. we
 * are not nested inside another hardirq, softirq (or BH-disabled
 * section) or NMI.
 */
void irq_exit(void)
{
    account_system_vtime(current);
    trace_hardirq_exit();
    /* presumably undoes the hardirq count taken on entry - TODO confirm */
    sub_preempt_count(IRQ_EXIT_OFFSET);
    /* Softirq checkpoint: outermost interrupt exit with work pending. */
    if (!in_interrupt() && local_softirq_pending())
        invoke_softirq();

    rcu_irq_exit();
#ifdef CONFIG_NO_HZ
    /* Make sure that timer wheel updates are propagated */
    if (idle_cpu(smp_processor_id()) && !in_interrupt() && !need_resched())
        tick_nohz_stop_sched_tick(0);
#endif
    preempt_enable_no_resched();
}
（2）ksoftirqd內核服務線程運行的時候
/*
 * Excerpt of the ksoftirqd thread function (the lines marked "... ..."
 * are elided in this listing). __bind_cpu carries the CPU number the
 * thread is bound to, passed as a pointer-sized integer.
 *
 * The visible part loops while softirqs are pending on the local CPU:
 * it bails out if the bound CPU has gone offline, otherwise runs
 * do_softirq() and then offers the scheduler a chance to run
 * (cond_resched()) before checking again.
 */
static int run_ksoftirqd(void * __bind_cpu)
{
    ... ...
        while (local_softirq_pending()) {
            /*
             * Preempt disable stops cpu going offline.
             * If already offline, we'll be on wrong CPU:
             * don't process
             */
            if (cpu_is_offline((long)__bind_cpu))
                goto wait_to_die;
            do_softirq();
            /* Open a preemption window between batches of softirq work. */
            preempt_enable_no_resched();
            cond_resched();
            preempt_disable();
            rcu_note_context_switch((long)__bind_cpu);
        }
        preempt_enable();
        /* Nothing pending any more: mark the thread as sleeping. */
        set_current_state(TASK_INTERRUPTIBLE);
    }
    __set_current_state(TASK_RUNNING);
    return 0;
... ...
}
（3）在一些內核子系統中顯式地去檢查掛起的softirq
/*
 * netif_rx_ni - pass an skb to netif_rx() and immediately run any
 * softirqs that are pending afterwards.
 *
 * @skb: buffer handed on to netif_rx()
 *
 * Preemption is disabled around the call and the pending check.
 * Returns whatever netif_rx() returned.
 */
int netif_rx_ni(struct sk_buff *skb)
{
    int ret;

    preempt_disable();

    ret = netif_rx(skb);
    /* Explicit softirq checkpoint: do not wait for the next irq exit. */
    if (local_softirq_pending()) {
        do_softirq();
    }

    preempt_enable();
    return ret;
}
下面重點分析一下do_softirq()，了解Linux內核到底是怎麽來處理softirq的。
/*
 * do_softirq - entry point for running pending softirqs.
 *
 * Returns at once when called from interrupt context (in_interrupt()).
 * Otherwise, with local interrupts saved and disabled, it checks the
 * pending mask and, if anything is pending, runs __do_softirq() on the
 * per-CPU softirq stack (softirq_ctx) instead of the current stack.
 */
asmlinkage void do_softirq(void)
{
    unsigned long flags;
    struct thread_info *curctx;
    union irq_ctx *irqctx;
    u32 *isp;

    /*
     * in_interrupt() deserves careful study: it is non-zero whenever the
     * hardirq count, softirq count or NMI bit of preempt_count is set.
     */
    if (in_interrupt())
        return;

    local_irq_save(flags);

    if (local_softirq_pending()) {
        curctx = current_thread_info();
        irqctx = __get_cpu_var(softirq_ctx);
        /* Record the interrupted task and its stack pointer in the new context. */
        irqctx->tinfo.task = curctx->task;
        irqctx->tinfo.previous_esp = current_stack_pointer;

        /* build the stack frame on the softirq stack */
        /* isp points just past the end of the irq_ctx area. */
        isp = (u32 *) ((char *)irqctx + sizeof(*irqctx));

        call_on_stack(__do_softirq, isp);
        /*
         * Shouldn't happen, we returned above if in_interrupt():
         */
        WARN_ON_ONCE(softirq_count());
    }

    local_irq_restore(flags);
}
do_softirq主要是完成了以下幾個功能：
（1）檢查當前processor上是否有pending的softirq
（2）如果有pending的softirq，為softirq的處理建立新的堆棧，即建立新的軟中斷上下文環境
（3）處理軟中斷__do_softirq
這裏需要重點分析一下in_interrupt（）函數的含義。在linux內核中，為了方便判斷當前執行路徑在哪個上下文環境中，定義了幾個接口：
/*
 * Extract individual fields of preempt_count: the hardirq nesting count,
 * the softirq count, and the union of hardirq, softirq and NMI bits.
 */
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)
#define softirq_count() (preempt_count() & SOFTIRQ_MASK)
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
     | NMI_MASK))
/*
* Are we doing bottom half or hardware interrupt processing?
* Are we in a softirq context? Interrupt context?
*
* Each macro yields the masked bits themselves, so callers should treat
* the result as zero / non-zero rather than as 0 / 1.
*/
#define in_irq() (hardirq_count())
#define in_softirq() (softirq_count())
#define in_interrupt() (irq_count())
/*
* Are we in NMI context?
*/
#define in_nmi() (preempt_count() & NMI_MASK)
從註釋可以看出包括：硬件中斷上下文，軟件中斷上下文，不可屏蔽中斷（NMI）上下文等。在這些宏中，都涉及到了preempt_count()這個宏，這個宏是一個比較重要的宏，在Linux源碼中對其做了詳細的註釋：
/*
* We put the hardirq and softirq counter into the preemption
* counter. The bitmask has the following meaning:
*
* - bits 0-7 are the preemption count (max preemption depth: 256)
* - bits 8-15 are the softirq count (max # of softirqs: 256)
*
* The hardirq count can in theory reach the same as NR_IRQS.
* In reality, the number of nested IRQS is limited to the stack
* size as well. For archs with over 1000 IRQS it is not practical
* to expect that they will all nest. We give a max of 10 bits for
* hardirq nesting. An arch may choose to give less than 10 bits.
* m68k expects it to be 8.
*
* - bits 16-25 are the hardirq count (max # of nested hardirqs: 1024)
* - bit 26 is the NMI_MASK
* - bit 28 is the PREEMPT_ACTIVE flag
*
* PREEMPT_MASK: 0x000000ff
* SOFTIRQ_MASK: 0x0000ff00
* HARDIRQ_MASK: 0x03ff0000
* NMI_MASK: 0x04000000
*/
從註釋可以看出，preempt_count各個bit位的含義：
（1）bit0~7位表示搶占計數，即支持最大的搶占深度為256
（2）bit8~15位表示軟中斷計數，即支持最大的軟中斷的個數為256，需要註意的是，由於軟中斷還受制於pending狀態，一個32位的變量，因此實際最大只能支持32個軟中斷。
（3）bit16~25位表示硬件中斷嵌套層數，即最大可支持的嵌套層次為1024，實際情況下這是不可能的，因為中斷的嵌套層數還受制於中斷處理的棧空間的大小。
    介紹了這麽多，現在來重點分析下上面提到的in_interrupt到底表示什麽意思？
/* Non-zero when running in hardirq, softirq (or BH-disabled) or NMI context. */
#define in_interrupt() (irq_count())

/*
 * The hardirq, softirq and NMI fields of preempt_count, masked out together.
 * The continuation line must directly follow the backslash: a blank line in
 * between would end the macro early.
 */
#define irq_count() (preempt_count() & (HARDIRQ_MASK | SOFTIRQ_MASK \
     | NMI_MASK))
從其宏定義可以看出，in_interrupt宏的值是preempt_count中硬件中斷嵌套計數、軟中斷計數以及不可屏蔽中斷（NMI）標誌這三個位域經掩碼取出後的結果（按位與，而不是算術相加）。回到do_softirq的代碼中，如果in_interrupt的值非0，就不會處理軟中斷，意思是當處於硬件中斷嵌套、其他軟中斷處理過程以及不可屏蔽中斷的情況下，不會去處理軟中斷。對於中斷的嵌套層數以及不可屏蔽中斷是比較好理解的，對於軟中斷，應該去分析一下，在什麽地方軟中斷的計數會增加：
__local_bh_disable((unsigned long)__builtin_return_address(0));
/*
 * __local_bh_disable - mark the current context as "bottom halves disabled".
 *
 * Adds SOFTIRQ_OFFSET to preempt_count. Assuming SOFTIRQ_OFFSET lies inside
 * SOFTIRQ_MASK (TODO confirm), softirq_count() and therefore in_interrupt()
 * become non-zero until the matching enable.
 *
 * @ip: caller address; not used by this variant of the function.
 */
static inline void __local_bh_disable(unsigned long ip)
{
    add_preempt_count(SOFTIRQ_OFFSET);
    /* presumably a compiler barrier so the count update is not reordered - confirm */
    barrier();
}
/* Plain, non-atomic add to preempt_count, wrapped as a single statement. */
# define add_preempt_count(val)    do { preempt_count() += (val); } while (0)
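從代碼可以看出，禁止中斷下半部分的函數會增加軟中斷的計數，即當有軟中斷的do_softirq在進行處理時，如果此時被硬件中斷打斷，而且在硬件中斷中又激活了優先級更高的軟中斷，當硬件中斷退出時，那麽當再去執行do_softirq時，此時in_interrupt > 0，豈不是死鎖了！！！希望大家指教。其實這裏並不會死鎖：in_interrupt()非0只是讓irq_exit()跳過invoke_softirq()（do_softirq()也會直接返回），新激活的softirq對應的pending位仍然保留著；被打斷的__do_softirq在處理完本輪softirq之後會關中斷並重新讀取local_softirq_pending()，如果發現又有pending的softirq就goto restart繼續處理，重試次數用完（MAX_SOFTIRQ_RESTART）之後仍有pending時則喚醒ksoftirqd去處理。也就是說，軟中斷計數的作用是防止同一個CPU上softirq處理的嵌套（重入），而不是丟棄新觸發的softirq。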
實際的處理函數為__do_softirq：
/*
 * __do_softirq - run the softirqs pending on the current CPU.
 *
 * Takes a snapshot of the pending mask, clears the mask, and then walks
 * the snapshot from bit 0 upward with interrupts enabled, calling the
 * handler in softirq_vec[] for every set bit. Softirqs raised while the
 * handlers run are picked up by re-reading the mask afterwards; after
 * MAX_SOFTIRQ_RESTART passes whatever is still pending is handed over to
 * ksoftirqd through wakeup_softirqd().
 *
 * The softirq count in preempt_count is raised for the whole duration
 * (__local_bh_disable), so in_interrupt() is true here and a nested
 * do_softirq() on the same CPU returns immediately.
 */
asmlinkage void __do_softirq(void)
{
    struct softirq_action *h;
    __u32 pending;
    /* Max number of passes before deferring to ksoftirqd (empirical value). */
    int max_restart = MAX_SOFTIRQ_RESTART;
    int cpu;

    /*
     * Fetch the softirqs currently pending. This is also why at most 32
     * softirqs can exist: the pending mask is only 32 bits wide.
     */
    pending = local_softirq_pending();
    account_system_vtime(current);

    __local_bh_disable((unsigned long)__builtin_return_address(0));
    lockdep_softirq_enter();

    cpu = smp_processor_id();
restart:
    /*
     * Reset the pending bitmask before enabling irqs: the pending set has
     * already been captured in 'pending', so clear all pending flags.
     */
    set_softirq_pending(0);

    local_irq_enable();

    h = softirq_vec;

    do {
        /* Handle pending softirqs one bit at a time, lowest bit first. */
        if (pending & 1) {
            int prev_count = preempt_count();
            kstat_incr_softirqs_this_cpu(h - softirq_vec);

            trace_softirq_entry(h, softirq_vec);
            h->action(h); /* run the softirq handler */
            trace_softirq_exit(h, softirq_vec);
            /* A handler must leave preempt_count as it found it. */
            if (unlikely(prev_count != preempt_count())) {
                printk(KERN_ERR "huh, entered softirq %td %s %p"
                       " with preempt_count %08x,"
                       " exited with %08x?\n", h - softirq_vec,
                       softirq_to_name[h - softirq_vec],
                       h->action, prev_count, preempt_count());
                preempt_count() = prev_count;
            }

            rcu_bh_qs(cpu);
        }
        h++;
        pending >>= 1; /* shift right to the next softirq bit */
    } while (pending);

    local_irq_disable();

    /* Anything raised while the handlers were running? */
    pending = local_softirq_pending();
    if (pending && --max_restart) /* restart budget not yet exhausted */
        goto restart;

    /*
     * Still pending after the last allowed pass: softirqs are being raised
     * faster than they are handled here, so let ksoftirqd take over.
     */
    if (pending)
        wakeup_softirqd();

    lockdep_softirq_exit();

    account_system_vtime(current);
    _local_bh_enable();
}
三、使用softirq
     softirq一般用在對實時性要求比較強的地方，當前的Linux內核中，只有兩個子系統直接使用了softirq:網絡子系統和塊設備子系統。另外，增加新的softirq需要重新編譯內核，因此，除非必須需要，最好考慮tasklet和kernel timer是否適合當前需要。
     如果必須需要使用softirq，那麽需要考慮的一個重要的問題就是新增加的softirq的優先級，默認情況下，softirq的數值越小優先級越高，根據實際經驗，新增加的softirq最好在BLOCK_SOFTIRQ和TASKLET_SOFTIRQ之間。
     softirq的處理函數通過open_softirq進行註冊，此函數接收兩個參數，一個是softirq的整數索引，另一個是該softirq對應的處理函數。例如在網絡子系統中，註冊了如下兩個softirq及其處理函數：
open_softirq(NET_TX_SOFTIRQ, net_tx_action);
open_softirq(NET_RX_SOFTIRQ, net_rx_action);
    前面提到，軟中斷處理函數註冊後，還需要將該軟中斷激活，此軟中斷才能被執行，激活操作是通過raise_softirq函數來實現，在網絡子系統中激活代碼如下：
/*
 * Queue @napi at the tail of @sd's poll list and raise NET_RX_SOFTIRQ.
 *
 * Called with irq disabled, which is why the __raise_softirq_irqoff()
 * variant is used here.
 */
static inline void ____napi_schedule(struct softnet_data *sd,
                 struct napi_struct *napi)
{
    list_add_tail(&napi->poll_list, &sd->poll_list);
    __raise_softirq_irqoff(NET_RX_SOFTIRQ);
}
這裏的__raise_softirq_irqoff和raise_softirq的區別是，前者在事先已經關中斷的情況下可以被使用，後者自己完成中斷的關閉和恢復。