开发了一个内核ko模块async_memory_reclaim_for_cold_file_area（内核版本5.14.0-284.11.1），使用kprobe技术捕捉内核copy_page_to_iter()函数，在里边执行自定义的hot_file_update_file_status()函数，统计文件页page的访问信息，源码简要如下：
static void kprobe_handler_post(struct kprobe *p, struct pt_regs *regs, unsigned long flags)
{
    struct page *page = (struct page *)(regs->di);

    if (page) {
        hot_file_update_file_status(page);
    }
}

static struct kprobe kp_read_cache_func = {
    //buffer io read读取文件页page数据执行到 copy_page_to_iter()
    .symbol_name = "copy_page_to_iter",
};

kp_read_cache_func.post_handler = kprobe_handler_post;
//注册kprobe函数
ret = register_kprobe(&kp_read_cache_func);
但是偶发触发内核异常告警如下
[ 2606.173350] WARNING: CPU: 3 PID: 3347 at arch/x86/kernel/fpu/core.c:60 irq_fpu_usable+0x39/0x50
[ 2606.173420] fuse [last unloaded: async_memory_reclaim_for_cold_file_area_debug]
[ 2606.173423] CPU: 3 PID: 3347 Comm: bash Kdump: loaded Tainted: G OE -------- h--- 5.14.0-284.11.1.el9_2.x86_64 #1
[ 2606.173425] Hardware name: VMware, Inc. VMware Virtual Platform/440BX Desktop Reference Platform, BIOS 6.00 02/27/2020
[ 2606.173426] RIP: 0010:irq_fpu_usable+0x39/0x50
[ 2606.173432] RSP: 0018:ffffb6fe009076e8 EFLAGS: 00010006
[ 2606.173434] RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
[ 2606.173435] RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
[ 2606.173435] RBP: ffffffffa30c6770 R08: 0000000000000004 R09: 0000000000000004
[ 2606.173436] R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
[ 2606.173437] R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
[ 2606.173438] FS: 00007f6c674d4740(0000) GS:ffff971e39ec0000(0000) knlGS:0000000000000000
[ 2606.173461] CS: 0010 DS: 0000 ES: 0000 CR0: 0000000080050033
[ 2606.173462] CR2: 0000559a22f5dad0 CR3: 000000000c066004 CR4: 00000000003706e0
[ 2606.173475] blake2s_compress+0x20/0xa0
[ 2606.173480] blake2s_final+0x41/0x80
[ 2606.173483] extract_entropy.constprop.0+0x94/0x250
[ 2606.173493] crng_make_state+0x129/0x180
[ 2606.173495] _get_random_bytes.part.0+0x4c/0x190
[ 2606.173497] ? __alloc_pages+0xe6/0x230
[ 2606.173499] get_random_u32+0x59/0x90
[ 2606.173501] allocate_slab+0x14a/0x460
[ 2606.173505] ___slab_alloc+0x44f/0x570
[ 2606.173509] ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
[ 2606.173513] ? find_busiest_group+0x65/0x190
[ 2606.173516] ? hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
[ 2606.173519] kmem_cache_alloc+0x294/0x300
[ 2606.173522] hot_file_update_file_status+0x164/0xa60 [async_memory_reclaim_for_cold_file_area]
[ 2606.173526] ? 0xffffffffc0572011
[ 2606.173528] kprobe_post_process+0x25/0x70
[ 2606.173531] ? 0xffffffffc0572012
[ 2606.173532] kprobe_int3_handler+0x154/0x180
[ 2606.173534] do_int3+0x3b/0x80
[ 2606.173536] exc_int3+0x81/0xc0
[ 2606.173546] RSP: 0018:ffffb6fe00907bf0 EFLAGS: 00000246
[ 2606.173548] RBP: 0000000000000000 R08: 0000000000000402 R09: ffff971e10b97500
[ 2606.173549] R10: ffffb6fe00907d98 R11: ffffffffffffffff R12: ffffb6fe00907d38
[ 2606.173550] R13: ffffb6fe00907d60 R14: 0000000000000100 R15: 0000000000000000
[ 2606.173553] ? 0xffffffffc0572012
[ 2606.173554] filemap_read+0x18a/0x320
...........................
显然是irq_fpu_usable()函数里触发的问题。为了查清楚根因，将/proc/sys/kernel/panic_on_warn置1。下次触发这个内核告警后，触发了内核crash并保存了vmcore。通过vmcore看到出问题时的完整函数流程：
crash> bt
PID: 33 TASK: ffff971d01358000 CPU: 3 COMMAND: bash
 #0 [ffffb6fe00907400] machine_kexec at ffffffffa126c237
 #1 [ffffb6fe00907458] __crash_kexec at ffffffffa13c3c9a
 #2 [ffffb6fe00907518] panic at ffffffffa1cd7dc6
 #3 [ffffb6fe00907598] __warn.cold at ffffffffa1cd7feb
 #4 [ffffb6fe009075d0] report_bug at ffffffffa17ac20e
 #5 [ffffb6fe00907608] handle_bug at ffffffffa1d254bc
 #6 [ffffb6fe00907618] exc_invalid_op at ffffffffa1d25684
 #7 [ffffb6fe00907630] asm_exc_invalid_op at ffffffffa1e00af6
    [exception RIP: irq_fpu_usable+57]
    RIP: ffffffffa1232d29 RSP: ffffb6fe009076e8 RFLAGS: 00010006
    RAX: ffffffffa30c6794 RBX: ffffffffa30c6740 RCX: 0000000000000024
    RDX: 0000000080110002 RSI: ffffffffa30c6770 RDI: ffffffffa30c6740
    RBP: ffffffffa30c6770 R8: 0000000000000004 R9: 0000000000000004
    R10: ffffffffa30c6794 R11: 00000000000319c0 R12: 0000000000000001
    R13: ffffffffa30c6740 R14: 0000000000000024 R15: fffffc6140046780
    ORIG_RAX: ffffffffffffffff CS: 0010 SS: 0018
 #8 [ffffb6fe009076e8] blake2s_compress at ffffffffa12e5ae0
 #9 [ffffb6fe00907718] blake2s_final at ffffffffa176e391
#10 [ffffb6fe00907738] extract_entropy.constprop.0 at ffffffffa18e2904
#11 [ffffb6fe009077d8] crng_reseed.constprop.0 at ffffffffa18e2b25
#12 [ffffb6fe00907818] crng_make_state at ffffffffa18e2dd9
#13 [ffffb6fe00907848] _get_random_bytes at ffffffffa18e2e7c
#14 [ffffb6fe009078f8] get_random_u32 at ffffffffa18e3129
#15 [ffffb6fe00907918] allocate_slab at ffffffffa1577bea
#16 [ffffb6fe00907960] ___slab_alloc at ffffffffa157b17f
#17 [ffffb6fe00907a00] kmem_cache_alloc at ffffffffa157d194
#18 [ffffb6fe00907a48] hot_file_update_file_status at ffffffffc0dd9d24 [async_memory_reclaim_for_cold_file_area]
#19 [ffffb6fe00907aa8] kprobe_post_process at ffffffffa126ec65
#20 [ffffb6fe00907ab8] kprobe_int3_handler at ffffffffa126ef44
#21 [ffffb6fe00907ae8] do_int3 at ffffffffa1225b2b
#22 [ffffb6fe00907af8] exc_int3 at ffffffffa1d25f51
#23 [ffffb6fe00907b10] asm_exc_int3 at ffffffffa1e00b35
#24 [ffffb6fe00907b98] copy_page_to_iter at ffffffffa1764d32
#25 [ffffb6fe00907bf8] filemap_read at ffffffffa14d7b9a
..................
触发问题的函数流程是：kprobe捕捉内核函数copy_page_to_iter，在读文件执行到该函数时产生int3中断，在中断服务函数里最后执行自定义的hot_file_update_file_status()函数，在该函数里执行kmem_cache_alloc()分配一个自定义slab数据结构。而在分配slab时执行allocate_slab->shuffle_freelist->get_random_int分配一个随机数，源码如下：
static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
    void *start;
    void *cur;
    void *next;
    unsigned long idx, pos, page_limit, freelist_count;

    if (page->objects < 2 || !s->random_seq)
        return false;

    freelist_count = oo_objects(s->oo);
    //分配随机数，这里触发内核告警
    pos = get_random_int() % freelist_count;
    ............
    next = next_freelist_entry(s, page, &pos, start, page_limit, freelist_count);
    set_freepointer(s, cur, next);
    ............
}
为什么分配slab时要分配随机数呢？查资料说是为了使分配的slab obj地址随机，不容易被攻击，是出于安全考虑。
把这个过程的函数流程再整理一下：hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable。再看下最后几个关键函数（从blake2s_compress开始）的调用：
void blake2s_compress(struct blake2s_state *state, const u8 *block, size_t nblocks, const u32 inc){ kernel_fpu_begin();.................. kernel_fpu_end();}static inline void kernel_fpu_begin(void){ kernel_fpu_begin_mask(KFPU_MXCSR);}void kernel_fpu_begin_mask(unsigned int kfpu_mask){ preempt_disable(); //这里触发内核告警 WARN_ON_FPU(!irq_fpu_usable()); WARN_ON_FPU(this_cpu_read(in_kernel_fpu)); this_cpu_write(in_kernel_fpu, true); .................. }
是irq_fpu_usable()函数返回false触发的crash，看下这个函数的源码：
bool irq_fpu_usable(void){ if (WARN_ON_ONCE(in_nmi())) return false; if (this_cpu_read(in_kernel_fpu)) return false; if (!in_hardirq()) return true; return !softirq_count();}
显然，in_nmi()非0、this_cpu_read(in_kernel_fpu)为true、（处于硬中断时）softirq_count()大于0，都可能导致该函数返回false，得一一排查。
crash时的cpu是cpu3，很快查到cpu3的in_kernel_fpu这个per cpu变量是0：
crash> in_kernel_fpu
PER-CPU DATA TYPE:
  bool in_kernel_fpu;
PER-CPU ADDRESSES:
  [0]: ffff971e39e18ed8
  [1]: ffff971e39e58ed8
  [2]: ffff971e39e98ed8
  //cpu3的in_kernel_fpu 变量地址
  [3]: ffff971e39ed8ed8
crash> rd 0xffff971e39ed8ed8
ffff971e39ed8ed8: 0000000000000000 ........
in_nmi() 和 softirq_count() 得查看cpu3的preempt_count变量，其值是0x80110003，如下：
crash> __preempt_count
PER-CPU DATA TYPE:
  int __preempt_count;
PER-CPU ADDRESSES:
  [0]: ffff971e39e18f00
  [1]: ffff971e39e58f00
  [2]: ffff971e39e98f00
  //cpu3的preempt_count变量地址
  [3]: ffff971e39ed8f00
crash> rd -x ffff971e39ed8f00
ffff971e39ed8f00: 0000000080110003
然后查看in_nmi()和softirq_count()的定义，它们全都来自preempt_count()函数。
#define PREEMPT_NEED_RESCHED 0x80000000
static __always_inline int preempt_count(void)
{
    return raw_cpu_read_4(__preempt_count) & ~PREEMPT_NEED_RESCHED;
}

#define PREEMPT_BITS 8
#define PREEMPT_SHIFT 0
#define SOFTIRQ_BITS 8
#define SOFTIRQ_SHIFT (PREEMPT_SHIFT + PREEMPT_BITS) //8
#define __IRQ_MASK(x) ((1UL << (x))-1)

#define HARDIRQ_SHIFT (SOFTIRQ_SHIFT + SOFTIRQ_BITS) //16
#define HARDIRQ_BITS 4
// ((1 << 4) - 1) << 16 = 0xF << 16
#define HARDIRQ_MASK (__IRQ_MASK(HARDIRQ_BITS) << HARDIRQ_SHIFT)
//preempt_count 的 bit16~bit19
#define hardirq_count() (preempt_count() & HARDIRQ_MASK)

// ((1 << 8) - 1) << 8 = 0xFF << 8
#define SOFTIRQ_MASK (__IRQ_MASK(SOFTIRQ_BITS) << SOFTIRQ_SHIFT)
//preempt_count 的 bit8~bit15
# define softirq_count() (preempt_count() & SOFTIRQ_MASK)

#define NMI_SHIFT (HARDIRQ_SHIFT + HARDIRQ_BITS) //20
#define NMI_BITS 4
// ((1 << 4) - 1) << 20 = 0xF << 20
#define NMI_MASK (__IRQ_MASK(NMI_BITS) << NMI_SHIFT)
//preempt_count 的 bit20~bit23
#define nmi_count() (preempt_count() & NMI_MASK)
#define in_nmi() (nmi_count())
cpu3的__preempt_count原始值是 0x80110003（preempt_count()会屏蔽掉bit31的PREEMPT_NEED_RESCHED标志）：bit8~bit15 是0，bit16~bit19是1，bit20~bit23是1。即 in_nmi() 非0，in_hardirq() 非0，softirq_count()是0。显然问题根源竟然是 in_nmi() 非0导致的。这个就是NMI计数非0，说明当前处于NMI中断。这怎么可能呢？怎么会处于NMI中断呢？什么是NMI中断？这个一般是当发生hard lock、cpu硬件级别错误时才会发生的不可屏蔽中断。可是这个问题场景只是我利用kprobe捕捉内核函数，然后产生int3中断，在中断服务函数里最后执行自定义的hot_file_update_file_status()函数，在该函数里执行kmem_cache_alloc()分配一个slab数据结构而已，这个普通场景怎么会触发NMI中断呢？
这个虚拟机是rocky 9.2，之前就出现过莫名其妙的问题，怀疑内存数据有问题，难道这次又是内存数据有问题？当遇到莫名其妙的问题，就容易怀疑是内存有问题，导致数据跳变、内存越界等。可哪里会有这么多神奇的行为？还是先耐着性子看下源码吧。于是发现in_nmi()大于0是执行了 __nmi_enter() 导致的，如下：
#define __nmi_enter()                                       \
    do {                                                    \
        lockdep_off();                                      \
        arch_nmi_enter();                                   \
        BUG_ON(in_nmi() == NMI_MASK);                       \
        __preempt_count_add(NMI_OFFSET + HARDIRQ_OFFSET);   \
    } while (0)
再进一步发现，int3中断里竟然在irqentry_nmi_enter()函数里执行了__nmi_enter()，细节如下：
irqentry_state_t noinstr irqentry_nmi_enter(struct pt_regs *regs)
{
    irqentry_state_t irq_state;

    irq_state.lockdep = lockdep_hardirqs_enabled();

    __nmi_enter(); //令 in_nmi 加1
    lockdep_hardirqs_off(CALLER_ADDR0);
    lockdep_hardirq_enter();
    rcu_nmi_enter();

    instrumentation_begin();
    trace_hardirqs_off_finish();
    ftrace_nmi_enter();
    instrumentation_end();

    return irq_state;
}

DEFINE_IDTENTRY_RAW(exc_int3)
{
    if (poke_int3_handler(regs))
        return;

    if (user_mode(regs)) {
        ……………
    } else {
        //令 in_nmi 加1
        irqentry_state_t irq_state = irqentry_nmi_enter(regs);

        instrumentation_begin();
        //执行kprobe 的函数
        if (!do_int3(regs))
            die("int3", regs, 0);
        instrumentation_end();
        //令 in_nmi 减1
        irqentry_nmi_exit(regs, irq_state);
    }
}

static bool do_int3(struct pt_regs *regs)
{
    int res;

#ifdef CONFIG_KGDB_LOW_LEVEL_TRAP
    if (kgdb_ll_trap(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP) == NOTIFY_STOP)
        return true;
#endif /* CONFIG_KGDB_LOW_LEVEL_TRAP */

#ifdef CONFIG_KPROBES
    //执行kprobe 的函数
    if (kprobe_int3_handler(regs))
        return true;
#endif
    res = notify_die(DIE_INT3, "int3", regs, 0, X86_TRAP_BP, SIGTRAP);

    return res == NOTIFY_STOP;
}
简单说，从源码角度来说，每次kprobe触发的int3中断，在exc_int3()中执行kprobe的hot_file_update_file_status函数前，都会先执行 irqentry_nmi_enter->__nmi_enter 令NMI计数加1。这样看的话，每次执行 hot_file_update_file_status->kmem_cache_alloc 函数分配slab时都应该触发内核告警irq_fpu_usable。可实际测试中告警是随机出现的，这是怎么回事？
我认为 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int->get_random_u32->_get_random_bytes->crng_make_state->crng_reseed->drain_entropy->extract_entropy->blake2s_final->__blake2s_final->blake2s_compress->kernel_fpu_begin->irq_fpu_usable 这个流程并不是每次都执行到最后的 irq_fpu_usable，中间有很多条件判断（例如只有需要allocate_slab分配新的slab、并且随机数生成器恰好需要crng_reseed时，才会走到blake2s_compress）。但是一旦执行到irq_fpu_usable，必然会触发内核告警。
简单说，kprobe的函数里不能执行 kmem_cache_alloc() 分配slab内存。因为kprobe的int3中断会令NMI计数加1，然后执行kmem_cache_alloc()分配slab时有概率执行到irq_fpu_usable，因NMI计数大于0而触发内核告警。
怎么解决呢？
1：在执行 hot_file_update_file_status->kmem_cache_alloc 前，自己的源码里强制令nmi计数减1，但怕对系统会有不良影响。2：自己实现一套简易的slab源码，不执行 get_random_int() 函数就行。
但是这两个方案改动太大，研究源码后发现其实有个很简单的方法，不用动内核。
禁止掉 allocate_slab 的 random 功能，就不会执行 hot_file_update_file_status->kmem_cache_alloc->___slab_alloc->allocate_slab->shuffle_freelist->get_random_int 函数，具体改动如下：
struct hot_cold_file_global {
    struct kmem_cache *file_stat_cachep;
};
struct hot_cold_file_global hot_cold_file_global_info;
static void (*cache_random_seq_destroy)(struct kmem_cache *cachep);

//获取内核cache_random_seq_destroy函数指针
cache_random_seq_destroy = (void *)kallsyms_lookup_name_async("cache_random_seq_destroy");
//禁止分配 file_stat_cachep 这个slab时使用随机数功能，分配slab时就不会执行到shuffle_freelist->get_random_int函数
cache_random_seq_destroy(hot_cold_file_global_info.file_stat_cachep);
当执行cache_random_seq_destroy()函数后，就令slab的random_seq为NULL：
void cache_random_seq_destroy(struct kmem_cache *cachep)
{
    kfree(cachep->random_seq);
    cachep->random_seq = NULL;
}
然后将来分配slab执行到allocate_slab->shuffle_freelist时直接返回false，不会再执行到get_random_int()函数。
static bool shuffle_freelist(struct kmem_cache *s, struct page *page)
{
    void *start;
    void *cur;
    void *next;
    unsigned long idx, pos, page_limit, freelist_count;

    //直接return false
    if (page->objects < 2 || !s->random_seq)
        return false;

    freelist_count = oo_objects(s->oo);
    //分配随机数，这里触发crash
    pos = get_random_int() % freelist_count;
    ............
}