主要参考:《情景分析》
这个函数的功能是很明显的,就是将用户空间中的内容拷贝到内核空间中:
/*
 * copy_from_user - size-checked entry point for copying from user space.
 *
 * The requested length is compared against the value reported by
 * __compiletime_object_size(to) before the work is handed to
 * _copy_from_user().  A reported size of -1 always passes the check
 * (presumably "size not known at compile time" - confirm against the
 * definition of __compiletime_object_size()).
 *
 * Returns the result of _copy_from_user().  When the size check fails,
 * _copy_from_user() is not called: copy_from_user_overflow() is invoked
 * and n is returned unchanged.
 */
static inline unsigned long __must_check copy_from_user(void *to,
					  const void __user *from,
					  unsigned long n)
{
	int dst_size = __compiletime_object_size(to);

	/* The destination object is known to be smaller than n bytes. */
	if (!likely(dst_size == -1 || dst_size >= n)) {
		copy_from_user_overflow();
		return n;
	}

	return _copy_from_user(to, from, n);
}
/**
 * _copy_from_user: - Copy a block of data from user space.
 * @to:   Destination address, in kernel space.
 * @from: Source address, in user space.
 * @n:    Number of bytes to copy.
 *
 * Context: User context only.  This function may sleep.
 *
 * The source range is first validated with access_ok(VERIFY_READ, ...).
 * If that check fails, no copy is attempted: the whole destination
 * buffer is filled with zero bytes and @n is returned unchanged.
 * Otherwise the copy is delegated to __copy_from_user().
 *
 * Returns number of bytes that could not be copied.
 * On success, this will be zero.
 */
unsigned long
_copy_from_user(void *to, const void __user *from, unsigned long n)
{
	/* Range check failed: hand back a fully zeroed buffer. */
	if (!access_ok(VERIFY_READ, from, n)) {
		memset(to, 0, n);
		return n;
	}

	return __copy_from_user(to, from, n);
}
access_ok用来检查from和n的合理性:
/**
* access_ok: - Checks if a user space pointer is valid
* @type: Type of access: %VERIFY_READ or %VERIFY_WRITE. Note that
* %VERIFY_WRITE is a superset of %VERIFY_READ - if it is safe
* to write to a block, it is always safe to read from it.
* @addr: User space pointer to start of block to check
* @size: Size of block to check
*
* Context: User context only. This function may sleep.
*
* Checks if a pointer to a block of memory in user space is valid.
*
* Returns true (nonzero) if the memory block may be valid, false (zero)
* if it is definitely invalid.
*
* Note that, depending on architecture, this function probably just
* checks that the pointer is in the user space range - after calling
* this function, memory access functions may still return -EFAULT.
*
* In this implementation @type is not referenced by the macro body; only
* @addr and @size are passed on to __range_not_ok(), and the result is
* true when __range_not_ok() evaluates to 0.
*/
#define access_ok(type, addr, size) (likely(__range_not_ok(addr, size) == 0))
/*
* Test whether a block of memory is a valid user space address.
* Returns 0 if the range is valid, nonzero otherwise.
*
* This is equivalent to the following test:
* (u33)addr + (u33)size > (u33)current->addr_limit.seg (u65 for x86_64)
*
* This needs 33-bit (65-bit for x86_64) arithmetic. We have a carry...
*
* How the asm evaluates the test (AT&T syntax; %0 = flag, %1 = roksum
* which starts out holding addr, %3 = size, %4 = addr_limit.seg):
*   add %3,%1  - roksum = addr + size; the carry flag is set if the
*                sum wraps around
*   sbb %0,%0  - flag = -carry, i.e. 0 or all ones
*   cmp %1,%4  - compare addr_limit.seg with roksum; the carry flag is
*                set when addr_limit.seg < roksum (unsigned)
*   sbb $0,%0  - flag -= carry
* flag therefore stays 0 only when the addition did not wrap and the
* end of the range does not exceed addr_limit.seg.
*/
#define __range_not_ok(addr, size) \
({ \
unsigned long flag, roksum; \
__chk_user_ptr(addr); \
asm("add %3,%1 ; sbb %0,%0 ; cmp %1,%4 ; sbb $0,%0" \
: "=&r" (flag), "=r" (roksum) \
: "1" (addr), "g" ((long)(size)), \
"rm" (current_thread_info()->addr_limit.seg)); \
flag; \
})
/**
* __copy_from_user: - Copy a block of data from user space, with less checking.
* @to: Destination address, in kernel space.
* @from: Source address, in user space.
* @n: Number of bytes to copy.
*
* Context: User context only. This function may sleep.
*
* Copy data from user space to kernel space. Caller must check
* the specified block with access_ok() before calling this function.
*
* Returns number of bytes that could not be copied.
* On success, this will be zero.
*
* If some data could not be copied, this function will pad the copied
* data to the requested size using zero bytes.
*
* An alternate version - __copy_from_user_inatomic() - may be called from
* atomic context and will fail rather than sleep. In this case the
* uncopied bytes will *NOT* be padded with zeros. See fs/filemap.h
* for explanation of why this is needed.
*/
static __always_inline unsigned long
__copy_from_user(void *to, const void __user *from, unsigned long n)
{
might_fault();
if (__builtin_constant_p(n)) { // fast path: when n is a compile-time constant equal to 1, 2 or 4, a single sized load through __get_user_size() is enough
unsigned long ret;
switch (n) {
case 1:
__get_user_size(*(u8 *)to, from, 1, ret, 1); // the last argument (errret) is the value stored in ret if the load faults: 1 byte not copied
return ret;
case 2:
__get_user_size(*(u16 *)to, from, 2, ret, 2);
return ret;
case 4:
__get_user_size(*(u32 *)to, from, 4, ret, 4);
return ret;
}
}
return __copy_from_user_ll(to, from, n); // every other case: n is not a compile-time constant, or is a constant other than 1, 2 or 4
}
/*
* __get_user_size - load one 1-, 2-, 4- or 8-byte value from a user pointer.
* @x:      lvalue that receives the loaded value
* @ptr:    user space source pointer
* @size:   access width in bytes; selects the switch case below
* @retval: lvalue set to 0 first; the fault path of __get_user_asm()
*          overwrites it with @errret
* @errret: value to report through @retval when the access faults
*
* Sizes 1, 2 and 4 expand to __get_user_asm() with the matching
* instruction suffix, register modifier and output constraint; size 8
* goes to __get_user_asm_u64().  Any other size assigns the result of
* __get_user_bad() to @x.
*/
#define __get_user_size(x, ptr, size, retval, errret) \
do { \
retval = 0; \
__chk_user_ptr(ptr); \
switch (size) { \
case 1: \
__get_user_asm(x, ptr, retval, "b", "b", "=q", errret); \
break; \
case 2: \
__get_user_asm(x, ptr, retval, "w", "w", "=r", errret); \
break; \
case 4: \
__get_user_asm(x, ptr, retval, "l", "k", "=r", errret); \
break; \
case 8: \
__get_user_asm_u64(x, ptr, retval, errret); \
break; \
default: \
(x) = __get_user_bad(); \
} \
} while (0)
/*
* __get_user_asm - a single user-space load with an exception fixup.
*
* Operands: %0 = err, %1 = x, %2 = the memory operand built from addr,
* %3 = errret (immediate).  err is also a tied input ("0"), so it keeps
* its incoming value when no fault occurs.
*
* Label 1 is the load itself ("mov" + itype).  _ASM_EXTABLE(1b, 3b)
* pairs that instruction with the fixup code at label 3 (presumably by
* emitting an __ex_table entry - confirm against its definition).  The
* fixup, placed in the .fixup section, stores errret in err, clears the
* destination register with xor and jumps back to label 2, the
* instruction following the load.
*/
#define __get_user_asm(x, addr, err, itype, rtype, ltype, errret) \
asm volatile("1: mov"itype" %2,%"rtype"1\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"3: mov %3,%0\n" \
" xor"itype" %"rtype"1,%"rtype"1\n" \
" jmp 2b\n" \
".previous\n" \
_ASM_EXTABLE(1b, 3b) \
: "=r" (err), ltype(x) \
: "m" (__m(addr)), "i" (errret), "0" (err))
当n不是编译期常量1、2、4时,则会调用__copy_from_user_ll(to, from, n)。这里以与之结构相同、只是出错时不对剩余部分清零的__copy_from_user_ll_nozero(to, from, n)为例:
/*
 * __copy_from_user_ll_nozero - pick a copy routine and run it.
 *
 * Returns n as updated by the chosen routine.  For the __copy_user()
 * path that is the number of bytes left uncopied; __copy_user_intel()
 * presumably follows the same convention - confirm against its
 * definition.
 */
unsigned long __copy_from_user_ll_nozero(void *to, const void __user *from,
					 unsigned long n)
{
	/*
	 * When movsl_is_ok() reports that a movsl-based copy is not
	 * suitable for this to/from/n combination, fall back to
	 * __copy_user_intel().
	 * NOTE(review): in the kernel sources movsl_is_ok() rejects large
	 * copies whose source and destination are misaligned relative to
	 * each other - confirm against arch/x86/lib/usercopy_32.c.
	 */
	if (!movsl_is_ok(to, from, n))
		return __copy_user_intel((void __user *)to,
					 (const void *)from, n);

	/* __copy_user() is a macro; it leaves the uncopied byte count in n. */
	__copy_user(to, from, n);
	return n;
}
/*
 * Generic arbitrary sized copy.
 *
 * Copies `size` bytes from `from` to `to` using rep movsb / rep movsl.
 * `size` is also an output: on exit it holds the number of bytes that
 * were NOT copied (0 when the whole copy succeeded).
 *
 * Operands: %0 = ecx (repeat count, bound to size), %1 = edi (to),
 * %2 = esi (from), %3 = scratch register initialised to size.
 *
 *   cmp $7,%0 ; jbe 1f  - 7 bytes or fewer: go straight to the byte
 *                         copy at label 1
 *   movl/negl/andl      - ecx = (-to) & 7, the number of bytes needed
 *                         to bring the destination up to an 8-byte
 *                         boundary
 *   subl %0,%3          - take those bytes off the remaining count
 *   4: rep movsb        - copy the leading, unaligned bytes
 *   movl/shrl/andl      - ecx = %3 / 4 (dwords), %3 = %3 % 4 (tail)
 *   0: rep movsl        - bulk copy, four bytes per iteration
 *   movl %3,%0          - ecx = tail byte count
 *   1: rep movsb        - copy the tail
 *
 * Each of the three rep instructions may fault and therefore has an
 * __ex_table entry; the fixup code recomputes the uncopied byte count
 * in ecx and resumes at label 2:
 *   4b -> 5: uncopied = ecx + %3
 *   0b -> 3: uncopied = %3 + 4 * ecx
 *   1b -> 2: ecx already is the uncopied byte count
 *
 * The explanatory notes live in this header rather than after the
 * line-continuation backslashes, where they would end the macro early.
 */
#define __copy_user(to, from, size) \
do { \
int __d0, __d1, __d2; \
__asm__ __volatile__( \
" cmp $7,%0\n" \
" jbe 1f\n" \
" movl %1,%0\n" \
" negl %0\n" \
" andl $7,%0\n" \
" subl %0,%3\n" \
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 2b\n" \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \
" .long 0b,3b\n" \
" .long 1b,2b\n" \
".previous" \
: "=&c"(size), "=&D" (__d0), "=&S" (__d1), "=r"(__d2) \
: "3"(size), "0"(size), "1"(to), "2"(from) \
: "memory"); \
} while (0)
在这个函数中,完成数据拷贝工作的主要是标号0开始到标号1结束的三行代码(此前标号4处的rep movsb只负责拷贝开头用于对齐的几个字节)。但是,为什么这段代码看起来这么复杂呢?特别是为什么多出来.fixup和__ex_table这两个段呢?
如果用户空间的指针(在copy_from_user中即from)出错(非法指针),movsb和movsl是会发生异常的。在老版本的内核中,为了尽量避免这种情况的发生,每次从用户空间读或者写的时候都会调用verify_area进行指针合法性的检验,但是这样明显“打击面”过大,会显著地影响效率,因为绝大部分的指针是不会发生错误的。因此,新版本的内核取消了这种每次都做的完整检验(只保留access_ok这样开销很小的地址范围检查),如果指针非法的话,就直接让页面异常发生,从而调用do_page_fault。
在do_page_fault中,截取如下代码:
if (unlikely(!down_read_trylock(&mm->mmap_sem))) {
if ((error_code & PF_USER) == 0 &&
!search_exception_tables(regs->ip)) {
bad_area_nosemaphore(regs, error_code, address);
return;
}
内核维护了一个“异常表”,当异常发生时,内核能够在该“异常表”中找到发生异常的指令的地址,并得到相应的“修复”地址fixup,然后将CPU在异常返回后将要执行的地址替换成这个“修复”地址。为什么这样做呢?因为在这种情况下内核不能为当前进程补上一个页面(那样的话,用户指针(例如《情景分析》原例中的name)所指的字符串就变成空白的了)。而如果任其自然的话,则从异常返回时,当前进程必然会不断地执行同一条指令而产生新的异常。
我们来看搜索异常表的过程:
/*
 * Look an instruction address up in the exception tables.
 *
 * The table delimited by __start___ex_table and __stop___ex_table is
 * searched first; if it has no entry for addr, the result of
 * search_module_extables() is returned instead.
 */
const struct exception_table_entry *search_exception_tables(unsigned long addr)
{
	const struct exception_table_entry *entry =
		search_extable(__start___ex_table, __stop___ex_table - 1, addr);

	return entry ? entry : search_module_extables(addr);
}
#endif /* CONFIG_MODULES */
#endif /* !ARCH_HAS_SORT_EXTABLE */
#ifndef ARCH_HAS_SEARCH_EXTABLE
/*
 * Binary-search a single exception table for the entry whose insn
 * field equals the given instruction address.
 *
 * first and last are both inclusive bounds, and the table is assumed
 * to be sorted by insn already.  Returns a pointer to the matching
 * entry, or NULL if there is none.
 */
const struct exception_table_entry *
search_extable(const struct exception_table_entry *first,
	       const struct exception_table_entry *last,
	       unsigned long value)
{
	const struct exception_table_entry *lo = first;
	const struct exception_table_entry *hi = last;

	while (lo <= hi) {
		const struct exception_table_entry *probe = lo + ((hi - lo) >> 1);

		/*
		 * Compare the two addresses directly instead of looking at
		 * their difference: the distance between value and insn
		 * can be larger than MAX_LONG.
		 */
		if (probe->insn == value)
			return probe;

		if (probe->insn < value)
			lo = probe + 1;
		else
			hi = probe - 1;
	}

	return NULL;
}
我们来看一下exception_table_entry数据结构:
/*
 * One record of the exception table.
 *
 * The table is a list of address pairs.  No registers are modified
 * when control is redirected, so it is entirely up to the continuation
 * code to figure out what to do.
 *
 * All the routines that rely on this table keep their fixup code out
 * of line with the main instruction path.  When everything is well the
 * fixup code does not even have to be jumped over, and it does not
 * intrude on cache or TLB entries.
 */
struct exception_table_entry {
	unsigned long insn;	/* address of an instruction that is allowed to fault */
	unsigned long fixup;	/* address at which the program should continue */
};
结构中insn表示可能发生异常的指令所在的地址;fixup则为用来替换的“修复”地址。本着谁用谁负责的原则,我们这里的__copy_user()要从用户空间拷贝数据,可能发生问题,那么它就负责在异常表中为其可能发生问题的指令建立这样的数据结构。
回到__copy_user中:
"4: rep; movsb\n" \
" movl %3,%0\n" \
" shrl $2,%0\n" \
" andl $3,%3\n" \
" .align 2,0x90\n" \
"0: rep; movsl\n" \
" movl %3,%0\n" \
"1: rep; movsb\n" \
"2:\n" \
".section .fixup,\"ax\"\n" \
"5: addl %3,%0\n" \
" jmp 2b\n" \
"3: lea 0(%3,%0,4),%0\n" \
" jmp 2b\n" \
".previous\n" \
".section __ex_table,\"a\"\n" \
" .align 4\n" \
" .long 4b,5b\n" \ #4b处指令的修复地址在5处
" .long 0b,3b\n" \ #0b处指令的修复地址在3处
" .long 1b,2b\n" \ #1b处指令的修复地址在2处
".previous" \
GNU的gcc和ld除了支持text段和data段外,还支持fixup段和__ex_table段。前者专门用于异常发生后的修复,实际上跟text段差不多;后者专门用于异常地址表。
我们再来看RESTORE_REGS中的fixup段:
/*
 * RESTORE_REGS: invoke RESTORE_INT_REGS, then pop %ds, %es and %fs and
 * hand the remaining work to POP_GS / POP_GS_EX.
 *
 * Each of the three segment-register pops (labels 1-3) has an
 * __ex_table entry pointing at fixup code (labels 4-6).  The fixup
 * stores 0 in the stack slot about to be popped and jumps back to
 * retry the same pop.
 */
.macro RESTORE_REGS pop=0
RESTORE_INT_REGS
1: popl_cfi %ds
/*CFI_RESTORE ds;*/
2: popl_cfi %es
/*CFI_RESTORE es;*/
3: popl_cfi %fs
/*CFI_RESTORE fs;*/
POP_GS \pop
.pushsection .fixup, "ax"
4: movl $0, (%esp)
jmp 1b
5: movl $0, (%esp)
jmp 2b
6: movl $0, (%esp)
jmp 3b
.section __ex_table, "a"
.align 4
.long 1b, 4b # fixup code for the instruction at 1b is at label 4
.long 2b, 5b # fixup code for the instruction at 2b is at label 5
.long 3b, 6b # fixup code for the instruction at 3b is at label 6
.popsection
POP_GS_EX
.endm
为什么恢复ds的时候会出错呢?如果因为不管什么原因使得段选择码或描述项无效或者不符时,CPU就会产生一次“一般保护”(General Protection,GP,书中译作“全面保护”)异常。当这样的异常发生在系统空间时,就要为之准备好修复手段。这里将%ds在堆栈中的副本清成0,然后重新执行pop %ds。其实,这并不是真正的修复,而只是避免进一步的GP异常。将0作为段选择码(空选择码)装入除CS和SS外的段寄存器中是不会引起GP异常的,而要到以后企图通过这个空选择码访问内存的时候才会引起异常,但那是回到用户空间以后的事情了。而如果这个异常发生在用户空间,那么直接将这个进程“杀”掉算了。如果真的是恢复CS或者SS的时候发生GP错误了,那么直接将该进程杀掉,重新调度其它的进程执行就OK了。
《Linux内核情景分析》