Linux kernel 之 socket 创建过程分析

  • 重要结构体

  • struct socket 结构体

    // General BSD-style socket structure.
    // state: socket state (socket_state), e.g. connected or not connected
    // type: socket type (%SOCK_STREAM, etc)
    // flags: socket flags (%SOCK_NOSPACE, etc)
    // ops: protocol-specific socket operations
    // file: pointer to the struct file associated with this socket
    // sk: protocol-side struct sock; keeping it in a separate object
    //     decouples this structure from any particular protocol
    // wq: wait queue
    struct socket {
       socket_state        state;

       kmemcheck_bitfield_begin(type);
       short           type;
       kmemcheck_bitfield_end(type);

       unsigned long       flags;

      struct socket_wq __rcu  *wq;

      struct file     *file;
      struct sock     *sk;
      const struct proto_ops  *ops;
    };
  • struct socket 的创建

    // 用户态的 socket() 是 glibc 提供的封装函数;在使用 socketcall 多路复用入口的体系结构(例如 32 位 x86)上,它最终执行的是 sys_socketcall() 系统调用。
    // 在这些体系结构上,sys_socketcall() 几乎是所有 socket 函数的入口,
    // 也就是说 bind、connect 等函数都以 sys_socketcall() 作为入口,函数如下:

    // include/linux/syscalls.h
    asmlinkage long sys_socketcall(int call, unsigned long __user *args); 
    // net/socket.c
    /*
     * socketcall: multiplexed entry point for the socket system calls.
     *
     * call selects the operation (SYS_SOCKET, SYS_BIND, ...); args points to
     * a user-space array of unsigned long holding that operation's arguments.
     *
     * Returns the result of the dispatched handler, -EINVAL for an unknown
     * call number or an argument block larger than the local buffer, -EFAULT
     * if the arguments cannot be copied from user space, or the error
     * returned by audit_socketcall().
     */
    SYSCALL_DEFINE2(socketcall, int, call, unsigned long __user *, args)
    {
        unsigned long a[AUDITSC_ARGS];
        unsigned long a0, a1;
        int err;
        unsigned int len;

        if (call < 1 || call > SYS_SENDMMSG)
            return -EINVAL;

        /* nargs[call] is used as the byte length of this call's argument block. */
        len = nargs[call];
        if (len > sizeof(a))
            return -EINVAL;

        /* copy_from_user should be SMP safe. */
        if (copy_from_user(a, args, len))
            return -EFAULT;

        /* The audit hook receives the argument count, not the byte length. */
        err = audit_socketcall(nargs[call] / sizeof(unsigned long), a);
        if (err)
            return err;

        a0 = a[0];
        a1 = a[1];
        // Dispatch to the handler that matches the call number.
        switch (call) {
        case SYS_SOCKET:  // this is where sys_socket() gets called
            err = sys_socket(a0, a1, a[2]);
            break;
        case SYS_BIND:
            err = sys_bind(a0, (struct sockaddr __user *)a1, a[2]);
            break;
        case SYS_CONNECT:
            err = sys_connect(a0, (struct sockaddr __user *)a1, a[2]);
            break;
        case SYS_LISTEN:
            err = sys_listen(a0, a1);
            break;
        // ... ... (remaining cases are elided in this excerpt)
        default:
            err = -EINVAL;
            break;
        }
        return err;
    }
    // include/linux/syscalls.h
    asmlinkage long sys_socket(int, int, int);
    // net/socket.c 
    /*
     * socket: create a socket and map it to a file descriptor.
     *
     * family, type and protocol are handed to sock_create(). Besides the
     * socket type proper (the bits inside SOCK_TYPE_MASK), type may carry
     * the SOCK_CLOEXEC and SOCK_NONBLOCK flag bits.
     *
     * Returns the value produced by sock_map_fd() on success, -EINVAL if
     * type contains any other flag bit, or the negative error returned by
     * sock_create() / sock_map_fd().
     */
    SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)
    {
        int retval;
        struct socket *sock;
        int flags;

        /* Check the SOCK_* constants for consistency.  */
        BUILD_BUG_ON(SOCK_CLOEXEC != O_CLOEXEC);
        BUILD_BUG_ON((SOCK_MAX | SOCK_TYPE_MASK) != SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_CLOEXEC & SOCK_TYPE_MASK);
        BUILD_BUG_ON(SOCK_NONBLOCK & SOCK_TYPE_MASK);

        /* Split type into its flag bits and the socket type proper. */
        flags = type & ~SOCK_TYPE_MASK;
        if (flags & ~(SOCK_CLOEXEC | SOCK_NONBLOCK))
            return -EINVAL;
        type &= SOCK_TYPE_MASK;

        /* Where the two constants differ, translate SOCK_NONBLOCK to O_NONBLOCK. */
        if (SOCK_NONBLOCK != O_NONBLOCK && (flags & SOCK_NONBLOCK))
            flags = (flags & ~SOCK_NONBLOCK) | O_NONBLOCK;
        // The struct socket is created here.
        retval = sock_create(family, type, protocol, &sock);
        if (retval < 0)
            goto out;
        // Associate the socket with the file system (file descriptor).
        retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));
        if (retval < 0)
            goto out_release;

    out:
        /* It may be already another descriptor 8) Not kernel problem. */
        return retval;

    out_release:
        /* Mapping failed: release the socket created above. */
        sock_release(sock);
        return retval;
    }
  • sock_create() 函数

    // include/linux/net.h
    int sock_create(int family, int type, int proto, struct socket **res);

    // net/socket.c
    /*
     * sock_create: thin wrapper around __sock_create().
     * Passes the current task's network namespace
     * (current->nsproxy->net_ns) and kern = 0, and returns whatever
     * __sock_create() returns; the new socket is stored through res.
     */
    int sock_create(int family, int type, int protocol, struct socket **res)
    {
        return __sock_create(current->nsproxy->net_ns, family, type, protocol, res, 0);
    }
    EXPORT_SYMBOL(sock_create);

    // include/linux/net.h 
    int __sock_create(struct net *net, int family, int type, int proto,             
          struct socket **res, int kern);   
    // net/socket.c
    /*
     * __sock_create: validate the request and allocate a struct socket.
     *
     * Only the first part of the function is shown in this excerpt
     * (argument checks, security hook, allocation); the rest is elided.
     *
     * Returns -EAFNOSUPPORT for an out-of-range family, -EINVAL for an
     * out-of-range type, the error from security_socket_create(), or
     * -ENFILE when sock_alloc() fails.
     */
    int __sock_create(struct net *net, int family, int type, int protocol,
                 struct socket **res, int kern)
    {
        int err;
        struct socket *sock;
        const struct net_proto_family *pf;

        /*
         *      Check protocol is in range
         */
        // Check that the protocol family is within range.
        if (family < 0 || family >= NPROTO)
            return -EAFNOSUPPORT;
        if (type < 0 || type >= SOCK_MAX)   // check the socket type
            return -EINVAL;

        /* Compatibility.

           This uglymoron is moved from INET layer to here to avoid
           deadlock in module load.
         */    // Compatibility shim: the obsolete (PF_INET, SOCK_PACKET) pair is rewritten to PF_PACKET.
        if (family == PF_INET && type == SOCK_PACKET) {
            static int warned;
            if (!warned) {
                warned = 1;
                pr_info("%s uses obsolete (PF_INET,SOCK_PACKET)\n",
                    current->comm);
            }
            family = PF_PACKET;
        }
        // Security hook check.
        err = security_socket_create(family, type, protocol, kern);
        if (err)
            return err;

        /*
         *  Allocate the socket and allow the family to set things up. if
         *  the protocol is 0, the family is instructed to select an appropriate
         *  default.
         */  // sock_alloc() is analysed in the next section of this article
        sock = sock_alloc();
        if (!sock) {
            net_warn_ratelimited("socket: no more sockets\n");
            return -ENFILE; /* Not exactly a match, but its the
                       closest posix thing */
        }

        sock->type = type;

        // ... ... (elided in this excerpt)
        return 0;
       // ... ... (elided in this excerpt)
    }
    EXPORT_SYMBOL(__sock_create);
  • sock_alloc() 函数解析,被上面的 __sock_create() 函数调用

    // net/socket.c
    /*
     * sock_alloc: allocate a socket together with its inode.
     *
     * Gets a new inode from sock_mnt->mnt_sb, obtains the struct socket
     * paired with that inode via SOCKET_I(), initialises the inode's
     * number, mode, owner and inode operations, and bumps the per-CPU
     * sockets_in_use counter.
     *
     * Returns the socket, or NULL if the inode could not be allocated.
     */
    static struct socket *sock_alloc(void)
    {
        struct inode *inode;
        struct socket *sock;

        inode = new_inode_pseudo(sock_mnt->mnt_sb);
        if (!inode)
            return NULL;

        sock = SOCKET_I(inode);

        kmemcheck_annotate_bitfield(sock, type);
        inode->i_ino = get_next_ino();
        inode->i_mode = S_IFSOCK | S_IRWXUGO; // file mode: socket, rwx for user/group/other
        inode->i_uid = current_fsuid();  // uid of the current task
        inode->i_gid = current_fsgid();  // gid of the current task
        inode->i_op = &sockfs_inode_ops; // inode operations
    
        this_cpu_add(sockets_in_use, 1);
        return sock;
    }
    // sock_alloc() 申请一个 socket 结构体,函数内的局部变量名为 sock。
    // 它先申请一个新的 inode 节点,再取得与该 inode 绑定在一起的 socket 结构体,并初始化这个 inode。
    // 如果申请 inode 失败则返回 NULL,否则返回 sock。
    // 接下来我们再看 SOCKET_I(inode):
    // include/net/sock.h
    /*
     * SOCKET_I: map an inode to the struct socket paired with it.
     * container_of() recovers the struct socket_alloc whose vfs_inode
     * member is *inode; the address of that object's socket member is
     * returned.
     */
    static inline struct socket *SOCKET_I(struct inode *inode)
    {
        return &container_of(inode, struct socket_alloc, vfs_inode)->socket;
    }
    // 可以看到,返回的是与该 inode 位于同一个 struct socket_alloc 中的 socket 结构体(inode 是其中的 vfs_inode 成员,socket 并不在 inode 内部)。
    
    // 我们可以分析一下 container_of() 是怎么定义的。
    // include/linux/kernel.h
    /*
     * container_of - get a pointer to the structure that contains a member
     * ptr:    pointer to the member
     * type:   type of the containing structure
     * member: name of the member within type
     *
     * __mptr is a copy of ptr typed as a pointer to the member; subtracting
     * offsetof(type, member) bytes from it gives the address of the
     * enclosing structure, which is the value of the whole expression.
     */
    #define container_of(ptr, type, member) ({          \
    const typeof( ((type *)0)->member ) *__mptr = (ptr);    \
    (type *)( (char *)__mptr - offsetof(type,member) );})
    //  先用 typeof 得到 member 的类型,并把 ptr 临时保存到该类型的指针 __mptr 中;
    //  然后用这个 __mptr 指针减去 member 在 type 中的偏移量,
    //  得到的就是 type 这个结构体的首地址。
    //  offsetof   include/linux/stddef.h
    /*
     * offsetof(TYPE, MEMBER): offset of MEMBER within TYPE, as a size_t.
     * Uses __compiler_offsetof when it is defined; otherwise takes the
     * address of MEMBER in a TYPE object imagined at address 0.
     */
    #undef offsetof
    #ifdef __compiler_offsetof
    #define offsetof(TYPE, MEMBER)  __compiler_offsetof(TYPE, MEMBER)
    #else
    #define offsetof(TYPE, MEMBER)  ((size_t)&((TYPE *)0)->MEMBER)
    #endif
                                                                              
    // 这里稍微有点难理解:container_of() 最后得到的结果是 type 这个结构体的首地址。
    
    // 所以 SOCKET_I() 先得到 struct socket_alloc 的首地址,再返回其中 socket 成员的地址;由于 socket 是第一个成员,这两个地址相同。
    // include/net/sock.h
    /*
     * Pairs a struct socket with its VFS inode in a single object, so
     * the socket can be found from the inode (SOCKET_I() does this with
     * container_of() on the vfs_inode member).
     */
    struct socket_alloc {
        struct socket socket;       /* the socket itself (first member) */
        struct inode vfs_inode;     /* the inode embedded alongside it */
    };
  • 回到 __sock_create() 继续分析
    // net/socket.c   --> __sock_create()
    #ifdef CONFIG_MODULES
        /* Attempt to load a protocol module if the find failed.
         *
         * 12/09/1996 Marcin: But! this makes REALLY only sense, if the user
         * requested real, full-featured networking support upon configuration.
         * Otherwise module support will break!
         */
        /* No entry in net_families[] for this family yet: request the
         * module named "net-pf-<family>".
         */
        if (rcu_access_pointer(net_families[family]) == NULL)
            request_module("net-pf-%d", family);
    #endif
    如果在内核配置(make menuconfig)中启用了可加载模块支持(CONFIG_MODULES),上面这段代码才会被编译并执行。
    这段代码先检查对应协议族的操作表是否已经注册,如果没有注册则调用 request_module() 尝试加载对应的模块。这里都是在 TCP/IP 协议下进行分析,所以 family 是 AF_INET,也就是 2,实际检查的全局变量是 net_families[2]。这个表项是在系统初始化时由 net/ipv4/af_inet.c 中的 inet_init() 通过 sock_register(&inet_family_ops) 注册的,具体代码是:
    // net/ipv4/af_inet.c
    static int __init inet_init(void)                                               
    {                                                                               
        struct inet_protosw *q;                                                     
        struct list_head *r;                                                        
        int rc = -EINVAL;                                                           
                                                                                    
        sock_skb_cb_check_size(sizeof(struct inet_skb_parm));                       
        // 各个协议的注册 
        rc = proto_register(&tcp_prot, 1);                                          
        if (rc)                                                                     
            goto out;                                                               
                                                                                    
        rc = proto_register(&udp_prot, 1);                                          
        if (rc)                                                                     
            goto out_unregister_tcp_proto;                                          
                                                                                    
        rc = proto_register(&raw_prot, 1);                                          
        if (rc)                                                                     
            goto out_unregister_udp_proto;                                          
                                                                                    
        rc = proto_register(&ping_prot, 1);                                         
        if (rc)                                                                     
            goto out_unregister_raw_proto;                                          
                                                                                    
        /*                                                                          
         *  Tell SOCKET that we are alive...                                        
         */                                                                         
    
            (void)sock_register(&inet_family_ops);                                      
                                                                                    
    #ifdef CONFIG_SYSCTL                                                            
        ip_static_sysctl_init();                                                    
    #endif                                                                          
                                                                                    
        /*                                                                          
         *  Add all the base protocols.                                             
         */                                                                         
        // 各个协议的添加,添加不成功则报错 
        if (inet_add_protocol(&icmp_protocol, IPPROTO_ICMP) < 0)                    
            pr_crit("%s: Cannot add ICMP protocol\n", __func__);                    
        if (inet_add_protocol(&udp_protocol, IPPROTO_UDP) < 0)                      
            pr_crit("%s: Cannot add UDP protocol\n", __func__);                     
        if (inet_add_protocol(&tcp_protocol, IPPROTO_TCP) < 0)                      
            pr_crit("%s: Cannot add TCP protocol\n", __func__);                     
    #ifdef CONFIG_IP_MULTICAST                                                      
        if (inet_add_protocol(&igmp_protocol, IPPROTO_IGMP) < 0)                    
            pr_crit("%s: Cannot add IGMP protocol\n", __func__);                    
    #endif                                                                          
                                                                                    
        /* Register the socket-side information for inet_create. */                 
        for (r = &inetsw[0]; r < &inetsw[SOCK_MAX]; ++r)                            
            INIT_LIST_HEAD(r);                                                      
        // 把这个关键性的链接表一个个注册上去
        // ******************************************************
        // inetsw_array 结构体数组(它定义在 inet_init() 之外,这里插入仅为便于说明),其中每一项对应一种协议,比如说 tcp_prot
        /*
         * inetsw_array: table of struct inet_protosw entries. Each entry
         * ties a socket type (.type) and IP protocol number (.protocol) to
         * its struct proto (.prot), its socket-level operations (.ops) and
         * a set of flags. The entries shown are TCP, UDP and ping (ICMP
         * over SOCK_DGRAM); further entries are elided in this excerpt.
         */
        static struct inet_protosw inetsw_array[] =
        {
            {
                .type =       SOCK_STREAM,
                .protocol =   IPPROTO_TCP,
                .prot =       &tcp_prot,
                .ops =        &inet_stream_ops,
                .flags =      INET_PROTOSW_PERMANENT |
                        INET_PROTOSW_ICSK,
            },

            {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_UDP,
                .prot =       &udp_prot,
                .ops =        &inet_dgram_ops,
                .flags =      INET_PROTOSW_PERMANENT,
            },

            {
                .type =       SOCK_DGRAM,
                .protocol =   IPPROTO_ICMP,
                .prot =       &ping_prot,
                .ops =        &inet_dgram_ops,
                .flags =      INET_PROTOSW_REUSE,
            },
            // ... ... (remaining entries elided)
        }
        
        // tcp_prot  ---> net/ipv4/tcp_ipv4.c
        /*
         * tcp_prot: the struct proto instance for TCP (IPv4).
         * Supplies the protocol's callbacks (close, connect, accept,
         * sendmsg, recvmsg, ...), pointers to its memory-accounting
         * counters and sysctl arrays, and the per-socket object size
         * (sizeof(struct tcp_sock)).
         */
        struct proto tcp_prot = {
            .name           = "TCP",
            .owner          = THIS_MODULE,
            .close          = tcp_close,
            .connect        = tcp_v4_connect,
            .disconnect     = tcp_disconnect,
            .accept         = inet_csk_accept,
            .ioctl          = tcp_ioctl,
            .init           = tcp_v4_init_sock,    // the init function; it is called later on
            .destroy        = tcp_v4_destroy_sock,
            .shutdown       = tcp_shutdown,
            .setsockopt     = tcp_setsockopt,
            .getsockopt     = tcp_getsockopt,
            .recvmsg        = tcp_recvmsg,
            .sendmsg        = tcp_sendmsg,
            .sendpage       = tcp_sendpage,
            .backlog_rcv        = tcp_v4_do_rcv,
            .release_cb     = tcp_release_cb,
            .hash           = inet_hash,
            .unhash         = inet_unhash,
            .get_port       = inet_csk_get_port,
            .enter_memory_pressure  = tcp_enter_memory_pressure,
            .stream_memory_free = tcp_stream_memory_free,
            .sockets_allocated  = &tcp_sockets_allocated,
            .orphan_count       = &tcp_orphan_count,
            .memory_allocated   = &tcp_memory_allocated,
            .memory_pressure    = &tcp_memory_pressure,
            .sysctl_mem     = sysctl_tcp_mem,
            .sysctl_wmem        = sysctl_tcp_wmem,
            .sysctl_rmem        = sysctl_tcp_rmem,
            .max_header     = MAX_TCP_HEADER,
            .obj_size       = sizeof(struct tcp_sock),
            .slab_flags     = SLAB_DESTROY_BY_RCU,
            .twsk_prot      = &tcp_timewait_sock_ops,
            .rsk_prot       = &tcp_request_sock_ops,
            .h.hashinfo     = &tcp_hashinfo,
            .no_autobind        = true,
        /* 32-bit compat variants of the socket-option handlers */
        #ifdef CONFIG_COMPAT
            .compat_setsockopt  = compat_tcp_setsockopt,
            .compat_getsockopt  = compat_tcp_getsockopt,
        #endif
        /* cgroup hooks, present only with CONFIG_MEMCG_KMEM */
        #ifdef CONFIG_MEMCG_KMEM
            .init_cgroup        = tcp_init_cgroup,
            .destroy_cgroup     = tcp_destroy_cgroup,
            .proto_cgroup       = tcp_proto_cgroup,
        #endif
        };
        EXPORT_SYMBOL(tcp_prot);
        // ***********************************************************
        for (q = inetsw_array; q < &inetsw_array[INETSW_ARRAY_LEN]; ++q)            
            inet_register_protosw(q);                                               
        
        // 各个协议模块的初始化 
        /*                                                                          
         *  Set the ARP module up                                                   
         */                                                                         
                                                                                    
        arp_init();                                                                 
                                                                                    
        /*                                                                          
         *  Set the IP module up                                                    
         */                                                                         
                                                                                    
        ip_init();                                                                  
                                                                                    
        tcp_v4_init();                                                              
                                                                                    
        /* Setup TCP slab cache for open requests. */                               
        tcp_init();                                                                 
                                                                                
        /* Setup UDP memory threshold */                                            
        udp_init();                                                                 
                                                                                    
        /* Add UDP-Lite (RFC 3828) */                                               
        udplite4_register();                                                        
                                                                          
        ping_init();                                                                
                                                                                    
        /*                                                                          
         *  Set the ICMP layer up                                                   
         */                                                                         
                                                                                
        if (icmp_init() < 0)                                                        
            panic("Failed to create the ICMP control socket.\n");                   
                                                                                    
        /*                                                                          
         *  Initialise the multicast router                                         
         */                                                                         
    #if defined(CONFIG_IP_MROUTE)                                                   
        if (ip_mr_init())                                                           
            pr_crit("%s: Cannot init ipv4 mroute\n", __func__);                     
    #endif                                                                          
                                                                                    
        if (init_inet_pernet_ops())                                                 
            pr_crit("%s: Cannot init ipv4 inet pernet ops\n", __func__);            
        /*                                                                          
         *  Initialise per-cpu ipv4 mibs                                            
         */                                                                         
                                                                                
        if (init_ipv4_mibs())                                                       
            pr_crit("%s: Cannot init ipv4 mibs\n", __func__);                       
                                                                                    
        ipv4_proc_init();                                                           
                                                                                    
        ipfrag_init();                                                              
                                                                                    
        dev_add_pack(&ip_packet_type);                                              
                                                                                    
        ip_tunnel_core_init();                                                      
                                                                                
        rc = 0;                                                                     
    out:                                                                            
        return rc;                                                                  
    out_unregister_raw_proto:                                                       
        proto_unregister(&raw_prot);                                                
    out_unregister_udp_proto:                                                       
        proto_unregister(&udp_prot);                                                
    out_unregister_tcp_proto:                                                       
        proto_unregister(&tcp_prot);                                                
        goto out;                                                                   
    }                                                                               

    fs_initcall(inet_init);
  • 粗略地看完协议注册那一部分之后,我们回到 __sock_create()
    // net/socket.c
    // 这里调用的是协议族注册的 create 回调函数 (对于 PF_INET 即下文的 inet_create)
        err = pf->create(net, sock, protocol, kern);                                
        if (err < 0)                                                                
           goto out_module_put; 
    // First, take a look at struct inet_protosw.
    // include/net/protocol.h
    /* This is used to register socket interfaces for IP protocols.  */
    /*
     * One entry describes a (socket type, L4 protocol) pair. inet_create()
     * walks the entries chained on inetsw[sock->type] and picks the one whose
     * protocol matches the request, then copies ops/prot/flags out of it.
     */
    struct inet_protosw {
        struct list_head list;      /* linkage used when iterating inetsw[type] */

            /* These two fields form the lookup key.  */
        unsigned short   type;     /* This is the 2nd argument to socket(2). */
        unsigned short   protocol; /* This is the L4 protocol number.  */

        struct proto     *prot;     /* handed to sk_alloc() by inet_create() */
        const struct proto_ops *ops; /* becomes sock->ops in inet_create() */

        unsigned char    flags;      /* See INET_PROTOSW_* below.  */
    };
        
    // The create callback invoked above corresponds to inet_create() in
    // net/ipv4/af_inet.c.
    // net/ipv4/af_inet.c
    /*
     * inet_create - build the protocol-level state (struct sock) for a new
     * PF_INET socket.
     *
     * @net:      network namespace; passed to sk_alloc() and consulted for the
     *            CAP_NET_RAW check and the ip_no_pmtu_disc sysctl.
     * @sock:     the BSD-level socket being set up; sock->type selects the
     *            inetsw[] list to search, and sock->state / sock->ops are
     *            written here.
     * @protocol: requested L4 protocol number; IPPROTO_IP acts as a wildcard
     *            and is replaced by the protocol of the entry that is chosen.
     * @kern:     when nonzero the CAP_NET_RAW check for SOCK_RAW is skipped;
     *            also forwarded to sk_alloc().
     *
     * Returns 0 on success or a negative errno:
     *   -EINVAL          protocol outside [0, IPPROTO_MAX)
     *   -ESOCKTNOSUPPORT the inetsw[sock->type] list has no entries
     *   -EPROTONOSUPPORT entries exist but none matches the protocol
     *   -EPERM           SOCK_RAW requested without CAP_NET_RAW (and !kern)
     *   -ENOBUFS         sk_alloc() failed
     *   or whatever sk->sk_prot->init() returned on failure.
     */
    static int inet_create(struct net *net, struct socket *sock, int protocol,
               int kern)
    {
        struct sock *sk;
        struct inet_protosw *answer;
        struct inet_sock *inet;
        struct proto *answer_prot;
        unsigned char answer_flags;
        int try_loading_module = 0;
        int err;
        // Reject protocol numbers outside the valid range.
        if (protocol < 0 || protocol >= IPPROTO_MAX)
            return -EINVAL;
        // A freshly created socket starts out unconnected.
        sock->state = SS_UNCONNECTED;

        /* Look for the requested type/protocol pair. */
    // Search for the entry matching the requested type/protocol. We jump back
    // here after each module-load attempt below.
    lookup_protocol:
        // If the list is empty the loop body never runs and this value is kept.
        err = -ESOCKTNOSUPPORT;
        rcu_read_lock();
        // Walk the entries registered for this socket type (inetsw[sock->type]).
        list_for_each_entry_rcu(answer, &inetsw[sock->type], list) {

            err = 0;
            // Every break below leaves the loop with err == 0 and 'answer'
            // pointing at the chosen entry.
            /* Check the non-wild match. */
            // Exact match on a concrete (non-IPPROTO_IP) protocol: take it.
            // If both sides are IPPROTO_IP this entry is not accepted and the
            // search continues.
            if (protocol == answer->protocol) {
                if (protocol != IPPROTO_IP)
                    break;
            } else {
                /* Check for the two wild cases. */
                // Caller passed the wildcard: adopt this entry's protocol.
                if (IPPROTO_IP == protocol) {
                    protocol = answer->protocol;
                    break;
                }
                // The entry itself is a wildcard: it accepts any protocol.
                if (IPPROTO_IP == answer->protocol)
                    break;
            }
            // This entry did not match; remember that in case it was the last.
            err = -EPROTONOSUPPORT;
        }
        // Nothing matched: try to load a module that provides the protocol and
        // retry. At most two attempts are made (specific alias first, then the
        // generic one); after that the error is returned.
        if (unlikely(err)) {
            if (try_loading_module < 2) {
                // Drop the RCU read lock before request_module();
                // lookup_protocol re-takes it.
                rcu_read_unlock();
                /*
                 * Be more specific, e.g. net-pf-2-proto-132-type-1
                 * (net-pf-PF_INET-proto-IPPROTO_SCTP-type-SOCK_STREAM)
                 */
                if (++try_loading_module == 1)
                    request_module("net-pf-%d-proto-%d-type-%d",
                               PF_INET, protocol, sock->type);
                /*
                 * Fall back to generic, e.g. net-pf-2-proto-132
                 * (net-pf-PF_INET-proto-IPPROTO_SCTP)
                 */
                else
                    request_module("net-pf-%d-proto-%d",
                               PF_INET, protocol);
                goto lookup_protocol;
            } else
                goto out_rcu_unlock;
        }

        err = -EPERM;
        // Permission check: a raw socket that is not flagged 'kern' requires
        // CAP_NET_RAW in the user namespace of this net namespace.
        if (sock->type == SOCK_RAW && !kern &&
            !ns_capable(net->user_ns, CAP_NET_RAW))
                goto out_rcu_unlock;

        // Bind the socket to the operation table of the matched entry and copy
        // out the fields still needed, so 'answer' is not used after the
        // RCU read lock is released.
        sock->ops = answer->ops;
        answer_prot = answer->prot;
        answer_flags = answer->flags;
        rcu_read_unlock();

        WARN_ON(!answer_prot->slab);

        err = -ENOBUFS;
        /*
         * Allocate the struct sock via sk_alloc(). struct sock is a large
         * structure that is the network layer's representation of the socket.
         *
         * Article's description of sk_alloc() internals (not visible in this
         * excerpt - verify against net/core/sock.c): it records the protocol
         * and creator on the new sock, allocates through sk_prot_alloc() from
         * the protocol's slab cache when one is provided or from the generic
         * allocator otherwise, initialises the sock lock via sock_lock_init(),
         * records the network namespace with sock_net_set(), and bumps a
         * per-net counter.
         *
         * Back here we only check whether the allocation succeeded.
         */
        sk = sk_alloc(net, PF_INET, GFP_KERNEL, answer_prot, kern);
        if (!sk)
            goto out;

        err = 0;
        if (INET_PROTOSW_REUSE & answer_flags)
            sk->sk_reuse = SK_CAN_REUSE;

        // View the new sock as a struct inet_sock.
        inet = inet_sk(sk);
        // Record whether the matched entry carries INET_PROTOSW_ICSK
        // (presumably: the protocol uses inet_connection_sock, i.e. is
        // connection-oriented - confirm).
        inet->is_icsk = (INET_PROTOSW_ICSK & answer_flags) != 0;

        inet->nodefrag = 0;

        // Raw sockets: store the protocol number in inet_num, and for
        // IPPROTO_RAW additionally set hdrincl (presumably meaning the caller
        // supplies the IP header itself - confirm against raw(7)).
        if (SOCK_RAW == sock->type) {
            inet->inet_num = protocol;
            if (IPPROTO_RAW == protocol)
                inet->hdrincl = 1;
        }
        // Choose the path-MTU-discovery policy from the per-namespace sysctl:
        // DONT when ip_no_pmtu_disc is set, WANT otherwise.
        if (net->ipv4.sysctl_ip_no_pmtu_disc)
            inet->pmtudisc = IP_PMTUDISC_DONT;
        else
            inet->pmtudisc = IP_PMTUDISC_WANT;

        inet->inet_id = 0;

        // Continue initialising sk (struct sock).
        // Article's description of sock_init_data() (not visible in this
        // excerpt - verify): it sets up the receive, send and error queues
        // (doubly linked sk_buff_head lists chaining sk_buff structures), the
        // socket timer, and assorted fields, mostly callback pointers.
        sock_init_data(sock, sk);
        sk->sk_destruct    = inet_sock_destruct;
        sk->sk_protocol    = protocol;
        sk->sk_backlog_rcv = sk->sk_prot->backlog_rcv;

        inet->uc_ttl    = -1;
        inet->mc_loop   = 1;
        inet->mc_ttl    = 1;
        inet->mc_all    = 1;
        inet->mc_index  = 0;
        inet->mc_list   = NULL;
        inet->rcv_tos   = 0;

        sk_refcnt_debug_inc(sk);

        // inet_num is only nonzero here if the raw-socket branch above set it.
        if (inet->inet_num) {
            /* It assumes that any protocol which allows
             * the user to assign a number at socket
             * creation time automatically
             * shares.
             */
            inet->inet_sport = htons(inet->inet_num);
            /* Add to protocol hash chains. */
            sk->sk_prot->hash(sk);
        }

        // Run the protocol's own init hook if it has one (for TCP over IPv4
        // the article identifies this as tcp_v4_init_sock). On failure the
        // sock is released and the hook's error code is returned.
        if (sk->sk_prot->init) {
            err = sk->sk_prot->init(sk);
            if (err)
                sk_common_release(sk);
        }
    out:
        return err;
    // Error exit for paths that still hold the RCU read lock.
    out_rcu_unlock:
        rcu_read_unlock();
        goto out;
    }
  • tcp_v4_init_sock 函数

    /*
     * tcp_v4_init_sock - per-socket init hook for TCP over IPv4.
     *
     * Runs the common TCP initialisation and then installs the IPv4-specific
     * operation tables on the socket. Always returns 0.
     */
    static int tcp_v4_init_sock(struct sock *sk)
    {
        /* Same object, viewed as a connection-oriented socket. */
        struct inet_connection_sock *conn = inet_csk(sk);

        /* Address-family independent TCP setup comes first. */
        tcp_init_sock(sk);

        /* Then hook up the IPv4 address-family operations. */
        conn->icsk_af_ops = &ipv4_specific;
    #ifdef CONFIG_TCP_MD5SIG
        /* Only built when TCP MD5 signature support is configured. */
        tcp_sk(sk)->af_specific = &tcp_sock_ipv4_specific;
    #endif

        return 0;
    }
  • 到此, sock_create 分析完毕

  • 最后回到 SYSCALL_DEFINE3(socket, int, family, int, type, int, protocol)

    // net/socket.c  
    // 刚才分析完毕  
    retval = sock_create(family, type, protocol, &sock);                        
        if (retval < 0)                                                             
            goto out;
    // socket 映射到文件系统
    retval = sock_map_fd(sock, flags & (O_CLOEXEC | O_NONBLOCK));               
        if (retval < 0)                                                             
            goto out_release;                                          
    // net/socket.c
    /*
     * sock_map_fd - attach a socket to a new file descriptor.
     *
     * Reserves an unused descriptor, obtains a struct file for @sock and
     * installs it in that descriptor slot.
     *
     * Returns the new descriptor on success. On failure returns the negative
     * value from get_unused_fd_flags(), or PTR_ERR() of the failed
     * sock_alloc_file() result after giving the reserved descriptor back.
     */
    static int sock_map_fd(struct socket *sock, int flags)
    {
        struct file *sock_file;
        int newfd;

        newfd = get_unused_fd_flags(flags);
        if (unlikely(newfd < 0))
            return newfd;

        /* Obtain the file object that will represent this socket. */
        sock_file = sock_alloc_file(sock, flags, NULL);
        if (unlikely(IS_ERR(sock_file))) {
            /* No file to install: release the reserved descriptor. */
            put_unused_fd(newfd);
            return PTR_ERR(sock_file);
        }

        fd_install(newfd, sock_file);
        return newfd;
    }
    // The point here: the socket is exposed as a file node, so it can be
    // driven with the usual file operations (open, read, write, ioctl, ...).
  • 参考: http://www.cnblogs.com/hyd-desert-camel/p/3536341.html

转载于:https://www.cnblogs.com/chenfulin5/p/6927040.html

你可能感兴趣的:(Linux kernel 之 socket 创建过程分析)