作者: hacktao
写于2010.1.17
Linux 网络协议栈的实现与 ISO/OSI 七层网络模型之间存在对应关系。下面我们自上而下地简单分析各部分主要实现的功能及其实现方法。
BSD套接字是我们实现网络通讯的主要方式,它为应用软件开发提供了接口(socket,bind,connect,send,recv,accept)。因此我们从这里出发,来分析整个网络体系结构。
1. BSD SOCKET 层
BSD socket 层主要对应ISO/OSI中的表示层,对应的函数集定义在socket.c中。其中实现了socket、bind、accept等系统调用,以及与之对应的下层响应函数(sock_socket,sock_bind,sock_accept)。
这里我们具体来分析其中实现过程:
我们通过对socket.c的分析,会发现像socket,bind,accept等套接字的系统调用函数具有一个共同的入口函数:
/*
 * sys_socketcall - common entry point for the socket system calls.
 *
 * call selects the operation (SYS_SOCKET, SYS_BIND, ...); args points to
 * the argument words supplied with the system call.  For every operation
 * the required number of argument words is first checked for readability
 * with verify_area(VERIFY_READ, ...), then each word is fetched with
 * get_fs_long() and passed, suitably cast, to the matching sock_*()
 * handler.
 *
 * Returns the verify_area() error if the argument block cannot be read,
 * -EINVAL for an unknown call number, and otherwise whatever the selected
 * sock_*() handler returns.
 */
asmlinkage int sys_socketcall(int call, unsigned long *args)
{
	int er;

	switch(call)
	{
		case SYS_SOCKET:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_socket(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_BIND:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_bind(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_CONNECT:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_connect(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				get_fs_long(args+2)));
		case SYS_LISTEN:
			er=verify_area(VERIFY_READ, args, 2 * sizeof(long));
			if(er)
				return er;
			return(sock_listen(get_fs_long(args+0),
				get_fs_long(args+1)));
		case SYS_ACCEPT:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_accept(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_GETSOCKNAME:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_getsockname(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_GETPEERNAME:
			er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
			if(er)
				return er;
			return(sock_getpeername(get_fs_long(args+0),
				(struct sockaddr *)get_fs_long(args+1),
				(int *)get_fs_long(args+2)));
		case SYS_SOCKETPAIR:
			er=verify_area(VERIFY_READ, args, 4 * sizeof(long));
			if(er)
				return er;
			return(sock_socketpair(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(unsigned long *)get_fs_long(args+3)));
		case SYS_SEND:
			er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_send(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3)));
		case SYS_SENDTO:
			er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_sendto(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3),
				(struct sockaddr *)get_fs_long(args+4),
				get_fs_long(args+5)));
		case SYS_RECV:
			er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_recv(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3)));
		case SYS_RECVFROM:
			er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
			if(er)
				return er;
			return(sock_recvfrom(get_fs_long(args+0),
				(void *)get_fs_long(args+1),
				get_fs_long(args+2),
				get_fs_long(args+3),
				(struct sockaddr *)get_fs_long(args+4),
				(int *)get_fs_long(args+5)));
		case SYS_SHUTDOWN:
			er=verify_area(VERIFY_READ, args, 2* sizeof(unsigned long));
			if(er)
				return er;
			return(sock_shutdown(get_fs_long(args+0),
				get_fs_long(args+1)));
		case SYS_SETSOCKOPT:
			er=verify_area(VERIFY_READ, args, 5*sizeof(unsigned long));
			if(er)
				return er;
			return(sock_setsockopt(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(char *)get_fs_long(args+3),
				get_fs_long(args+4)));
		case SYS_GETSOCKOPT:
			er=verify_area(VERIFY_READ, args, 5*sizeof(unsigned long));
			if(er)
				return er;
			return(sock_getsockopt(get_fs_long(args+0),
				get_fs_long(args+1),
				get_fs_long(args+2),
				(char *)get_fs_long(args+3),
				(int *)get_fs_long(args+4)));
		default:
			/* Unknown call number. */
			return(-EINVAL);
	}
}
代码1
从sys_socketcall函数的实现来看,第一个参数call表示具体被调用的应用层接口函数(如 bind);第二个参数args是一个指针,指向被调用函数(如bind函数)所需的参数。这些用户进行系统调用时传入的参数,会原封不动地传递给内核网络协议相关的底层实现函数使用。
然后调用相应的BSD SOCKET 层的处理函数(以bind为例)
流程:
bind->sys_socketcall->sock_bind
/*
 * sock_bind - BSD-layer handler for bind().
 *
 * Looks up the socket behind descriptor fd, copies the caller's address
 * into a kernel buffer with move_addr_to_kernel(), and hands the copy to
 * the protocol family's bind operation through sock->ops->bind.
 *
 * Returns 0 on success, -EBADF if fd is out of range or not open,
 * -ENOTSOCK if sockfd_lookup() finds no socket for fd, or the negative
 * error produced by move_addr_to_kernel() or by the family's bind.
 */
static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (!sock)
		return -ENOTSOCK;

	rc = move_addr_to_kernel(umyaddr, addrlen, kaddr);
	if (rc < 0)
		return rc;

	/* Only negative results are passed up; anything else reports success. */
	rc = sock->ops->bind(sock, (struct sockaddr *)kaddr, addrlen);
	return (rc < 0) ? rc : 0;
}
代码2
其中,sock->ops->bind 调用的是下一层(INET socket 层)提供的 bind 实现,详见下文。
2. INET socket层
该层对应的是OSI中的会话层,实现文件是af_inet.c。该文件中定义了inet_bind,inet_connect,inet_accept 等函数,BSD层的函数正是通过调用这些函数来完成工作的——内核对一个系统调用是层层下放的。这种调用关系如下所示:
bind->sys_socketcall->sock_bind->inet_bind
/*
 * inet_bind - bind an INET socket to a local address and port.
 *
 * sock      BSD-level socket; sock->data holds the struct sock.
 * uaddr     address to bind to, interpreted as a struct sockaddr_in.
 * addr_len  length of *uaddr.
 *
 * Returns 0 on success, or:
 *   -EIO            the sock is not in state TCP_CLOSE
 *   -EINVAL         addr_len is shorter than struct sockaddr_in, or a
 *                   non-raw socket already has a port (sk->num != 0)
 *   -EACCES         port below PROT_SOCK requested without suser()
 *   -EADDRNOTAVAIL  non-zero address that ip_chk_addr() classifies as
 *                   neither IS_MYADDR nor IS_MULTICAST
 *   -EADDRINUSE     the port conflicts with a socket already chained in
 *                   the protocol's sock_array
 */
static int inet_bind(struct socket *sock, struct sockaddr *uaddr,
int addr_len)
{
struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
struct sock *sk=(struct sock *)sock->data, *sk2;
unsigned short snum = 0 /* Stoopid compiler.. this IS ok */;
int chk_addr_ret;
/* check this error. */
if (sk->state != TCP_CLOSE)
return(-EIO);
/*
 * NOTE(review): addr_len is an int compared against an unsigned sizeof
 * value, so a negative addr_len would not be rejected here - confirm
 * callers never pass one.
 */
if(addr_len<sizeof(struct sockaddr_in))
return -EINVAL;
/* Port handling is skipped entirely for SOCK_RAW sockets. */
if(sock->type != SOCK_RAW)
{
if (sk->num != 0)
return(-EINVAL);
snum = ntohs(addr->sin_port);
/*
* We can't just leave the socket bound wherever it is, it might
* be bound to a privileged port. However, since there seems to
* be a bug here, we will leave it if the port is not privileged.
*/
/* Port 0 requested: let get_new_socknum() choose one. */
if (snum == 0)
{
snum = get_new_socknum(sk->prot, 0);
}
/* The privilege check also covers a port picked just above. */
if (snum < PROT_SOCK && !suser())
return(-EACCES);
}
chk_addr_ret = ip_chk_addr(addr->sin_addr.s_addr);
if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST)
return(-EADDRNOTAVAIL); /* Source address MUST be ours! */
if (chk_addr_ret || addr->sin_addr.s_addr == 0)
sk->saddr = addr->sin_addr.s_addr;
if(sock->type != SOCK_RAW)
{
/* Make sure we are allowed to bind here. */
/*
 * cli()/sti() bracket the walk over the chain selected by
 * snum & (SOCK_ARRAY_SIZE - 1); every early return below calls sti()
 * first.
 */
cli();
for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
sk2 != NULL; sk2 = sk2->next)
{
/* should be below! */
if (sk2->num != snum)
continue;
/*
 * Any socket already on this port is a conflict unless the binding
 * socket has reuse set; addresses have not been compared yet at this
 * point.
 */
if (!sk->reuse)
{
sti();
return(-EADDRINUSE);
}
/* Repeats the test at the top of the loop body; cannot be true here. */
if (sk2->num != snum)
continue; /* more than one */
if (sk2->saddr != sk->saddr)
continue; /* socket per slot ! -FB */
/* Same port and address: conflict unless the holder allows reuse and is not listening. */
if (!sk2->reuse || sk2->state==TCP_LISTEN)
{
sti();
return(-EADDRINUSE);
}
}
sti();
/* Re-insert the sock under its new port number. */
remove_sock(sk);
put_sock(snum, sk);
/*
 * NOTE(review): ntohs() is applied to sk->num, which inet_bind compares
 * with host-order snum above; htons() looks like the intended direction -
 * confirm.
 */
sk->dummy_th.source = ntohs(sk->num);
sk->daddr = 0;
sk->dummy_th.dest = 0;
}
return(0);
}
代码3
BSD层调用INET层是通过socket结构中的ops字段完成的。ops字段是一个指向proto_ops结构的指针,该结构主要由函数指针构成。
/*
 * struct socket - the BSD-layer view of a socket.
 *
 * The BSD layer reaches the protocol family through ops (sock_bind calls
 * sock->ops->bind); the family keeps its own per-socket state behind data
 * (inet_bind casts it to struct sock *).
 */
struct socket {
short type; /* SOCK_STREAM, ... */
socket_state state;
long flags;
struct proto_ops *ops; /* protocols do most everything */
void *data; /* protocol data */
struct socket *conn; /* server socket connected to */
struct socket *iconn; /* incomplete client conn.s */
struct socket *next;
struct wait_queue **wait; /* ptr to place to wait on */
struct inode *inode;
struct fasync_struct *fasync_list; /* Asynchronous wake up list */
};
代码4
/*
 * struct proto_ops - operations a protocol family provides to the BSD
 * socket layer.
 *
 * Apart from the leading family number every member is a function pointer
 * taking the struct socket being operated on.  Instances are initialised
 * positionally (see inet_proto_ops), so the member order is part of the
 * interface.
 */
struct proto_ops {
int family;
int (*create) (struct socket *sock, int protocol);
int (*dup) (struct socket *newsock, struct socket *oldsock);
int (*release) (struct socket *sock, struct socket *peer);
int (*bind) (struct socket *sock, struct sockaddr *umyaddr,
int sockaddr_len);
int (*connect) (struct socket *sock, struct sockaddr *uservaddr,
int sockaddr_len, int flags);
int (*socketpair) (struct socket *sock1, struct socket *sock2);
int (*accept) (struct socket *sock, struct socket *newsock,
int flags);
int (*getname) (struct socket *sock, struct sockaddr *uaddr,
int *usockaddr_len, int peer);
int (*read) (struct socket *sock, char *ubuf, int size,
int nonblock);
int (*write) (struct socket *sock, char *ubuf, int size,
int nonblock);
int (*select) (struct socket *sock, int sel_type,
select_table *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
int (*listen) (struct socket *sock, int len);
int (*send) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags);
int (*recv) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags);
int (*sendto) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags, struct sockaddr *, int addr_len);
int (*recvfrom) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags, struct sockaddr *, int *addr_len);
int (*shutdown) (struct socket *sock, int flags);
int (*setsockopt) (struct socket *sock, int level, int optname,
char *optval, int optlen);
int (*getsockopt) (struct socket *sock, int level, int optname,
char *optval, int *optlen);
int (*fcntl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
};
代码5
socket.c中大量的调用语句形如:socket->ops->bind 或者socket->ops->create。而实际上这些函数指针指向的是af_inet.c(INET层)定义的函数。如socket->ops->bind 实际上是inet_bind, socket->ops->create实际上是inet_create。
读者肯定会问:两者是怎么关联起来的呢?
参见af_inet.c文件中有一个初始化的结构体
/*
 * Operations table of the INET family.  The initializer is positional, so
 * each entry must stay in the order of the members of struct proto_ops;
 * the trailing comments name the slot each entry fills.
 */
static struct proto_ops inet_proto_ops = {
AF_INET, /* family */
inet_create, /* create */
inet_dup, /* dup */
inet_release, /* release */
inet_bind, /* bind */
inet_connect, /* connect */
inet_socketpair, /* socketpair */
inet_accept, /* accept */
inet_getname, /* getname */
inet_read, /* read */
inet_write, /* write */
inet_select, /* select */
inet_ioctl, /* ioctl */
inet_listen, /* listen */
inet_send, /* send */
inet_recv, /* recv */
inet_sendto, /* sendto */
inet_recvfrom, /* recvfrom */
inet_shutdown, /* shutdown */
inet_setsockopt, /* setsockopt */
inet_getsockopt, /* getsockopt */
inet_fcntl, /* fcntl */
};
代码6
将INET层实现的函数指针放到该结构体里,然后在初始化时把该结构体的指针注册到socket层,供socket结构体的ops字段使用。
/*
 * inet_proto_init - bring up the INET protocol family.
 *
 * Registers inet_proto_ops with the socket layer, derives seq_offset from
 * the current time, empties the sock_array tables and usage counters of
 * tcp_prot, udp_prot and raw_prot, adds every protocol on the
 * inet_protocol_base list (printing each name), and finally initialises
 * the ARP and IP modules.
 *
 * pro is not used by this function.
 */
void inet_proto_init(struct net_proto *pro)
{
	struct inet_protocol *p;
	int i;

	printk("Swansea University Computer Society TCP/IP for NET3.019\n");

	/*
	 *	Tell SOCKET that we are alive...
	 */

	/* This is what makes inet_bind and friends reachable through socket->ops. */
	(void) sock_register(inet_proto_ops.family, &inet_proto_ops);

	seq_offset = CURRENT_TIME*250;

	/*
	 *	Add all the protocols.
	 */

	for(i = 0; i < SOCK_ARRAY_SIZE; i++)
	{
		tcp_prot.sock_array[i] = NULL;
		udp_prot.sock_array[i] = NULL;
		raw_prot.sock_array[i] = NULL;
	}
	tcp_prot.inuse = 0;
	tcp_prot.highestinuse = 0;
	udp_prot.inuse = 0;
	udp_prot.highestinuse = 0;
	raw_prot.inuse = 0;
	raw_prot.highestinuse = 0;

	printk("IP Protocols: ");
	for(p = inet_protocol_base; p != NULL;)
	{
		/*
		 * The successor is read before inet_add_protocol(p) runs -
		 * presumably because that call may relink p; confirm against
		 * inet_add_protocol().
		 */
		struct inet_protocol *tmp = (struct inet_protocol *) p->next;
		inet_add_protocol(p);
		/* Names are comma-separated; the last one ends the line. */
		printk("%s%s",p->name,tmp?", ":"\n");
		p = tmp;
	}

	/*
	 *	Set the ARP module up
	 */
	arp_init();

	/*
	 *	Set the IP module up
	 */
	ip_init();
}
代码7
这样就实现了sock_bind->inet_bind 的过程。
同样,INET层对传输层函数的调用也是通过函数指针的形式来完成的,只不过在此起桥梁作用的是sock结构而已。sock结构中的prot字段是一个指向proto结构体的指针,该proto结构也主要由一些函数指针组成。另外,传输层所使用的协议不同(tcp或者udp),prot字段对应的函数集也不同。
Sock.h 文件
/*
 * struct sock - per-socket state kept by the INET layer.
 *
 * Reached from the BSD layer through socket->data (inet_bind casts it to
 * struct sock *).  prot points at the operations of the protocol in use.
 * The CONFIG_IPX, CONFIG_AX25, CONFIG_ATALK and CONFIG_IP_MULTICAST
 * sections add protocol-specific members only when the corresponding
 * option is defined.
 */
struct sock {
struct options *opt;
volatile unsigned long wmem_alloc;
volatile unsigned long rmem_alloc;
unsigned long write_seq;
unsigned long sent_seq;
unsigned long acked_seq;
unsigned long copied_seq;
unsigned long rcv_ack_seq;
unsigned long window_seq;
unsigned long fin_seq;
unsigned long urg_seq;
unsigned long urg_data;
/*
* Not all are volatile, but some are, so we
* might as well say they all are.
*/
volatile char inuse,
dead,
urginline,
intr,
blog,
done,
reuse,
keepopen,
linger,
delay_acks,
destroy,
ack_timed,
no_check,
zapped, /* In ax25 & ipx means not linked */
broadcast,
nonagle;
unsigned long lingertime;
int proc;
struct sock *next;
struct sock *prev; /* Doubly linked chain.. */
struct sock *pair;
struct sk_buff * volatile send_head;
struct sk_buff * volatile send_tail;
struct sk_buff_head back_log;
struct sk_buff *partial;
struct timer_list partial_timer;
long retransmits;
struct sk_buff_head write_queue,
receive_queue;
struct proto *prot;
struct wait_queue **sleep;
unsigned long daddr;
unsigned long saddr;
unsigned short max_unacked;
unsigned short window;
unsigned short bytes_rcv;
/* mss is min(mtu, max_window) */
unsigned short mtu; /* mss negotiated in the syn's */
volatile unsigned short mss; /* current eff. mss - can change */
volatile unsigned short user_mss; /* mss requested by user in ioctl */
volatile unsigned short max_window;
unsigned long window_clamp;
/* Port number; inet_bind compares it with ntohs(sin_port). */
unsigned short num;
volatile unsigned short cong_window;
volatile unsigned short cong_count;
volatile unsigned short ssthresh;
volatile unsigned short packets_out;
volatile unsigned short shutdown;
volatile unsigned long rtt;
volatile unsigned long mdev;
volatile unsigned long rto;
/* currently backoff isn't used, but I'm maintaining it in case
* we want to go back to a backoff formula that needs it
*/
volatile unsigned short backoff;
volatile short err;
unsigned char protocol;
volatile unsigned char state;
volatile unsigned char ack_backlog;
unsigned char max_ack_backlog;
unsigned char priority;
unsigned char debug;
unsigned short rcvbuf;
unsigned short sndbuf;
unsigned short type;
unsigned char localroute; /* Route locally only */
#ifdef CONFIG_IPX
ipx_address ipx_dest_addr;
ipx_interface *ipx_intrfc;
unsigned short ipx_port;
unsigned short ipx_type;
#endif
#ifdef CONFIG_AX25
/* Really we want to add a per protocol private area */
ax25_address ax25_source_addr,ax25_dest_addr;
struct sk_buff *volatile ax25_retxq[8];
char ax25_state,ax25_vs,ax25_vr,ax25_lastrxnr,ax25_lasttxnr;
char ax25_condition;
char ax25_retxcnt;
char ax25_xx;
char ax25_retxqi;
char ax25_rrtimer;
char ax25_timer;
unsigned char ax25_n2;
unsigned short ax25_t1,ax25_t2,ax25_t3;
ax25_digi *ax25_digipeat;
#endif
#ifdef CONFIG_ATALK
struct atalk_sock at;
#endif
/* IP 'private area' or will be eventually */
int ip_ttl; /* TTL setting */
int ip_tos; /* TOS */
struct tcphdr dummy_th;
struct timer_list keepalive_timer; /* TCP keepalive hack */
struct timer_list retransmit_timer; /* TCP retransmit timer */
struct timer_list ack_timer; /* TCP delayed ack timer */
int ip_xmit_timeout; /* Why the timeout is running */
#ifdef CONFIG_IP_MULTICAST
int ip_mc_ttl; /* Multicasting TTL */
int ip_mc_loop; /* Loopback (not implemented yet) */
char ip_mc_name[MAX_ADDR_LEN]; /* Multicast device name */
struct ip_mc_socklist *ip_mc_list; /* Group array */
#endif
/* This part is used for the timeout functions (timer.c). */
int timeout; /* What are we waiting for? */
struct timer_list timer; /* This is the TIME_WAIT/receive timer when we are doing IP */
struct timeval stamp;
/* identd */
struct socket *socket;
/* Callbacks */
void (*state_change)(struct sock *sk);
void (*data_ready)(struct sock *sk,int bytes);
void (*write_space)(struct sock *sk);
void (*error_report)(struct sock *sk);
};
/*
 * struct proto - operations and bookkeeping of one transport protocol.
 *
 * The INET layer reaches the transport protocol through sock->prot.
 * inet_proto_init resets sock_array, inuse and highestinuse of the
 * tcp_prot, udp_prot and raw_prot instances.
 */
struct proto {
struct sk_buff * (*wmalloc)(struct sock *sk,
unsigned long size, int force,
int priority);
struct sk_buff * (*rmalloc)(struct sock *sk,
unsigned long size, int force,
int priority);
void (*wfree)(struct sock *sk, struct sk_buff *skb,
unsigned long size);
void (*rfree)(struct sock *sk, struct sk_buff *skb,
unsigned long size);
unsigned long (*rspace)(struct sock *sk);
unsigned long (*wspace)(struct sock *sk);
void (*close)(struct sock *sk, int timeout);
int (*read)(struct sock *sk, unsigned char *to,
int len, int nonblock, unsigned flags);
int (*write)(struct sock *sk, unsigned char *to,
int len, int nonblock, unsigned flags);
int (*sendto)(struct sock *sk,
unsigned char *from, int len, int noblock,
unsigned flags, struct sockaddr_in *usin,
int addr_len);
int (*recvfrom)(struct sock *sk,
unsigned char *from, int len, int noblock,
unsigned flags, struct sockaddr_in *usin,
int *addr_len);
int (*build_header)(struct sk_buff *skb,
unsigned long saddr,
unsigned long daddr,
struct device **dev, int type,
struct options *opt, int len, int tos, int ttl);
int (*connect)(struct sock *sk,
struct sockaddr_in *usin, int addr_len);
struct sock * (*accept) (struct sock *sk, int flags);
void (*queue_xmit)(struct sock *sk,
struct device *dev, struct sk_buff *skb,
int free);
void (*retransmit)(struct sock *sk, int all);
void (*write_wakeup)(struct sock *sk);
void (*read_wakeup)(struct sock *sk);
int (*rcv)(struct sk_buff *buff, struct device *dev,
struct options *opt, unsigned long daddr,
unsigned short len, unsigned long saddr,
int redo, struct inet_protocol *protocol);
int (*select)(struct sock *sk, int which,
select_table *wait);
int (*ioctl)(struct sock *sk, int cmd,
unsigned long arg);
int (*init)(struct sock *sk);
void (*shutdown)(struct sock *sk, int how);
int (*setsockopt)(struct sock *sk, int level, int optname,
char *optval, int optlen);
int (*getsockopt)(struct sock *sk, int level, int optname,
char *optval, int *option);
unsigned short max_header;
unsigned long retransmits;
/* inet_bind indexes this by port & (SOCK_ARRAY_SIZE - 1) and follows sock->next. */
struct sock * sock_array[SOCK_ARRAY_SIZE];
char name[80];
int inuse, highestinuse;
};
传输层:
该层实现是tcp.c和 udp.c两个文件。分别实现tcp和udp的两种方式传输。
如果使用TCP,则prot字段指向TCP协议的操作函数集。
如果使用UDP,则prot字段指向UDP协议的操作函数集。
prot字段的具体取值可以在对inet_create函数的分析中看到。
其中 sk->prot->connect 在使用TCP时对应tcp_connect,其他类同,原理如同BSD层到INET层一样。
其中 sk->prot->connect 在使用UDP时对应udp_connect,其他类同,原理如同BSD层到INET层一样。
分析先到此为止,to be continued…