【Linux 内核网络协议栈源码剖析】af_inet.c——INET Socket层(2)

前篇 socket.c 介绍的是BSD socket 层的函数。应用系统的函数调用如socket、bind、listen、accept、connect 等函数,将通过共同的入口函数 sys_socket 调用BSD socket层的对应系统调用函数,如sock_socket、sock_bind、sock_listen、sock_accept、sock_connect 等函数(socket.c)。从前面分析得知, sock_ 函数内部又将调用下一层的函数实现,而这些下层函数就是 af_inet.c 文件中定义的函数,我们把该层取名为 INET 层。INET 的下一层即为传输层:tcp 和 udp

源码位置 net\inet\af_inet.c

BSD socket 层对应的套接字结构是 socket 结构,这是一个通用的套接字结构;INET socket 层使用的则是 sock 结构,它比 socket 更复杂,并且具体到某种协议——前者通用,后者具体。sock 结构比较庞大,我们就在各函数内部对涉及到的 sock 结构字段逐一分析。

这里从af_inet.c 文件的开头函数逐个分析.介绍函数有点多,主流函数部分额外标红。

1、检测一个端口号是否已被使用

/*
 *	See if a socket number is in use.
 *
 *	prot is the per-protocol operations structure whose sock_array hash
 *	table is searched; num is the port number to look for.  The low bits
 *	of num select one chain of the table, and that chain is walked
 *	looking for a sock whose num field equals num exactly.
 *
 *	Returns 1 if such a sock is found, 0 otherwise.  Only the table of
 *	the given protocol is consulted.
 */
static int sk_inuse(struct proto *prot, int num)
{
	struct sock *cur = prot->sock_array[num & (SOCK_ARRAY_SIZE -1 )];

	while (cur != NULL)
	{
		if (cur->num == num)
			return(1);
		cur = cur->next;
	}
	return(0);
}
一定要清楚 sock_array 这个数据结构:可以把它看作一个哈希表(不了解哈希表的读者请先自行查阅相关资料),这里采用的是分离链接法(链地址法)处理冲突。所有在用的 sock 结构都挂在这张表里。

2、获取一个新的未使用端口号

/*
 *	Pick a new socket number
 *
 *	prot is the protocol whose sock_array is searched; base is the
 *	starting port number.  A base of 0 is replaced by a value just above
 *	PROT_SOCK, and any base not above PROT_SOCK is pushed above it; both
 *	adjustments are offset by the rotating static counter below.
 *
 *	Returns the number that maps to the first empty hash chain found.
 *	If no chain is empty, the candidate on the shortest chain is taken
 *	and stepped by SOCK_ARRAY_SIZE (staying on the same chain) until
 *	sk_inuse() reports it free.
 */
unsigned short get_new_socknum(struct proto *prot, unsigned short base)
{
	/*
	 * Used to cycle through the port numbers so the
	 * chances of a confused connection drop.
	 */
	static int start=0;

	int slot;
	int best_slot = 0;
	int shortest = 32767; /* a big num. */

	if (base == 0)
		base = PROT_SOCK+1+(start % 1024);
	if (base <= PROT_SOCK)
		base += PROT_SOCK+(start % 1024);

	/* Now look through the entire array and try to find an empty ptr. */
	for (slot = 0; slot < SOCK_ARRAY_SIZE; slot++)
	{
		struct sock *walk;
		int chain_len = 0;

		/* Count the socks hanging off the chain this candidate maps to. */
		for (walk = prot->sock_array[(slot+base+1) &(SOCK_ARRAY_SIZE -1)];
		     walk != NULL; walk = walk->next)
			chain_len++;

		if (chain_len == 0)
		{
			/* Empty chain: advance the rotating offset and use it. */
			start =(slot+1+start )%1024;
			return(slot+base+1);
		}
		if (chain_len < shortest)
		{
			/* Remember the shortest chain seen so far. */
			best_slot = slot;
			shortest = chain_len;
		}
	}

	/* Now make sure the one we want is not in use. */
	while (sk_inuse(prot, base +best_slot+1))
		best_slot += SOCK_ARRAY_SIZE;

	return(best_slot+base+1);
}


3、将具有确定端口号的新 sock 结构加入到sock_array 中

/*
 *	Add a socket into the socket tables by number.
 *
 *	num is the local port number; sk is the sock to insert.  sk->num is
 *	set to num and sk is linked into the chain of sk->prot->sock_array
 *	selected by the low bits of num.  The protocol's inuse counter is
 *	incremented and highestinuse is raised if it was exceeded.
 *
 *	When the chain is not empty, sk is inserted in front of the first
 *	entry whose saddr has no bits in common with a mask derived from
 *	sk->saddr, or appended at the end if there is no such entry.
 *
 *	Interrupts are disabled while the table is modified.  The caller's
 *	interrupt state, saved on entry, is restored on every exit path
 *	instead of unconditionally enabling interrupts with sti().
 */
void put_sock(unsigned short num, struct sock *sk)
{
	struct sock *sk1;
	struct sock *sk2;
	int mask;
	unsigned long flags;

	sk->num = num;
	sk->next = NULL;
	num = num &(SOCK_ARRAY_SIZE -1);

	/* We can't have an interrupt re-enter here. */
	save_flags(flags);
	cli();

	sk->prot->inuse += 1;
	if (sk->prot->highestinuse < sk->prot->inuse)
		sk->prot->highestinuse = sk->prot->inuse;

	/* Empty chain: sk simply becomes its head. */
	if (sk->prot->sock_array[num] == NULL) 
	{
		sk->prot->sock_array[num] = sk;
		restore_flags(flags);
		return;
	}

	/* The mask computation only reads sk, so let interrupts back in. */
	restore_flags(flags);

	/*
	 * Widen the mask one byte at a time (0xff000000, 0xffff0000, ...)
	 * and stop at the first width for which the masked local address is
	 * neither zero nor all ones; the mask is then shifted left by one
	 * byte.  NOTE(review): this looks like a netmask guess used only to
	 * order the chain -- confirm against the lookup code.
	 */
	for(mask = 0xff000000; mask != 0xffffffff; mask = (mask >> 8) | mask) 
	{
		if ((mask & sk->saddr) &&
		    (mask & sk->saddr) != (mask & 0xffffffff)) 
		{
			mask = mask << 8;
			break;
		}
	}

	cli();
	sk1 = sk->prot->sock_array[num];
	for(sk2 = sk1; sk2 != NULL; sk2=sk2->next) 
	{
		if (!(sk2->saddr & mask)) 
		{
			if (sk2 == sk1) 
			{
				/* Insert in front of the current head. */
				sk->next = sk->prot->sock_array[num];
				sk->prot->sock_array[num] = sk;
			}
			else
			{
				/* Insert between sk1 (predecessor) and sk2. */
				sk->next = sk2;
				sk1->next= sk;
			}
			restore_flags(flags);
			return;
		}
		sk1 = sk2;
	}

	/* Goes at the end. */
	sk->next = NULL;
	if (sk1 == NULL)
	{
		/*
		 * The chain was emptied while interrupts were enabled above,
		 * so there is no tail to append to; sk becomes the head.
		 */
		sk->prot->sock_array[num] = sk;
	}
	else
		sk1->next = sk;
	restore_flags(flags);
}

4、从sock_array 中移除一个指定 sock 结构

/*
 *	Remove a socket from the socket tables.
 */
//移除一个指定sock结构
static void remove_sock(struct sock *sk1)
{
	struct sock *sk2;
	unsigned long flags;

	if (!sk1->prot) 
	{
		printk("sock.c: remove_sock: sk1->prot == NULL\n");
		return;
	}

	/* We can't have this changing out from under us. */
	save_flags(flags);
	cli();
	//sock_array可以看做是一个哈希表结构,清楚这个,就很容易理解其内部关系了
	//数据结构是骨架,所以阅读源码的时候,要清楚其采用的是什么结构思想
    //首先定位到哈希表数组索引
	sk2 = sk1->prot->sock_array[sk1->num &(SOCK_ARRAY_SIZE -1)];
	if (sk2 == sk1)//如果第一个恰好是要移除的 
	{
		sk1->prot->inuse -= 1;//当前使用数-1
		//就指向下一个
		sk1->prot->sock_array[sk1->num &(SOCK_ARRAY_SIZE -1)] = sk1->next;
		restore_flags(flags);
		return;
	}
    //不是第一个就查找链表
	while(sk2 && sk2->next != sk1) 
	{
		sk2 = sk2->next;
	}
    //查找成功
	if (sk2) 
	{
		sk1->prot->inuse -= 1;
		sk2->next = sk1->next;//可以看出,这里只是将其从链表中移除,其实体结构还是存在的
		restore_flags(flags);
		return;
	}
	restore_flags(flags);//恢复现场
}

5、销毁一个套接字。INET 域对应的套接字说的就是 sock 结构
/*
 *	Destroy an AF_INET socket
 *
 *	Unlinks sk from the socket table, stops its timers, frees the
 *	buffers queued on it and finally either frees the sock structure or,
 *	if buffer memory is still accounted to it, arms a TIME_DESTROY timer
 *	so that destruction can be retried later.
 */
void destroy_sock(struct sock *sk)
{
	struct sk_buff *skb;

  	sk->inuse = 1;			/* just to be safe. */

  	/* In case it's sleeping somewhere. */
	/* Only done while the sock is not yet marked dead. */
  	if (!sk->dead) 
  		sk->write_space(sk);
	/* Unlink the sock from its protocol's sock_array chain. */
  	remove_sock(sk);
  
  	/* Stop the sock's timers: delete_timer() here, the retransmit timer below. */
  	delete_timer(sk);
  	/* Nor send them */
	del_timer(&sk->retransmit_timer);

	/* Free every buffer handed back by tcp_dequeue_partial(). */
	while ((skb = tcp_dequeue_partial(sk)) != NULL) {
		IS_SKB(skb);
		kfree_skb(skb, FREE_WRITE);
	}

	/* Empty the write queue. */
  	while((skb = skb_dequeue(&sk->write_queue)) != NULL) {
		IS_SKB(skb);
		kfree_skb(skb, FREE_WRITE);
  	}
  	
  	/*
  	 *	Don't discard received data until the user side kills its
  	 *	half of the socket.
  	 */

	if (sk->dead) 
	{
	/* Drop everything still waiting on the receive queue. */
  		while((skb=skb_dequeue(&sk->receive_queue))!=NULL) 
  		{
		/*
		 * This will take care of closing sockets that were
		 * listening and didn't accept everything.
		 */
			if (skb->sk != NULL && skb->sk != sk) 
			{
				IS_SKB(skb);
				skb->sk->dead = 1;
				skb->sk->prot->close(skb->sk, 0);
			}
			IS_SKB(skb);
			kfree_skb(skb, FREE_READ);
		}
	}	

	/* Free the buffers chained from send_head, with interrupts off. */
	cli();
	for(skb = sk->send_head; skb != NULL; )
	{
		struct sk_buff *skb2;

		/*
		 * We need to remove skb from the transmit queue,
		 * or maybe the arp queue.
		 */
		if (skb->next  && skb->prev) {
/*			printk("destroy_sock: unlinked skb\n");*/
			IS_SKB(skb);
			skb_unlink(skb);
		}
		skb->dev = NULL;
		/* Fetch the successor (link3) before the buffer is freed. */
		skb2 = skb->link3;
		kfree_skb(skb, FREE_WRITE);
		skb = skb2;
	}
	sk->send_head = NULL;
	sti();

  	/* Empty the back_log queue. */
  	while((skb=skb_dequeue(&sk->back_log))!=NULL) 
  	{
		/* this should never happen. */
/*		printk("cleaning back_log\n");*/
		kfree_skb(skb, FREE_READ);
	}

	/* Now if it has a half accepted/ closed socket. */
	if (sk->pair) 
	{
		sk->pair->dead = 1;
		sk->pair->prot->close(sk->pair, 0);
		sk->pair = NULL;
  	}

	/*
	 * Now if everything is gone we can free the socket
	 * structure, otherwise we need to keep it around until
	 * everything is gone.
	 */
     /* Free only when the sock is dead and no read or write memory is still accounted to it. */
	  if (sk->dead && sk->rmem_alloc == 0 && sk->wmem_alloc == 0) 
	  {
		kfree_s((void *)sk,sizeof(*sk));
	  } 
	  /* Otherwise flag the sock for destruction, clear inuse and arm the TIME_DESTROY timer. */
	  else 
	  {
		/* this should never happen. */
		/* actually it can if an ack has just been sent. */
		sk->destroy = 1;
		sk->ack_backlog = 0;
		sk->inuse = 0;
		reset_timer(sk, TIME_DESTROY, SOCK_DESTROY_TIME);
  	}
}

6、用于设置和读取套接字的有关信息

/*
 *	The routines beyond this point handle the behaviour of an AF_INET
 *	socket object. Mostly it punts to the subprotocols of IP to do
 *	the work.
 */

/*
 *	fcntl operation of an INET socket, reached from the BSD socket
 *	layer through sock->ops->fcntl.  Only the owner commands are
 *	handled:
 *
 *	F_SETOWN  stores arg in sk->proc; returns 0, or -EPERM when the
 *		  caller is not the superuser and arg names neither the
 *		  caller's pid nor (negated) its process group.
 *	F_GETOWN  returns sk->proc.
 *
 *	Any other cmd yields -EINVAL.
 */
static int inet_fcntl(struct socket *sock, unsigned int cmd, unsigned long arg)
{
	/* For an INET socket, sock->data points at the sock structure. */
	struct sock *sk = (struct sock *) sock->data;

	if (cmd == F_GETOWN)
		return(sk->proc);

	if (cmd != F_SETOWN)
		return(-EINVAL);

	/*
	 * This is a little restrictive, but it's the only
	 * way to make sure that you can't send a sigurg to
	 * another process.
	 */
	if (!suser() && current->pgrp != -arg && current->pid != arg)
		return(-EPERM);

	sk->proc = arg;
	return(0);
}

7、设置和读取选项。函数内部又将调用下一层函数

/*
 *	Set socket options on an inet socket.
 *
 *	SOL_SOCKET options are passed to sock_setsockopt(); every other
 *	level goes to the protocol's own setsockopt operation.  Returns
 *	-EOPNOTSUPP when the protocol provides none, otherwise whatever the
 *	called handler returns.
 */
static int inet_setsockopt(struct socket *sock, int level, int optname,
		    char *optval, int optlen)
{
	struct sock *sk = (struct sock *) sock->data;

	if (level != SOL_SOCKET)
	{
		/* Protocol-level option: hand it to the transport protocol. */
		if (sk->prot->setsockopt != NULL)
			return sk->prot->setsockopt(sk,level,optname,optval,optlen);
		return(-EOPNOTSUPP);
	}

	return sock_setsockopt(sk,level,optname,optval,optlen);
}

/*
 *	Get a socket option on an AF_INET socket.
 *
 *	SOL_SOCKET options are passed to sock_getsockopt(); every other
 *	level goes to the protocol's own getsockopt operation.  Returns
 *	-EOPNOTSUPP when the protocol provides none, otherwise whatever the
 *	called handler returns.
 */
static int inet_getsockopt(struct socket *sock, int level, int optname,
		    char *optval, int *optlen)
{
	struct sock *sk = (struct sock *) sock->data;

	if (level != SOL_SOCKET)
	{
		/* Protocol-level option: hand it to the transport protocol. */
		if (sk->prot->getsockopt != NULL)
			return sk->prot->getsockopt(sk,level,optname,optval,optlen);
		return(-EOPNOTSUPP);
	}

	return sock_getsockopt(sk,level,optname,optval,optlen);
}

8、自动绑定本地端口

/*
 *	Automatically bind an unbound socket.
 *
 *	If sk has no local port yet (sk->num == 0), a free port number is
 *	obtained from get_new_socknum(), the sock is entered into the
 *	protocol's socket table and the source port of the template TCP
 *	header (dummy_th) is filled in.  A sock that already has a port is
 *	left untouched.
 *
 *	Returns 0 on success or -EAGAIN when no port number was obtained.
 */
static int inet_autobind(struct sock *sk)
{
	/* We may need to bind the socket. */
	if (sk->num == 0) 
	{
		sk->num = get_new_socknum(sk->prot, 0);
		if (sk->num == 0) 
			return(-EAGAIN);
		put_sock(sk->num, sk);
		/*
		 * sk->num is kept in host byte order while the header field
		 * is in network byte order, so the conversion wanted here
		 * is host-to-network: htons(), not ntohs().  The two
		 * perform the same byte swap, so the stored value does not
		 * change.
		 */
		sk->dummy_th.source = htons(sk->num);
	}
	return 0;
}


9、监听客户端的请求,这是对应用编程 listen函数的描述。这个函数为BSD socket 层 sock_listen函数的下一层调用,可以看出主要就是对 sock 结构中state字段的设置

listen -> sys_socket -> sock_listen -> inet_listen   完(完表示这个函数到这层就结束了)

/*
 *	Move a socket into listening state.
 *
 *	Called from the BSD layer (sock_listen).  The sock is auto-bound to
 *	a local port first if it has none.  backlog is clamped to 128 and
 *	stored in max_ack_backlog; if the sock is not already in TCP_LISTEN
 *	its ack_backlog is reset and the state is switched to TCP_LISTEN.
 *
 *	Returns 0 on success, -EAGAIN if auto-binding failed.
 */
static int inet_listen(struct socket *sock, int backlog)
{
	struct sock *sk = (struct sock *) sock->data;

	if (inet_autobind(sk) != 0)
		return -EAGAIN;

	/* We might as well re use these. */ 
	/*
	 * note that the backlog is "unsigned char", so truncate it
	 * somewhere. We might as well truncate it to what everybody
	 * else does..
	 */
	/* The unsigned compare also clamps negative backlog values. */
	sk->max_ack_backlog = ((unsigned) backlog > 128) ? 128 : backlog;

	/* Already listening: only the limit above is refreshed. */
	if (sk->state == TCP_LISTEN)
		return(0);

	sk->ack_backlog = 0;
	sk->state = TCP_LISTEN;
	return(0);
}

10、建立sock 结构。sock_socket 的下一层调用。

socket -> sys_socket -> sock_socket -> inet_create 完

/*
 *	Create an inet socket.
 *
 *	FIXME: Gcc would generate much better code if we set the parameters
 *	up in in-memory structure order. Gcc68K even more so
 */
 //该函数被上层sock_socket函数调用,用于创建一个socket套接字对应的sock结构并对其进行初始化
 //socket是通用结构,sock是具体到某种协议的结构
 //代码是一大串,功能就是建立套接字对应的sock结构并对其进行初始化
static int inet_create(struct socket *sock, int protocol)
{
	struct sock *sk;
	struct proto *prot;
	int err;
    //分配一个sock结构
	sk = (struct sock *) kmalloc(sizeof(*sk), GFP_KERNEL);
	if (sk == NULL) 
		return(-ENOBUFS);
	sk->num = 0;//本地端口号
	sk->reuse = 0;
	//根据类型进行相关字段的赋值
	//关于哪种类型与协议的对应关系,请参考<UNP 卷1>
	switch(sock->type) 
	{
		case SOCK_STREAM:
		case SOCK_SEQPACKET:
			if (protocol && protocol != IPPROTO_TCP) 
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPROTONOSUPPORT);
			}
			protocol = IPPROTO_TCP;//tcp协议
			sk->no_check = TCP_NO_CHECK;
			//这个prot变量表明了套接字使用的是何种协议
			//然后使用的则是对应协议的操作函数
			prot = &tcp_prot;
			break;

		case SOCK_DGRAM:
			if (protocol && protocol != IPPROTO_UDP) 
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPROTONOSUPPORT);
			}
			protocol = IPPROTO_UDP;//udp协议
			sk->no_check = UDP_NO_CHECK;//不使用校验
			prot=&udp_prot;
			break;
      
		case SOCK_RAW:
			if (!suser()) //超级用户才能处理
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPERM);
			}
			if (!protocol)// 原始套接字类型,这里表示端口号
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPROTONOSUPPORT);
			}
			prot = &raw_prot;
			sk->reuse = 1;
			sk->no_check = 0;	/*
						 * Doesn't matter no checksum is
						 * performed anyway.
						 */
			sk->num = protocol;//本地端口号
			break;

		case SOCK_PACKET:
			if (!suser()) 
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPERM);
			}
			if (!protocol) 
			{
				kfree_s((void *)sk, sizeof(*sk));
				return(-EPROTONOSUPPORT);
			}
			prot = &packet_prot;
			sk->reuse = 1;
			sk->no_check = 0;	/* Doesn't matter no checksum is
						 * performed anyway.
						 */
			sk->num = protocol;
			break;

		default://不符合以上任何类型,则返回
			kfree_s((void *)sk, sizeof(*sk));
			return(-ESOCKTNOSUPPORT);
	}
	sk->socket = sock;//建立与其对应的socket之间的关系
#ifdef CONFIG_TCP_NAGLE_OFF
	sk->nonagle = 1;//如果定义了Nagle算法
#else    
	sk->nonagle = 0;
#endif  
	//各种初始化
	sk->type = sock->type;
	sk->stamp.tv_sec=0;
	sk->protocol = protocol;
	sk->wmem_alloc = 0;
	sk->rmem_alloc = 0;
	sk->sndbuf = SK_WMEM_MAX;
	sk->rcvbuf = SK_RMEM_MAX;
	sk->pair = NULL;
	sk->opt = NULL;
	sk->write_seq = 0;
	sk->acked_seq = 0;
	sk->copied_seq = 0;
	sk->fin_seq = 0;
	sk->urg_seq = 0;
	sk->urg_data = 0;
	sk->proc = 0;
	sk->rtt = 0;				/*TCP_WRITE_TIME << 3;*/
	sk->rto = TCP_TIMEOUT_INIT;		/*TCP_WRITE_TIME*/
	sk->mdev = 0;
	sk->backoff = 0;
	sk->packets_out = 0;
	sk->cong_window = 1; /* start with only sending one packet at a time. */
	sk->cong_count = 0;
	sk->ssthresh = 0;
	sk->max_window = 0;
	sk->urginline = 0;
	sk->intr = 0;
	sk->linger = 0;
	sk->destroy = 0;
	sk->priority = 1;
	sk->shutdown = 0;
	sk->keepopen = 0;
	sk->zapped = 0;
	sk->done = 0;
	sk->ack_backlog = 0;
	sk->window = 0;
	sk->bytes_rcv = 0;
	sk->state = TCP_CLOSE;
	sk->dead = 0;
	sk->ack_timed = 0;
	sk->partial = NULL;
	sk->user_mss = 0;
	sk->debug = 0;

	/* this is how many unacked bytes we will accept for this socket.  */
	sk->max_unacked = 2048; /* needs to be at most 2 full packets. */

	/* how many packets we should send before forcing an ack. 
	   if this is set to zero it is the same as sk->delay_acks = 0 */
	sk->max_ack_backlog = 0;
	sk->inuse = 0;
	sk->delay_acks = 0;
	skb_queue_head_init(&sk->write_queue);
	skb_queue_head_init(&sk->receive_queue);
	sk->mtu = 576;//最大传输单元
	sk->prot = prot;
	sk->sleep = sock->wait;
	sk->daddr = 0;//远端地址
	sk->saddr = 0 /* 本地地址 */;
	sk->err = 0;
	sk->next = NULL;
	sk->pair = NULL;
	sk->send_tail = NULL;
	sk->send_head = NULL;
	sk->timeout = 0;
	sk->broadcast = 0;
	sk->localroute = 0;
	init_timer(&sk->timer);
	init_timer(&sk->retransmit_timer);
	sk->timer.data = (unsigned long)sk;
	sk->timer.function = &net_timer;
	skb_queue_head_init(&sk->back_log);
	sk->blog = 0;
	sock->data =(void *) sk;
	sk->dummy_th.doff = sizeof(sk->dummy_th)/4;
	sk->dummy_th.res1=0;
	sk->dummy_th.res2=0;
	sk->dummy_th.urg_ptr = 0;
	sk->dummy_th.fin = 0;
	sk->dummy_th.syn = 0;
	sk->dummy_th.rst = 0;
	sk->dummy_th.psh = 0;
	sk->dummy_th.ack = 0;
	sk->dummy_th.urg = 0;
	sk->dummy_th.dest = 0;
	sk->ip_tos=0;
	sk->ip_ttl=64;
#ifdef CONFIG_IP_MULTICAST
	sk->ip_mc_loop=1;
	sk->ip_mc_ttl=1;
	*sk->ip_mc_name=0;
	sk->ip_mc_list=NULL;
#endif
  	
	sk->state_change = def_callback1;
	sk->data_ready = def_callback2;
	sk->write_space = def_callback3;
	sk->error_report = def_callback1;

	if (sk->num) //如果分配了本地端口号
	{
	/*
	 * It assumes that any protocol which allows
	 * the user to assign a number at socket
	 * creation time automatically
	 * shares.
	 */
	 //对面前面已经有的注释的代码部分,以后将不再赘述注释
		put_sock(sk->num, sk);//参考前面注释吧
		sk->dummy_th.source = ntohs(sk->num);
	}

	if (sk->prot->init) //调用init函数
	{
		err = sk->prot->init(sk);
		if (err != 0) 
		{
			destroy_sock(sk);//出错了,就销毁
			return(err);
		}
	}
	return(0);
}
看到没,创建 socket 的时候,只是分配 file 结构、分配文件描述符、关联 inode,完全没有涉及到任何协议;而创建 sock 时则关联到了具体协议。所以 socket 是一个通用结构,sock 则是一个具体的结构。网络协议栈层层封装,高内聚,低耦合,使得某层结构的变化不会影响到其余层,多好的设计啊


11、复制一个套接字。服务器端用于实际通信的套接字与其监听套接字是不同的。oldsocket表示监听套接字结构,newsocket表示新创建的用于实际通信的套接字结构

/*
 *	Duplicate a socket.
 */
 //复制一个套接字,内部实现很简单,就是转调用create。套接字的dup是这样,
 //那么其余文件描述符的dup也应该是这样
static int inet_dup(struct socket *newsock, struct socket *oldsock)
{
	return(inet_create(newsock,((struct sock *)(oldsock->data))->protocol));
}

12、对套接字状态的检测。套接字状态的迁移,参考前面博文: TCP状态转移

/*
 * Return 1 if we still have things to send in our buffers.
 *
 * In practice this is a test on the TCP state: the result is 1 while the
 * sock is in TCP_FIN_WAIT1, TCP_CLOSING or TCP_LAST_ACK and 0 in every
 * other state.
 */
static inline int closing(struct sock * sk)
{
	/* Read the state once so all three comparisons see the same value. */
	int st = sk->state;

	return (st == TCP_FIN_WAIT1 || st == TCP_CLOSING || st == TCP_LAST_ACK);
}

13、inet_release 函数继续完成BSD 层sock_release函数,该函数的关闭操作部分将继续下调用close函数

/*
 *	The peer socket should always be NULL (or else). When we call this
 *	function we are destroying the object and from then on nobody
 *	should refer to it.
 *
 *	INET-layer half of closing a socket, called from the BSD layer
 *	(sock_release).  Starts the protocol-level close, optionally waits
 *	for it to make progress when linger is set, marks the sock dead and
 *	detaches it from the generic socket.  Always returns 0.
 */
static int inet_release(struct socket *sock, struct socket *peer)
{
    /* The sock that backs this socket; nothing to do if there is none. */
	struct sock *sk = (struct sock *) sock->data;
	if (sk == NULL) 
		return(0);
    /* Invoke the sock's state_change callback. */
	sk->state_change(sk);

	/* Start closing the connection.  This may take a while. */

#ifdef CONFIG_IP_MULTICAST
	/* Applications forget to leave groups before exiting */
    /* so the multicast state attached to the sock is dropped here. */
	ip_mc_drop_socket(sk);
#endif
	/*
	 * If linger is set, we don't return until the close
	 * is complete.  Other wise we return immediately. The
	 * actually closing is done the same either way.
	 *
	 * If the close is due to the process exiting, we never
	 * linger..
	 */
    /* No linger, or the process is exiting: close and return at once. */
	if (sk->linger == 0 || (current->flags & PF_EXITING))
	{
		sk->prot->close(sk,0);/* hand the close to the protocol */
		sk->dead = 1;/* mark the sock dead */
	} 
	else/* linger: close, then wait for the close to progress */
	{
		sk->prot->close(sk, 0);
		cli();
		if (sk->lingertime)/* a non-zero lingertime bounds the wait */
			current->timeout = jiffies + HZ*sk->lingertime;/* deadline in jiffies */
        /*
         * Sleep while closing() still reports one of the closing TCP
         * states and current->timeout is non-zero; a pending unblocked
         * signal ends the wait early.
         */
		while(closing(sk) && current->timeout>0) 
		{
			interruptible_sleep_on(sk->sleep);/* sleep on the sock's wait queue */
			if (current->signal & ~current->blocked) 
			{
				break;
#if 0
				/* not working now - closes can't be restarted */
				sti();
				current->timeout=0;
				return(-ERESTARTSYS);
#endif
			}
		}
		current->timeout=0;/* cancel the timeout */
		sti();
		sk->dead = 1;/* mark the sock dead */
	}
	sk->inuse = 1;/* flag the sock as in use before release_sock(); presumably this acts as a lock -- confirm */

	/* Detach the sock from the generic socket. */
	sock->data = NULL;
	release_sock(sk);
	/* NOTE(review): sk is written after release_sock(); confirm release_sock() cannot free sk synchronously. */
	sk->socket = NULL;
	return(0);
}

14、inet_bind 是 sock_bind 调用的下层函数,完成本地地址绑定。如果没有具体指定端口号(端口号为 0),将由系统自动分配一个未使用的端口号。

bind -> sys_socket -> sock_bind -> inet_bind  完

/* this needs to be changed to disallow
   the rebinding of sockets.   What error
   should it return? */
/*
 *	Bind a local address (IP address and port) to an INET socket.
 *	Called from the BSD layer (sock_bind).
 *
 *	For every socket type except SOCK_RAW the port is taken from the
 *	supplied address (a port of 0 asks for an automatically chosen one),
 *	checked against the sockets already in the protocol's table, and the
 *	sock is re-entered into the table under the new port.
 *
 *	Returns 0 on success, or
 *	  -EIO            the sock is not in TCP_CLOSE,
 *	  -EINVAL         addr_len is too short, or the sock already has a port,
 *	  -EACCES         port below PROT_SOCK requested without superuser,
 *	  -EADDRNOTAVAIL  address is neither 0, a local address nor multicast,
 *	  -EADDRINUSE     port conflict that address reuse does not permit.
 */
static int inet_bind(struct socket *sock, struct sockaddr *uaddr,
	       int addr_len)
{
	struct sockaddr_in *addr=(struct sockaddr_in *)uaddr;
	struct sock *sk=(struct sock *)sock->data, *sk2;
	unsigned short snum = 0 /* Stoopid compiler.. this IS ok */;
	int chk_addr_ret;

	/* check this error. */
	/* Binding is only allowed while the sock is closed. */
	if (sk->state != TCP_CLOSE)
		return(-EIO);
	/*
	 * NOTE(review): addr_len is an int compared against an unsigned
	 * sizeof, so a negative value would pass this test -- confirm the
	 * caller validates the length.
	 */
	if(addr_len<sizeof(struct sockaddr_in))
		return -EINVAL;

	if(sock->type != SOCK_RAW)
	{
		/* A sock that already has a port cannot be bound again. */
		if (sk->num != 0) 
			return(-EINVAL);

		/* sin_port arrives in network byte order. */
		snum = ntohs(addr->sin_port);

		/*
		 * We can't just leave the socket bound wherever it is, it might
		 * be bound to a privileged port. However, since there seems to
		 * be a bug here, we will leave it if the port is not privileged.
		 */
		if (snum == 0) 
		{
			snum = get_new_socknum(sk->prot, 0);
		}
		/* Ports below PROT_SOCK are reserved for the superuser. */
		if (snum < PROT_SOCK && !suser()) 
			return(-EACCES);
	}

	/* The address must be 0, one of our own, or a multicast address. */
	chk_addr_ret = ip_chk_addr(addr->sin_addr.s_addr);
	if (addr->sin_addr.s_addr != 0 && chk_addr_ret != IS_MYADDR && chk_addr_ret != IS_MULTICAST)
		return(-EADDRNOTAVAIL);	/* Source address MUST be ours! */

	/* Record the requested local address (which may be 0). */
	if (chk_addr_ret || addr->sin_addr.s_addr == 0)
		sk->saddr = addr->sin_addr.s_addr;
	
	if(sock->type != SOCK_RAW)
	{
		/* Make sure we are allowed to bind here. */
		cli();
	
		/*
		 * Look at every sock on the chain for this port.  Entries
		 * with a different port number are skipped; an entry with
		 * the same port is a conflict unless this sock allows reuse
		 * and the entry either has a different local address or
		 * allows reuse itself and is not listening.
		 */
		for(sk2 = sk->prot->sock_array[snum & (SOCK_ARRAY_SIZE -1)];
					sk2 != NULL; sk2 = sk2->next) 
		{
		/* should be below! */
			if (sk2->num != snum) 
				continue;
			if (!sk->reuse)
			{
				sti();
				return(-EADDRINUSE);
			}
			
			if (sk2->saddr != sk->saddr) 
				continue;	/* socket per slot ! -FB */
			if (!sk2->reuse || sk2->state==TCP_LISTEN) 
			{
				sti();
				return(-EADDRINUSE);
			}
		}
		sti();

		/* Move the sock to the chain that belongs to its new port. */
		remove_sock(sk);
		put_sock(snum, sk);
		/*
		 * Host-to-network conversion, hence htons() rather than
		 * ntohs(); both perform the same byte swap, so the stored
		 * value does not change.
		 */
		sk->dummy_th.source = htons(sk->num);
		sk->daddr = 0;		/* no remote address yet */
		sk->dummy_th.dest = 0;	/* no remote port yet */
	}
	return(0);
}

15、connect -> sys_socket -> sock_connect -> inet_connect -> ...,同上面inet_bind 一样,继续完成未完成的使命。该函数涉及到 tcp 的三次握手

从这个函数我们可以得知。客户端自动绑定端口发生在connect函数中(inet_connect),也可以看到tcp 三次握手的状态迁移

connect -> sys_socket -> sock_connect -> inet_connect ->  connect(下层传输层函数,对于tcp协议则是调用 tcp_connect)

/*
 *	Connect to a remote host. There is regrettably still a little
 *	TCP 'magic' in here.
 */
 //完成套接字的连接请求操作,这是客户端主动向服务器端发送请求
static int inet_connect(struct socket *sock, struct sockaddr * uaddr,
		  int addr_len, int flags)
{
	struct sock *sk=(struct sock *)sock->data;
	int err;
	sock->conn = NULL;
    //如果处于正在连接过程中,且tcp对应的状态
	if (sock->state == SS_CONNECTING && tcp_connected(sk->state))
	{
		sock->state = SS_CONNECTED;//直接设置字段为已经连接
		/* Connection completing after a connect/EINPROGRESS/select/connect */
		return 0;	/* Rock and roll */
	}
    //如果正在连接过程中,且是tcp协议
	if (sock->state == SS_CONNECTING && sk->protocol == IPPROTO_TCP && (flags & O_NONBLOCK)) {
		if (sk->err != 0)
		{
			err=sk->err;
			sk->err=0;
			return -err;
		}
		//返回正在进行状态
		return -EALREADY;	/* Connecting is currently in progress */
	}
  	//没有连接
	if (sock->state != SS_CONNECTING) 
	{
		/* We may need to bind the socket. */
		if(inet_autobind(sk)!=0)//自动绑定一个端口号,客户端自动绑定端口号是在connect函数中实现的
			return(-EAGAIN);
		if (sk->prot->connect == NULL) //不支持该项操作,没有指定操作函数
			return(-EOPNOTSUPP);
		//转调用connect函数
		err = sk->prot->connect(sk, (struct sockaddr_in *)uaddr, addr_len);
		if (err < 0) 
			return(err);
  		sock->state = SS_CONNECTING;//设置状态字段,表示正在连接过程中
	}
	//这个状态下,这是关闭信号。各个状态描述,参考下面链接
	http://blog.csdn.net/wenqian1991/article/details/40110703
	if (sk->state > TCP_FIN_WAIT2 && sock->state==SS_CONNECTING)
	{
		sock->state=SS_UNCONNECTED;
		cli();
		err=sk->err;
		sk->err=0;
		sti();
		return -err;
	}
    //没有建立,就是在正在建立的路上
	if (sk->state != TCP_ESTABLISHED &&(flags & O_NONBLOCK)) 
	  	return(-EINPROGRESS);//过程正在处理

	cli(); /* avoid the race condition */
	//等待上面的下层函数返回
	while(sk->state == TCP_SYN_SENT || sk->state == TCP_SYN_RECV) 
	{
		interruptible_sleep_on(sk->sleep);
		if (current->signal & ~current->blocked) 
		{
			sti();
			return(-ERESTARTSYS);
		}
		/* This fixes a nasty in the tcp/ip code. There is a hideous hassle with
		   icmp error packets wanting to close a tcp or udp socket. */
		if(sk->err && sk->protocol == IPPROTO_TCP)
		{
			sti();
			sock->state = SS_UNCONNECTED;
			err = -sk->err;
			sk->err=0;
			return err; /* set by tcp_err() */
		}
	}
	sti();
	sock->state = SS_CONNECTED;//成功建立连接

	if (sk->state != TCP_ESTABLISHED && sk->err) 
	{
		sock->state = SS_UNCONNECTED;
		err=sk->err;
		sk->err=0;
		return(-err);
	}
	return(0);
}

16、inet_accept。sock_accept 在调用 inet_accept 函数之前,已经使用 inet_dup 函数为 newsock 创建了对应的 sock 结构,newsock 的 data 字段已经指向了该 sock 结构。inet_dup 调用 inet_create,创建了一个与原 sock 结构协议、类型一致的新 sock 结构。

该 inet_accept 内部又将调用下层函数。学习内核网络协议栈,就得习惯这种时不时调用下层函数的伎俩。

accept -> sys_socket -> sock_accept -> inet_accept -> accept(下层函数)

/*
 *	Accept a pending connection. The TCP layer now gives BSD semantics.
 *
 *	sock is the listening socket; newsock is the socket that will carry
 *	the accepted connection.  The new sock comes from the protocol's
 *	accept operation (or from sk1->pair, left there by an interrupted
 *	earlier call) and is attached to newsock.
 *
 *	Returns 0 on success, -EOPNOTSUPP if the protocol has no accept
 *	operation, -ERESTARTSYS if a signal interrupted the wait, or the
 *	negated error recorded in the listening or the new sock.
 */
static int inet_accept(struct socket *sock, struct socket *newsock, int flags)
{
	struct sock *sk1, *sk2;
	int err;

	sk1 = (struct sock *) sock->data;

	/*
	 * We've been passed an extra socket.
	 * We need to free it up because the tcp module creates
	 * its own when it accepts one.
	 */
	 /* If newsock already carries a sock, detach it, mark it dead and destroy it. */
	if (newsock->data)
	{
	  	struct sock *sk=(struct sock *)newsock->data;
	  	newsock->data=NULL;
	  	sk->dead = 1;
	  	destroy_sock(sk);
	}
  
	if (sk1->prot->accept == NULL) /* the protocol provides no accept operation */
		return(-EOPNOTSUPP);

	/* Restore the state if we have been interrupted, and then returned. */
	/* A sock parked in sk1->pair by an interrupted call is taken first. */
	if (sk1->pair != NULL ) 
	{
		sk2 = sk1->pair;
		sk1->pair = NULL;
	} 
	else
	{
		sk2 = sk1->prot->accept(sk1,flags);/* ask the protocol for a connection */
		if (sk2 == NULL) 
		{
			if (sk1->err <= 0)
				printk("Warning sock.c:sk1->err <= 0.  Returning non-error.\n");
			err=sk1->err;
			sk1->err=0;
			return(-err);
		}
	}
	/* Link the new sock and newsock to each other. */
	newsock->data = (void *)sk2;
	sk2->sleep = newsock->wait;
	sk2->socket = newsock;
	newsock->conn = NULL;/* newsock has no peer socket structure */
	if (flags & O_NONBLOCK) 
		return(0);

	cli(); /* avoid the race. */
	/* Sleep while the new sock is still in TCP_SYN_RECV. */
	while(sk2->state == TCP_SYN_RECV) 
	{
	/* A pending unblocked signal after waking aborts the wait. */
		interruptible_sleep_on(sk2->sleep);
		if (current->signal & ~current->blocked) 
		{
			sti();
			sk1->pair = sk2;/* park the sock so the next call picks it up */
			sk2->sleep = NULL;
			sk2->socket=NULL;
			newsock->data = NULL;
			return(-ERESTARTSYS);
		}
	}
	sti();
    /* Not established and an error is recorded: destroy the new sock and report it. */
	if (sk2->state != TCP_ESTABLISHED && sk2->err > 0) 
	{
		err = -sk2->err;
		sk2->err=0;
		sk2->dead=1;	/* ANK */
		destroy_sock(sk2);
		newsock->data = NULL;
		return(err);
	}
	newsock->state = SS_CONNECTED;/* the connection is up */
	return(0);
}

算了,篇幅已经够大了,几个主流函数已经介绍完了,其余就不做介绍了.

af_inet.c 文件作为 INET 层处理函数定义文件,处理来自 BSD 层的请求,并在完成本层相应的检查工作后继续将请求发送给下层传输层函数进行具体的处理。该INET 层承BSD 层启传输层。

参考资料:《Linux 内核网络栈源代码情景分析》,Linux kernel 1.2.13


你可能感兴趣的:(源码,网络协议,内核,网络栈)