我是非常喜欢linux内核的,作为世界上最伟大的开源软件(我觉得),随手可得的最新版本的源代码,有那么多大牛在维护与更新,读读它,真的对
我的帮助特别大,零零散散的很久了,现在想要学习网络,学习网络就需要懂套接字编程,也就是去学习一大堆API的用法,但是这样很容易忘记也没有什么
价值,我觉得最好的学习套接字的方法,就是去读源代码,网络协议栈是linux源代码中比较庞大的一部分,特别是比较新的版本,所以可以选择一些低版本的
源码来研读,虽然代码一直在迭代,但是核心思想是不会有太大的改变的,特别是函数接口,因为那是标准,所以只要不是特别需要,函数接口是不会改变的。
接下来的时间里面,我将系统的对linux 1.2.13的源代码做读书笔记,但是顺序可能不确定,有可能是先看网络,然后再看内存管理,但我个人觉得,读
操作系统的源代码,首先要弄明白的是内存管理,这是操作系统运行的基础,只有弄明白了内存管理,才能去理解其他的部分如进程管理等,还有,我觉得有些
部分可以有选择地读,比如设备管理,我就不打算去读,因为太费时间,需要去了解硬件,不到迫不得已,我决定不去读设备管理部分,文件系统不需要全部读,
选择一个就可以了,比如可以选择FAT,或者EXT,或者minix,总之要去读,因为这是文件操作的基础,重点是内存管理,特别容易迷失在内核空间与用户空间
之间,搞明白特别重要,ipc一定要看,文件系统一定要看,网络也要看。
好了,今天开个头,很明显今天要选择的部分是网络,也就是套接字,暂且不对网络协议栈作更深入的研究,我今天只读那些在套接字编程中需要的函数,
对于每一个函数,先了解功能,然后分析源码,这样会比较深刻。
需要知道的是,套接字文件实现在BSD层,可以在文件linux/net/socket.c里面看到套接字操作的第一层函数,对于底层调用,我们暂且不深入,因为今天
的目标是明白函数是用来干嘛的,而不是如何实现的,如果要想知道一个函数比如send函数是如何实现的,就需要到INET层去寻找,然后再继续追踪下去,
因为linux的设计是分层的,所以有一些函数实现特别简单,只是简单地调用下层函数。
下面是对socket.c这个文件的注释
/* * NET An implementation of the SOCKET network access protocol. * * Version: @(#)socket.c 1.1.93 18/02/95 * * Authors: Orest Zborowski, <[email protected]> * Ross Biro, <[email protected]> * Fred N. van Kempen, <[email protected]> * * Fixes: * Anonymous : NOTSOCK/BADF cleanup. Error fix in * shutdown() * Alan Cox : verify_area() fixes * Alan Cox : Removed DDI * Jonathan Kamens : SOCK_DGRAM reconnect bug * Alan Cox : Moved a load of checks to the very * top level. * Alan Cox : Move address structures to/from user * mode above the protocol layers. * Rob Janssen : Allow 0 length sends. * Alan Cox : Asynchronous I/O support (cribbed from the * tty drivers). * Niibe Yutaka : Asynchronous I/O for writes (4.4BSD style) * Jeff Uphoff : Made max number of sockets command-line * configurable. * Matti Aarnio : Made the number of sockets dynamic, * to be allocated when needed, and mr. * Uphoff's max is used as max to be * allowed to allocate. * Linus : Argh. removed all the socket allocation * altogether: it's in the inode now. * Alan Cox : Made sock_alloc()/sock_release() public * for NetROM and future kernel nfsd type * stuff. * * * This program is free software; you can redistribute it and/or * modify it under the terms of the GNU General Public License * as published by the Free Software Foundation; either version * 2 of the License, or (at your option) any later version. * * * This module is effectively the top level interface to the BSD socket * paradigm. Because it is very simple it works well for Unix domain sockets, * but requires a whole layer of substructure for the other protocols. * * In addition it lacks an effective kernel -> kernel interface to go with * the user one. */
/* 该文件为BSD socket层 */
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <asm/system.h>
#include <asm/segment.h>
/*
 * Forward declarations of the handlers that expose a socket through the
 * generic file interface.  They are gathered into socket_file_ops further
 * down, and get_fd() installs that table as the f_op of every socket's
 * struct file, so file-style calls on a socket descriptor are dispatched
 * to these functions.
 */
static int sock_lseek(struct inode *inode, struct file *file, off_t offset,
int whence);
static int sock_read(struct inode *inode, struct file *file, char *buf,
int size);
static int sock_write(struct inode *inode, struct file *file, char *buf,
int size);
static int sock_readdir(struct inode *inode, struct file *file,
struct dirent *dirent, int count);
static void sock_close(struct inode *inode, struct file *file);
static int sock_select(struct inode *inode, struct file *file, int which, select_table *seltable);
static int sock_ioctl(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg);
static int sock_fasync(struct inode *inode, struct file *filp, int on);
/*
 * Socket files have a set of 'special' operations as well as the generic
 * file ones.  These don't appear in the operation structures but are done
 * directly via the socketcall() multiplexor.
 *
 * This table is the generic half.  It is initialised positionally, so the
 * order of the entries must match the member order of struct
 * file_operations (declared outside this file).  No open handler is
 * supplied: a socket's struct file is built by get_fd() rather than by
 * opening a path.
 */
static struct file_operations socket_file_ops = {
sock_lseek, /* always fails with -ESPIPE */
sock_read,
sock_write,
sock_readdir, /* always fails with -EBADF */
sock_select,
sock_ioctl,
NULL, /* mmap */
NULL, /* no special open code... */
sock_close,
NULL, /* no fsync */
sock_fasync
};
/*
 * The protocol list.  Each protocol is registered in here.
 *
 * One proto_ops pointer per slot; unused slots are NULL.  sock_socket()
 * scans the array for an entry whose ->family equals the requested family.
 * NOTE(review): the original annotation says the slots are filled in by
 * sock_register(), which is not part of this excerpt - confirm there.
 */
static struct proto_ops *pops[NPROTO];
/*
 * Statistics counters of the socket lists.
 * Number of sockets currently allocated: incremented by sock_alloc() and
 * decremented by sock_release().
 */
static int sockets_in_use = 0;
/*
 * Support routines.  Move socket addresses back and forth across the
 * kernel/user divide and look after the messy bits.
 *
 * MAX_SOCK_ADDR is the size of the on-stack address buffers used by the
 * sock_*() entry points and the upper bound enforced by the two
 * move_addr_*() helpers.
 */
#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - 16 for IP, 16 for IPX, about 80 for AX.25 */
/*
 * Copy a socket address supplied by the caller into a kernel buffer.
 *
 * uaddr: source address in user space
 * ulen:  number of bytes to copy; must lie in 0..MAX_SOCK_ADDR
 * kaddr: destination buffer in kernel space
 *
 * Returns 0 on success (a zero length copies nothing and still succeeds),
 * -EINVAL for an out-of-range length, or the negative value reported by
 * verify_area() when the user range cannot be read.
 */
static int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr)
{
	int rc;

	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
		return -EINVAL;
	if (!ulen)
		return 0;		/* nothing to copy */

	/* Make sure the whole user range is readable before touching it. */
	rc = verify_area(VERIFY_READ, uaddr, ulen);
	if (rc < 0)
		return rc;

	memcpy_fromfs(kaddr, uaddr, ulen);
	return 0;
}
/*
 * Copy a kernel-side socket address out to the caller.
 *
 * kaddr: source address in kernel space
 * klen:  number of valid bytes at kaddr
 * uaddr: destination buffer in user space
 * ulen:  user-space int; on entry the size of the buffer at uaddr, on
 *        success rewritten with the number of bytes actually copied
 *
 * The copy length is the caller's buffer size clamped to klen.  Returns 0
 * on success, -EINVAL if the resulting length is negative or larger than
 * MAX_SOCK_ADDR, or the negative value reported by verify_area().
 */
static int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen)
{
	int rc;
	int n;

	/* The length word is both read and written back, so it must be writable. */
	rc = verify_area(VERIFY_WRITE, ulen, sizeof(*ulen));
	if (rc < 0)
		return rc;

	n = get_fs_long(ulen);
	if (klen < n)
		n = klen;
	if (n < 0 || n > MAX_SOCK_ADDR)
		return -EINVAL;

	if (n != 0) {
		rc = verify_area(VERIFY_WRITE, uaddr, n);
		if (rc < 0)
			return rc;
		memcpy_tofs(uaddr, kaddr, n);
	}

	/* Tell the caller how many bytes were delivered. */
	put_fs_long(n, ulen);
	return 0;
}
/*
 * Obtains the first available file descriptor and sets it up for use.
 *
 * A struct file is taken from get_empty_filp(), bound to the lowest unused
 * slot in the current task's descriptor table, and initialised so that all
 * file operations on it go through socket_file_ops.
 *
 * inode: inode the new file should refer to.  May be NULL, in which case
 *        f_inode is set to NULL and no reference count is touched.
 *
 * Returns the descriptor number, or -1 if either no struct file or no
 * free descriptor slot is available.
 */
static int get_fd(struct inode *inode)
{
	int fd;
	struct file *file;

	/*
	 * Find a file descriptor suitable for return to the user.
	 */
	file = get_empty_filp();
	if (!file)
		return(-1);

	/* Lowest-numbered free slot wins. */
	for (fd = 0; fd < NR_OPEN; ++fd)
		if (!current->files->fd[fd])
			break;

	if (fd == NR_OPEN)
	{
		/*
		 * Descriptor table is full: drop our claim on the struct file.
		 * NOTE(review): presumably a zero f_count marks it as free for
		 * get_empty_filp() again - confirm against its implementation.
		 */
		file->f_count = 0;
		return(-1);
	}

	/*
	 * This line had been corrupted to "¤t->files->close_on_exec"
	 * ("&curren" decoded as an HTML entity); it must take the address of
	 * the current task's close_on_exec set.
	 */
	FD_CLR(fd, &current->files->close_on_exec);
	current->files->fd[fd] = file;

	file->f_op = &socket_file_ops;
	file->f_mode = 3;	/* NOTE(review): presumably read|write mode bits - confirm */
	file->f_flags = O_RDWR;
	file->f_count = 1;
	file->f_inode = inode;
	/* The new struct file holds one more reference on the inode. */
	if (inode)
		inode->i_count++;
	file->f_pos = 0;
	return(fd);
}
/*
 * Go from an inode to its socket slot.
 *
 * The original socket implementation wasn't very clever, which is
 * why this exists at all..
 *
 * The socket is embedded in the inode (u.socket_i), so this is pure
 * address arithmetic on the pointer it is given.
 */
inline struct socket *socki_lookup(struct inode *inode)
{
	struct socket *embedded = &inode->u.socket_i;

	return embedded;
}
/*
 * Go from a file number to its socket slot.
 *
 * fd:    descriptor in the current task's table
 * pfile: if non-NULL, receives the struct file behind fd on success
 *
 * Returns the socket embedded in the file's inode, or NULL when fd is out
 * of range, not open, has no inode, or the inode is not flagged as a
 * socket (i_sock).  *pfile is left untouched on failure.
 */
static inline struct socket *sockfd_lookup(int fd, struct file **pfile)
{
	struct file *filp;
	struct inode *ino;

	if (fd < 0 || fd >= NR_OPEN)
		return NULL;

	filp = current->files->fd[fd];
	if (filp == NULL)
		return NULL;

	ino = filp->f_inode;
	if (ino == NULL || !ino->i_sock)
		return NULL;

	if (pfile != NULL)
		*pfile = filp;

	return socki_lookup(ino);
}
/*
 * Allocate a socket.
 *
 * A socket lives inside an inode, so allocation means obtaining an inode
 * from get_empty_inode(), marking it as a socket owned by the current
 * task's uid/gid, and resetting the embedded struct socket to the
 * unconnected state.
 *
 * Returns the new socket, or NULL if no inode could be obtained.
 * Bumps sockets_in_use on success.
 */
struct socket *sock_alloc(void)
{
	struct inode *ino = get_empty_inode();
	struct socket *sk;

	if (ino == NULL)
		return NULL;

	/* Inode side. */
	ino->i_sock = 1;
	ino->i_mode = S_IFSOCK;
	ino->i_uid = current->uid;
	ino->i_gid = current->gid;

	/* Socket side: the structure is embedded in the inode. */
	sk = &ino->u.socket_i;
	sk->inode = ino;		/* "backlink": we could use pointer arithmetic instead */
	sk->wait = &ino->i_wait;
	sk->state = SS_UNCONNECTED;
	sk->flags = 0;
	sk->ops = NULL;
	sk->data = NULL;
	sk->conn = NULL;
	sk->iconn = NULL;
	sk->next = NULL;
	sk->fasync_list = NULL;

	sockets_in_use++;
	return sk;
}
/*
 * Release a socket (peer side).
 *
 * Tells a socket that the other end is going away: the peer is moved to
 * SS_DISCONNECTING first, then anything sleeping on its wait queue is
 * woken and its async listeners are notified (sock_wake_async mode 1).
 */
static inline void sock_release_peer(struct socket *peer)
{
	peer->state = SS_DISCONNECTING;

	wake_up_interruptible(peer->wait);
	sock_wake_async(peer, 1);
}
/*
 * Tear down a socket and give its inode back.
 *
 * Steps, in order:
 *   1. unless the socket was already SS_UNCONNECTED, mark it
 *      SS_DISCONNECTING;
 *   2. notify every socket queued on sock->iconn;
 *   3. let the protocol release it (passing the connected peer, if the
 *      socket was SS_CONNECTED) so it gets a chance to flush data etc.;
 *   4. notify that connected peer;
 *   5. drop the global socket count and call iput() on the backing inode.
 */
void sock_release(struct socket *sock)
{
	int prev_state = sock->state;
	struct socket *pending, *following, *partner;

	if (prev_state != SS_UNCONNECTED)
		sock->state = SS_DISCONNECTING;

	/*
	 * Wake up anyone waiting for connections.  The next pointer is read
	 * before the peer is notified.
	 */
	pending = sock->iconn;
	while (pending != NULL) {
		following = pending->next;
		sock_release_peer(pending);
		pending = following;
	}

	/*
	 * Wake up anyone we're connected to.  First, we release the
	 * protocol, to give it a chance to flush data, etc.
	 */
	if (prev_state == SS_CONNECTED)
		partner = sock->conn;
	else
		partner = NULL;

	if (sock->ops)
		sock->ops->release(sock, partner);
	if (partner)
		sock_release_peer(partner);

	sockets_in_use--;		/* Bookkeeping.. */
	iput(SOCK_INODE(sock));
}
/*
 * Sockets are not seekable.
 *
 * All arguments are ignored; the call always fails with -ESPIPE.
 */
static int sock_lseek(struct inode *inode, struct file *file, off_t offset, int whence)
{
	return -ESPIPE;
}
/* * Read data from a socket. ubuf is a user mode pointer. We make sure the user * area ubuf...ubuf+size-1 is writable before asking the protocol. */
/* 我们现在应该知道的事情是:套接字也是一种文件,所以我们可以获取到套接字的文件描述子(套接字描述子),然后 就像文件一样操作他们,比如我们想要读写网络数据,那么我们就可以调用read来读取,当然我们可以调用write来向 套接字文件写入内容(数据)。 在BSD层。这些函数的实现仅仅是向下引用,就是将这个任务交给下一层的网络层来实现,BSD下一层是INET层, 所以这一层的这些函数的接口都指向INET层的相同接口,如果inet层依然没有办法实现这些接口,将再次向下 引用,直到可以实现最基本的操作后,函数再向上实现。 */
static int sock_read(struct inode *inode, struct file *file, char *ubuf, int size)
{
struct socket *sock;
int err;
//根据inode信息获得sock结构,如果这个socket是空的,直接返回错误,没有什么错误处理
if (!(sock = socki_lookup(inode)))
{
printk("NET: sock_read: can't find socket for inode!\n");
return(-EBADF);
}
//发现了这个socket,但是这不知道什么标志不允许,然后也是直接返回错误
if (sock->flags & SO_ACCEPTCON)
return(-EINVAL);
if(size<0)
return -EINVAL;
if(size==0)
return 0;
//我们可以想象,这个函数应该是检查这个区域。具体检查什么应该在函数的实现中看到。
if ((err=verify_area(VERIFY_WRITE,ubuf,size))<0)
return err;
//这个虚假的调用。只是指向下一层的网络接口,在sock里面读出size大小的内容,存在ubuf中
return(sock->ops->read(sock, ubuf, size, (file->f_flags & O_NONBLOCK)));
}
/* * Write data to a socket. We verify that the user area ubuf..ubuf+size-1 is * readable by the user process. */
static int sock_write(struct inode *inode, struct file *file, char *ubuf, int size)
{
struct socket *sock;
int err;
if (!(sock = socki_lookup(inode)))
{
printk("NET: sock_write: can't find socket for inode!\n");
return(-EBADF);
}
if (sock->flags & SO_ACCEPTCON)
return(-EINVAL);
if(size<0)
return -EINVAL;
if(size==0)
return 0;
//还是检查内存,如果出错直接返回
if ((err=verify_area(VERIFY_READ,ubuf,size))<0)
return err;
//继续调用下一层的函数,往sock里面写入大小为size,内容为ubuf的数据,写入的标志的指定的标志
return(sock->ops->write(sock, ubuf, size,(file->f_flags & O_NONBLOCK)));
}
/*
 * You can't read directories from a socket!
 *
 * All arguments are ignored; the call always fails with -EBADF.
 */
static int sock_readdir(struct inode *inode, struct file *file,
			struct dirent *dirent, int count)
{
	return -EBADF;
}
/* * With an ioctl arg may well be a user mode pointer, but we don't know what to do * with it - thats up to the protocol still. */
int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
unsigned long arg)
{
struct socket *sock;
if (!(sock = socki_lookup(inode)))
{
printk("NET: sock_ioctl: can't find socket for inode!\n");
return(-EBADF);
}
//继续调用下层函数,控制套接字的模式?
return(sock->ops->ioctl(sock, cmd, arg));
}
/*
 * select() entry of socket_file_ops.
 *
 * We can't return errors to select, so it's either yes or no: every
 * failure (no socket, no protocol ops, no ->select handler) is reported
 * as 0.  Otherwise the protocol's ->select result is returned as is.
 */
static int sock_select(struct inode *inode, struct file *file, int sel_type, select_table * wait)
{
	struct socket *sk = socki_lookup(inode);

	if (sk == NULL) {
		printk("NET: sock_select: can't find socket for inode!\n");
		return 0;
	}

	if (!sk->ops || !sk->ops->select)
		return 0;

	return sk->ops->select(sk, sel_type, wait);
}
/* 该函数将实现关闭一个socket 只是简单的调用sock_release and sock_release_peer 他们的调用关系应该是这样的: socket_close()->sock_release()->sock_release_peer() 所以这个函数只是实现简单的函数调用。 */
void sock_close(struct inode *inode, struct file *filp)
{
struct socket *sock;
/* * It's possible the inode is NULL if we're closing an unfinished socket. */
if (!inode) /*这个inode是个空的,难道是我们正在关闭一个没有完成的socket?应该是啊*/
return;
/* 我们已经看过这个函数了,那么这个函数只是实现根据inode来找到sock 只是将inode结构中的sock返回来而已 */
if (!(sock = socki_lookup(inode)))
{
printk("NET: sock_close: can't find socket for inode!\n");
return;
}
/* 这个函数用来分配或者释放一个fasync_struct 结构,如果第三个参数为0,那么说明是释放,否则表明 是分配,这个函数调用在这里将完成将释放当前进程对应的该结构 但是为什么不是sock而是进程呢? 因为一个套接字很可能被多个进程所使用 */
sock_fasync(inode, filp, 0);
/* 释放该socket */
sock_release(sock);
}
/*
 * Update the socket async list
 *
 * Adds (on != 0) or removes (on == 0) the entry for filp on the socket's
 * fasync_list.  Entries are keyed on the struct file pointer, so each
 * struct file appears at most once.
 *
 * Returns 0, or -ENOMEM if a new entry could not be allocated.
 * Adding a file that is already listed and removing one that is not are
 * both silent no-ops.
 */
static int sock_fasync(struct inode *inode, struct file *filp, int on)
{
struct fasync_struct *fa, *fna=NULL, **prev;
struct socket *sock;
unsigned long flags;
/*
 * Allocate the new entry up front, before interrupts are disabled.
 * NOTE(review): presumably because a GFP_KERNEL allocation may sleep.
 */
if (on)
{
fna=(struct fasync_struct *)kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
if(fna==NULL)
return -ENOMEM;
}
sock = socki_lookup(inode);
prev=&(sock->fasync_list);
/* The list is walked and modified with interrupts off. */
save_flags(flags);
cli();
/*
 * Look for an existing entry for filp.  prev always points at the link
 * that refers to fa, so a hit can be unlinked with a single store.
 */
for(fa=*prev; fa!=NULL; prev=&fa->fa_next,fa=*prev)
if(fa->fa_file==filp)
break;
if(on)
{
if(fa!=NULL)
{
/* Already registered: discard the spare entry. */
kfree_s(fna,sizeof(struct fasync_struct));
restore_flags(flags);
return 0;
}
/* Not yet registered: push the new entry on the head of the list. */
fna->fa_file=filp;
fna->magic=FASYNC_MAGIC;
fna->fa_next=sock->fasync_list;
sock->fasync_list=fna;
}
else
{
if(fa!=NULL)
{
/* Unlink and free the matching entry. */
*prev=fa->fa_next;
kfree_s(fa,sizeof(struct fasync_struct));
}
}
restore_flags(flags);
return 0;
}
/*
 * Send SIGIO to the socket's async listeners (its fasync_list).
 *
 * how selects the condition under which the signal is sent:
 *   0  always
 *   1  only if SO_WAITDATA is not set in sock->flags
 *   2  only if SO_NOSPACE is set; the flag is cleared afterwards
 * Any other value sends nothing.
 *
 * Returns -1 if sock is NULL or has no listeners, 0 otherwise.
 */
int sock_wake_async(struct socket *sock, int how)
{
	if (!sock || !sock->fasync_list)
		return -1;

	if (how == 0) {
		kill_fasync(sock->fasync_list, SIGIO);
	} else if (how == 1) {
		if (!(sock->flags & SO_WAITDATA))
			kill_fasync(sock->fasync_list, SIGIO);
	} else if (how == 2) {
		if (sock->flags & SO_NOSPACE) {
			kill_fasync(sock->fasync_list, SIGIO);
			sock->flags &= ~SO_NOSPACE;
		}
	}

	return 0;
}
/*
 * Wait for a connection.
 *
 * Queues the client socket mysock on the listening socket servsock and
 * waits until the server side has dealt with it.
 * NOTE(review): the original annotation says this path is used only by
 * the UNIX domain - not verifiable from this file alone.
 *
 * mysock:   connecting (client) socket
 * servsock: listening (server) socket; must have SO_ACCEPTCON set
 * flags:    file flags; O_NONBLOCK makes the call return instead of sleep
 *
 * Returns:
 *   0            mysock is SS_CONNECTED, or was moved to SS_DISCONNECTING
 *                while we slept
 *   -EINVAL      servsock is not listening
 *   -EINPROGRESS not yet connected and O_NONBLOCK was requested
 *   -EINTR       woken without being connected and still queued on the
 *                server (we dequeue ourselves)
 *   -EACCES      woken without being connected and mysock->conn is NULL
 */
int sock_awaitconn(struct socket *mysock, struct socket *servsock, int flags)
{
struct socket *last;
/*
 * We must be listening
 */
if (!(servsock->flags & SO_ACCEPTCON))
{
return(-EINVAL);
}
/*
 * Put ourselves on the server's incomplete connection queue.
 */
mysock->next = NULL;
cli();
/*
 * Append mysock to the tail of servsock->iconn (or make it the head if
 * the queue is empty).  Done with interrupts disabled.
 */
if (!(last = servsock->iconn))
servsock->iconn = mysock;
else
{
while (last->next)
last = last->next;
last->next = mysock;
}
mysock->state = SS_CONNECTING;
mysock->conn = servsock; /* records which server we are queued on */
sti();
/*
 * Wake up server, then await connection. server will set state to
 * SS_CONNECTED if we're connected.
 */
wake_up_interruptible(servsock->wait);
/* Also signal the server socket's async listeners (mode 0). */
sock_wake_async(servsock, 0);
if (mysock->state != SS_CONNECTED)
{
if (flags & O_NONBLOCK)
return -EINPROGRESS;
/* Sleep until something wakes us on mysock's wait queue. */
interruptible_sleep_on(mysock->wait);
/* Re-check the state after the wake-up. */
if (mysock->state != SS_CONNECTED &&
mysock->state != SS_DISCONNECTING)
{
/*
 * if we're not connected we could have been
 * 1) interrupted, so we need to remove ourselves
 *    from the server list
 * 2) rejected (mysock->conn == NULL), and have
 *    already been removed from the list
 */
if (mysock->conn == servsock) /* still queued on the server: case 1 */
{
cli();
/* Unlink mysock from servsock->iconn: head case first ... */
if ((last = servsock->iconn) == mysock)
servsock->iconn = mysock->next;
else /* ... otherwise find the predecessor and splice it out */
{
while (last->next != mysock)
last = last->next;
last->next = mysock->next;
}
sti();
}
/*
 * conn still set means we were interrupted (-EINTR);
 * conn == NULL means the connection was rejected (-EACCES).
 */
return(mysock->conn ? -EINTR : -EACCES);
}
}
return(0);
}
/*
 * Perform the socket system call.  We locate the appropriate family, then
 * create a fresh socket.
 *
 * Steps:
 *   1. find the proto_ops registered for family in pops[];
 *   2. sanity-check type and protocol;
 *   3. allocate a socket (and its inode) with sock_alloc();
 *   4. let the family's ->create handler set it up;
 *   5. attach a file descriptor with get_fd() and return it.
 *
 * Returns the new descriptor, or:
 *   -EINVAL  unknown family, unknown type, negative protocol, or no
 *            descriptor could be obtained
 *   -ENOSR   no socket could be allocated
 *   <0       whatever ->create reported
 */
static int sock_socket(int family, int type, int protocol)
{
	struct proto_ops *ops;
	struct socket *sock;
	int slot, fd, rc;
	int known_type;

	/* Locate the correct protocol family. */
	for (slot = 0; slot < NPROTO; slot++)
		if (pops[slot] != NULL && pops[slot]->family == family)
			break;
	if (slot == NPROTO)
		return -EINVAL;
	ops = pops[slot];

	/*
	 * Check that this is a type that we know how to manipulate and
	 * the protocol makes sense here.  The family can still reject the
	 * protocol later.
	 */
	known_type = (type == SOCK_STREAM || type == SOCK_DGRAM ||
		      type == SOCK_SEQPACKET || type == SOCK_RAW ||
		      type == SOCK_PACKET);
	if (!known_type || protocol < 0)
		return(-EINVAL);

	/*
	 * Allocate the socket and allow the family to set things up.  If
	 * the protocol is 0, the family is instructed to select an
	 * appropriate default.
	 */
	sock = sock_alloc();
	if (sock == NULL) {
		printk("NET: sock_socket: no more sockets\n");
		return(-ENOSR);	/* Was: EAGAIN, but we are out of system resources! */
	}
	sock->type = type;
	sock->ops = ops;

	rc = sock->ops->create(sock, protocol);
	if (rc < 0) {
		sock_release(sock);
		return rc;
	}

	fd = get_fd(SOCK_INODE(sock));
	if (fd < 0) {
		sock_release(sock);
		return(-EINVAL);
	}
	return fd;
}
/*
 * Create a pair of connected sockets.
 *
 * Two sockets are created with sock_socket(), joined by the protocol's
 * ->socketpair handler, cross-linked through ->conn, marked SS_CONNECTED,
 * and their descriptors are stored in the user array usockvec.
 *
 * Returns 0 on success.  Errors:
 *   <0       result of the first sock_socket() call
 *   -EINVAL  protocol has no ->socketpair, or the second socket could not
 *            be created
 *   <0       result of ->socketpair, or of verify_area() on usockvec
 * Every descriptor created so far is closed again on failure.
 */
static int sock_socketpair(int family, int type, int protocol, unsigned long usockvec[2])
{
	struct socket *first, *second;
	int fd1, fd2;
	int rc;

	/*
	 * Obtain the first socket and check if the underlying protocol
	 * supports the socketpair call.
	 */
	fd1 = sock_socket(family, type, protocol);
	if (fd1 < 0)
		return fd1;

	first = sockfd_lookup(fd1, NULL);
	if (first->ops->socketpair == NULL) {
		sys_close(fd1);
		return(-EINVAL);
	}

	/*
	 * Now grab another socket and try to connect the two together.
	 */
	fd2 = sock_socket(family, type, protocol);
	if (fd2 < 0) {
		sys_close(fd1);
		return(-EINVAL);
	}
	second = sockfd_lookup(fd2, NULL);

	rc = first->ops->socketpair(first, second);
	if (rc < 0) {
		sys_close(fd1);
		sys_close(fd2);
		return rc;
	}

	/* Cross-link the two ends and mark both as connected. */
	first->conn = second;
	first->state = SS_CONNECTED;
	second->conn = first;
	second->state = SS_CONNECTED;

	rc = verify_area(VERIFY_WRITE, usockvec, 2 * sizeof(int));
	if (rc) {
		sys_close(fd1);
		sys_close(fd2);
		return rc;
	}

	put_fs_long(fd1, &usockvec[0]);
	put_fs_long(fd2, &usockvec[1]);
	return(0);
}
/*
 * Bind a name to a socket.  Nothing much to do here since it's the
 * protocol's responsibility to handle the local address.
 *
 * We move the socket address to kernel space before we call the protocol
 * layer (having also checked the address is ok).
 *
 * Returns 0 on success, -EBADF for a bad descriptor, -ENOTSOCK if it is
 * not a socket, the error from move_addr_to_kernel(), or the negative
 * result of the protocol's ->bind handler.
 */
static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	rc = move_addr_to_kernel(umyaddr, addrlen, kaddr);
	if (rc < 0)
		return rc;

	rc = sock->ops->bind(sock, (struct sockaddr *)kaddr, addrlen);
	return (rc < 0) ? rc : 0;
}
/*
 * Perform a listen.  Basically, we allow the protocol to do anything
 * necessary for a listen, and if that works, we mark the socket as
 * ready for listening.
 *
 * fd:      socket descriptor
 * backlog: passed unchanged to the protocol's ->listen handler
 *
 * The handler is optional and its return value is not examined; the
 * socket is flagged SO_ACCEPTCON regardless.
 *
 * Returns 0, -EBADF for a bad descriptor, -ENOTSOCK if it is not a
 * socket, or -EINVAL if the socket is not SS_UNCONNECTED.
 */
static int sock_listen(int fd, int backlog)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	if (sock->state != SS_UNCONNECTED)
		return(-EINVAL);

	if (sock->ops && sock->ops->listen)
		sock->ops->listen(sock, backlog);

	sock->flags |= SO_ACCEPTCON;
	return(0);
}
/*
 * For accept, we attempt to create a new socket, set up the link with the
 * client, wake up the client, then return the new connected fd.  We
 * collect the address of the connector in kernel space and move it to
 * user at the very end.  This is buggy because we open the socket then
 * return an error.
 *
 * fd:             listening socket descriptor
 * upeer_sockaddr: optional user buffer for the peer's address
 * upeer_addrlen:  user int holding that buffer's size / resulting length
 *
 * Returns the descriptor of the new socket, or:
 *   -EBADF    bad descriptor
 *   -ENOTSOCK descriptor is not a socket
 *   -EINVAL   socket is not SS_UNCONNECTED, is not listening, or no
 *             descriptor could be obtained for the new socket
 *   -ENOSR    no socket could be allocated
 *   <0        result of the protocol's ->dup or ->accept handler
 * Failures while fetching or copying out the peer address are ignored.
 */
static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	struct file *file;
	struct socket *listener, *conn_sock;
	char peer_addr[MAX_SOCK_ADDR];
	int peer_len;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return(-EBADF);
	file = current->files->fd[fd];
	if (file == NULL)
		return(-EBADF);

	listener = sockfd_lookup(fd, &file);
	if (listener == NULL)
		return(-ENOTSOCK);

	/* Must be an unconnected socket that listen() has been called on. */
	if (listener->state != SS_UNCONNECTED || !(listener->flags & SO_ACCEPTCON))
		return(-EINVAL);

	conn_sock = sock_alloc();
	if (conn_sock == NULL) {
		printk("NET: sock_accept: no more sockets\n");
		return(-ENOSR);	/* Was: EAGAIN, but we are out of system resources! */
	}

	/* The new socket inherits type and protocol ops from the listener. */
	conn_sock->type = listener->type;
	conn_sock->ops = listener->ops;

	rc = listener->ops->dup(conn_sock, listener);
	if (rc < 0) {
		sock_release(conn_sock);
		return rc;
	}

	rc = conn_sock->ops->accept(listener, conn_sock, file->f_flags);
	if (rc < 0) {
		sock_release(conn_sock);
		return rc;
	}

	/* From here on fd names the NEW socket's descriptor. */
	fd = get_fd(SOCK_INODE(conn_sock));
	if (fd < 0) {
		sock_release(conn_sock);
		return(-EINVAL);
	}

	if (upeer_sockaddr) {
		/* Last argument 1 asks for the peer's name. */
		conn_sock->ops->getname(conn_sock, (struct sockaddr *)peer_addr, &peer_len, 1);
		move_addr_to_user(peer_addr, peer_len, upeer_sockaddr, upeer_addrlen);
	}
	return fd;
}
/*
 * Attempt to connect to a socket with the server address.  The address is
 * in user space so we verify it is OK and move it to kernel space.
 *
 * Returns 0 on success, or:
 *   -EBADF    bad descriptor
 *   -ENOTSOCK descriptor is not a socket
 *   <0        error from move_addr_to_kernel()
 *   -EISCONN  socket is already connected and is not SOCK_DGRAM
 *   -EINVAL   socket is in a state other than unconnected, connecting
 *             or connected
 *   <0        result of the protocol's ->connect handler
 */
static int sock_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return(-EBADF);
	file = current->files->fd[fd];
	if (file == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, &file);
	if (sock == NULL)
		return(-ENOTSOCK);

	rc = move_addr_to_kernel(uservaddr, addrlen, kaddr);
	if (rc < 0)
		return rc;

	switch (sock->state)
	{
		case SS_UNCONNECTED:
			/* This is ok... continue with connect */
		case SS_CONNECTING:
			/* Not yet connected... we will check this. */
			/*
			 * FIXME: for all protocols what happens if you start
			 * an async connect fork and both children connect.
			 * Clean this up in the protocols!
			 */
			break;
		case SS_CONNECTED:
			/* Socket is already connected */
			if (sock->type != SOCK_DGRAM)	/* Hack for now - move this all into the protocol */
				return -EISCONN;
			break;
		default:
			return(-EINVAL);
	}

	rc = sock->ops->connect(sock, (struct sockaddr *)kaddr, addrlen, file->f_flags);
	return (rc < 0) ? rc : 0;
}
/*
 * Get the local address ('name') of a socket object.  Move the obtained
 * name to user space.
 *
 * The protocol's ->getname handler is called with a final argument of 0
 * (local end).  Returns 0 on success, -EBADF / -ENOTSOCK for a bad
 * descriptor, any non-zero result of ->getname, or the error from
 * move_addr_to_user().
 */
static int sock_getsockname(int fd, struct sockaddr *usockaddr, int *usockaddr_len)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	int klen;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	rc = sock->ops->getname(sock, (struct sockaddr *)kaddr, &klen, 0);
	if (rc)
		return rc;

	rc = move_addr_to_user(kaddr, klen, usockaddr, usockaddr_len);
	return (rc < 0) ? rc : 0;
}
/*
 * Get the remote address ('name') of a socket object.  Move the obtained
 * name to user space.
 *
 * The protocol's ->getname handler is called with a final argument of 1
 * (remote end).  Returns 0 on success, -EBADF / -ENOTSOCK for a bad
 * descriptor, any non-zero result of ->getname, or the error from
 * move_addr_to_user().
 */
static int sock_getpeername(int fd, struct sockaddr *usockaddr, int *usockaddr_len)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	int klen;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	rc = sock->ops->getname(sock, (struct sockaddr *)kaddr, &klen, 1);
	if (rc)
		return rc;

	rc = move_addr_to_user(kaddr, klen, usockaddr, usockaddr_len);
	return (rc < 0) ? rc : 0;
}
/*
 * Send a datagram down a socket.  The datagram as with write() is in user
 * space.  We check it can be read.
 *
 * A zero length is passed through to the protocol.  Returns the result of
 * the protocol's ->send handler, -EBADF / -ENOTSOCK for a bad descriptor,
 * -EINVAL for a negative length, or the value reported by verify_area().
 */
static int sock_send(int fd, void * buff, int len, unsigned flags)
{
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return(-EBADF);
	file = current->files->fd[fd];
	if (file == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	if (len < 0)
		return -EINVAL;

	rc = verify_area(VERIFY_READ, buff, len);
	if (rc)
		return rc;

	/* Fourth argument: non-blocking flag taken from the file. */
	return(sock->ops->send(sock, buff, len, (file->f_flags & O_NONBLOCK), flags));
}
/*
 * Send a datagram to a given address.  We move the address into kernel
 * space and check the user space data area is readable before invoking
 * the protocol.
 *
 * Returns the result of the protocol's ->sendto handler, -EBADF /
 * -ENOTSOCK for a bad descriptor, -EINVAL for a negative length, the
 * value reported by verify_area() for the data buffer, or the error from
 * move_addr_to_kernel() for the destination address.
 */
static int sock_sendto(int fd, void * buff, int len, unsigned flags,
		       struct sockaddr *addr, int addr_len)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return(-EBADF);
	file = current->files->fd[fd];
	if (file == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	if (len < 0)
		return -EINVAL;

	/* Data buffer first, then the destination address. */
	rc = verify_area(VERIFY_READ, buff, len);
	if (rc)
		return rc;

	rc = move_addr_to_kernel(addr, addr_len, kaddr);
	if (rc < 0)
		return rc;

	return(sock->ops->sendto(sock, buff, len, (file->f_flags & O_NONBLOCK),
				 flags, (struct sockaddr *)kaddr, addr_len));
}
/*
 * Receive a datagram from a socket.  This isn't really right.  The BSD
 * manual pages explicitly state that recv is recvfrom with a NULL to
 * argument.  The Linux stack gets the right results for the wrong reason
 * and this need to be tidied in the inet layer and removed from here.
 * We check the buffer is writable and valid.
 *
 * Returns the result of the protocol's ->recv handler, 0 for a zero
 * length, -EBADF / -ENOTSOCK for a bad descriptor, -EINVAL for a negative
 * length, or the value reported by verify_area().
 */
static int sock_recv(int fd, void * buff, int len, unsigned flags)
{
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return(-EBADF);
	file = current->files->fd[fd];
	if (file == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	if (len < 0)
		return -EINVAL;
	if (len == 0)
		return 0;

	rc = verify_area(VERIFY_WRITE, buff, len);
	if (rc)
		return rc;

	/* Fourth argument: non-blocking flag taken from the file. */
	return(sock->ops->recv(sock, buff, len, (file->f_flags & O_NONBLOCK), flags));
}
/*
 * Receive a frame from the socket and optionally record the address of
 * the sender.  We verify the buffers are writable and if needed move the
 * sender address from kernel to user space.
 *
 * addr may be NULL, in which case the sender address is discarded.
 *
 * Returns the (non-negative) result of the protocol's ->recvfrom handler
 * on success, 0 for a zero length, -EBADF / -ENOTSOCK for a bad
 * descriptor, -EINVAL for a negative length, the value reported by
 * verify_area(), a negative ->recvfrom result, or the error from
 * move_addr_to_user().
 */
static int sock_recvfrom(int fd, void * buff, int len, unsigned flags,
			 struct sockaddr *addr, int *addr_len)
{
	char kaddr[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int klen;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return(-EBADF);
	file = current->files->fd[fd];
	if (file == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	if (len < 0)
		return -EINVAL;
	if (len == 0)
		return 0;

	rc = verify_area(VERIFY_WRITE, buff, len);
	if (rc)
		return rc;

	/* len is reused to hold the handler's result from here on. */
	len = sock->ops->recvfrom(sock, buff, len, (file->f_flags & O_NONBLOCK),
				  flags, (struct sockaddr *)kaddr, &klen);
	if (len < 0)
		return len;

	if (addr != NULL) {
		rc = move_addr_to_user(kaddr, klen, addr, addr_len);
		if (rc < 0)
			return rc;
	}
	return len;
}
/*
 * Set a socket option.  Because we don't know the option lengths we have
 * to pass the user mode parameter for the protocols to sort out.
 *
 * optval/optlen are forwarded untouched.  Returns the result of the
 * protocol's ->setsockopt handler, or -EBADF / -ENOTSOCK for a bad
 * descriptor.
 */
static int sock_setsockopt(int fd, int level, int optname, char *optval, int optlen)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	return(sock->ops->setsockopt(sock, level, optname, optval, optlen));
}
/*
 * Get a socket option.  Because we don't know the option lengths we have
 * to pass a user mode parameter for the protocols to sort out.
 *
 * Returns the result of the protocol's ->getsockopt handler, 0 if the
 * protocol provides none, or -EBADF / -ENOTSOCK for a bad descriptor.
 */
static int sock_getsockopt(int fd, int level, int optname, char *optval, int *optlen)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	/* A protocol without a handler is treated as success. */
	if (sock->ops == NULL || sock->ops->getsockopt == NULL)
		return(0);

	return(sock->ops->getsockopt(sock, level, optname, optval, optlen));
}
/*
 * Shutdown a socket.
 *
 * how is forwarded unchanged to the protocol's ->shutdown handler, which
 * decides what it means.  Returns the handler's result, or -EBADF /
 * -ENOTSOCK for a bad descriptor.
 */
static int sock_shutdown(int fd, int how)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return(-EBADF);

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return(-ENOTSOCK);

	return(sock->ops->shutdown(sock, how));
}
/*
 * Perform a file control on a socket file descriptor.
 *
 * Forwards cmd/arg to the protocol's ->fcntl handler when one exists and
 * returns its result; otherwise returns -EINVAL.
 */
int sock_fcntl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct socket *sock = socki_lookup(filp->f_inode);

	if (sock == NULL || sock->ops == NULL || sock->ops->fcntl == NULL)
		return(-EINVAL);

	return(sock->ops->fcntl(sock, cmd, arg));
}
/* * System call vectors. Since I (RIB) want to rewrite sockets as streams, * we have this level of indirection. Not a lot of overhead, since more of * the work is done via read/write/select directly. * * I'm now expanding this up to a higher level to separate the assorted * kernel/user space manipulations and global assumptions from the protocol * layers proper - AC. */
asmlinkage int sys_socketcall(int call, unsigned long *args)
{
int er;
switch(call)
{
case SYS_SOCKET:
er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
if(er)
return er;
return(sock_socket(get_fs_long(args+0),
get_fs_long(args+1),
get_fs_long(args+2)));
case SYS_BIND:
er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
if(er)
return er;
return(sock_bind(get_fs_long(args+0),
(struct sockaddr *)get_fs_long(args+1),
get_fs_long(args+2)));
case SYS_CONNECT:
er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
if(er)
return er;
return(sock_connect(get_fs_long(args+0),
(struct sockaddr *)get_fs_long(args+1),
get_fs_long(args+2)));
case SYS_LISTEN:
er=verify_area(VERIFY_READ, args, 2 * sizeof(long));
if(er)
return er;
return(sock_listen(get_fs_long(args+0),
get_fs_long(args+1)));
case SYS_ACCEPT:
er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
if(er)
return er;
return(sock_accept(get_fs_long(args+0),
(struct sockaddr *)get_fs_long(args+1),
(int *)get_fs_long(args+2)));
case SYS_GETSOCKNAME:
er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
if(er)
return er;
return(sock_getsockname(get_fs_long(args+0),
(struct sockaddr *)get_fs_long(args+1),
(int *)get_fs_long(args+2)));
case SYS_GETPEERNAME:
er=verify_area(VERIFY_READ, args, 3 * sizeof(long));
if(er)
return er;
return(sock_getpeername(get_fs_long(args+0),
(struct sockaddr *)get_fs_long(args+1),
(int *)get_fs_long(args+2)));
case SYS_SOCKETPAIR:
er=verify_area(VERIFY_READ, args, 4 * sizeof(long));
if(er)
return er;
return(sock_socketpair(get_fs_long(args+0),
get_fs_long(args+1),
get_fs_long(args+2),
(unsigned long *)get_fs_long(args+3)));
case SYS_SEND:
er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
if(er)
return er;
return(sock_send(get_fs_long(args+0),
(void *)get_fs_long(args+1),
get_fs_long(args+2),
get_fs_long(args+3)));
case SYS_SENDTO:
er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
if(er)
return er;
return(sock_sendto(get_fs_long(args+0),
(void *)get_fs_long(args+1),
get_fs_long(args+2),
get_fs_long(args+3),
(struct sockaddr *)get_fs_long(args+4),
get_fs_long(args+5)));
case SYS_RECV:
er=verify_area(VERIFY_READ, args, 4 * sizeof(unsigned long));
if(er)
return er;
return(sock_recv(get_fs_long(args+0),
(void *)get_fs_long(args+1),
get_fs_long(args+2),
get_fs_long(args+3)));
case SYS_RECVFROM:
er=verify_area(VERIFY_READ, args, 6 * sizeof(unsigned long));
if(er)
return er;
return(sock_recvfrom(get_fs_long(args+0),
(void *)get_fs_long(args+1),
get_fs_long(args+2),
get_fs_long(args+3),
(struct sockaddr *)get_fs_long(args+4),
(int *)get_fs_long(args+5)));
case SYS_SHUTDOWN:
er=verify_area(VERIFY_READ, args, 2* sizeof(unsigned long));
if(er)
return er;
return(sock_shutdown(get_fs_long(args+0),
get_fs_long(args+1)));
case SYS_SETSOCKOPT:
er=verify_area(VERIFY_READ, args, 5*sizeof(unsigned long));
if(er)
return er;
return(sock_setsockopt(get_fs_long(args+0),
get_fs_long(args+1),
get_fs_long(args+2),
(char *)get_fs_long(args+3),
get_fs_long(args+4)));
case SYS_GETSOCKOPT:
er=verify_area(VERIFY_READ, args, 5*sizeof(unsigned long));
if(er)
return er;
return(sock_getsockopt(get_fs_long(args+0),
get_fs_long(args+1),
get_fs_long(args+2),
(char *)get_fs_long(args+3),
(int *)get_fs_long(args+4)));
default:
return(-EINVAL);
}
}
/*
 *	This function is called by a protocol handler that wants to
 *	advertise its address family, and have it linked into the
 *	SOCKET module.
 *
 *	The first free slot of pops[] receives 'ops', and ops->family is set
 *	to 'family'. Returns the slot index, or -ENOMEM when all NPROTO slots
 *	are taken. The table is scanned and updated between cli() and sti().
 */
int sock_register(int family, struct proto_ops *ops)
{
	int slot;
	int result = -ENOMEM;

	cli();
	for (slot = 0; slot < NPROTO; slot++) {
		if (pops[slot] == NULL) {
			pops[slot] = ops;
			pops[slot]->family = family;
			result = slot;
			break;
		}
	}
	sti();

	return result;
}
/*
 *	This function is called by a protocol handler that wants to
 *	remove its address family, and have it unlinked from the
 *	SOCKET module.
 *
 *	The first pops[] entry whose family matches is cleared. Returns the
 *	index of that entry, or -ENOENT when no entry matches. The table is
 *	scanned and updated between cli() and sti().
 */
int sock_unregister(int family)
{
	int slot;
	int result = -ENOENT;

	cli();
	for (slot = 0; slot < NPROTO; slot++) {
		if (pops[slot] != NULL && pops[slot]->family == family) {
			pops[slot] = NULL;
			result = slot;
			break;
		}
	}
	sti();

	return result;
}
/*
 *	Kick all configured protocols: walk the protocols[] table up to the
 *	entry whose name is NULL and call each entry's init_func with a
 *	pointer to that entry.
 */
void proto_init(void)
{
	extern struct net_proto protocols[];	/* Network protocols */
	struct net_proto *p;

	for (p = protocols; p->name != NULL; p++)
		(*p->init_func)(p);

	/* We're all done... */
}
/*
 *	Bring up the socket layer: print the banner, empty the address
 *	family table, initialise the configured protocols and, when
 *	CONFIG_NET is defined, the device module and the network bottom half.
 */
void sock_init(void)
{
	int slot = 0;

	printk("Swansea University Computer Society NET3.019\n");

	/*
	 *	Initialize all address (protocol) families.
	 */
	while (slot < NPROTO) {
		pops[slot] = NULL;
		++slot;
	}

	/*
	 *	Initialize the protocols module.
	 */
	proto_init();

#ifdef CONFIG_NET
	/*
	 *	Initialize the DEV module.
	 */
	dev_init();

	/*
	 *	And the bottom half handler
	 */
	bh_base[NET_BH].routine = net_bh;
	enable_bh(NET_BH);
#endif
}
/*
 *	Format the "sockets: used N" line into 'buffer' and report the part
 *	of it that begins at 'offset'.
 *
 *	*start is set to the first byte to hand back. The return value is the
 *	number of bytes available from there, capped at 'length', or 0 when
 *	'offset' is at or beyond the end of the formatted text.
 */
int socket_get_info(char *buffer, char **start, off_t offset, int length)
{
	int total = sprintf(buffer, "sockets: used %d\n", sockets_in_use);
	int remaining;

	/* Nothing left to read past the end of the text. */
	if (offset >= total) {
		*start = buffer;
		return 0;
	}

	*start = buffer + offset;
	remaining = total - offset;

	return (remaining > length) ? length : remaining;
}