BSD层源码分析
写于2010.1.20
BSD层主要源码有:
net/protocols.c 链路层协议初始化函数及域初始化函数定义
net/socket.c BSD socket 层实现文件
include/linux/net.h
对于BSD 层来说,socket.c 是这节实现的重点。故我们针对这个文件按照我们分析流程把对应BSD 层其他文件一起进行分析。
BSD层涉及到的结构体:
/*
* Internal representation of a socket. not all the fields are used by
* all configurations:
*
* server client
* conn client connected to server connected to
* iconn list of clients -unused-
* awaiting connections
* wait sleep for clients, sleep for connection,
* sleep for i/o sleep for i/o
*/
include/linux/net.h
/*
 * Per-socket state kept by the BSD socket layer. sock_alloc() takes it
 * from the socket inode (inode->u.socket_i) and initialises every field
 * except type, which the caller sets.
 */
struct socket {
short type; /* SOCK_STREAM, ... */
socket_state state; /* SS_UNCONNECTED, SS_CONNECTING, SS_CONNECTED or SS_DISCONNECTING */
long flags; /* SO_ACCEPTCON, SO_WAITDATA and SO_NOSPACE bits */
struct proto_ops *ops; /* protocols do most everything */
void *data; /* protocol data */
struct socket *conn; /* server socket connected to */
struct socket *iconn; /* incomplete client conn.s */
struct socket *next; /* next socket on a server's iconn queue */
struct wait_queue **wait; /* ptr to place to wait on */
struct inode *inode; /* back pointer to the inode that embeds this socket */
struct fasync_struct *fasync_list; /* Asynchronous wake up list */
};
/*
 * Table of function pointers implementing the socket operations of one
 * address family. sock_register() stores one table per family in pops[],
 * sock_socket() attaches the matching table to each new socket, and the
 * BSD layer then dispatches through sock->ops.
 */
struct proto_ops {
int family; /* address family; written by sock_register() */
int (*create) (struct socket *sock, int protocol);
int (*dup) (struct socket *newsock, struct socket *oldsock); /* called by sock_accept() before accept */
int (*release) (struct socket *sock, struct socket *peer); /* peer is the connected socket or NULL */
int (*bind) (struct socket *sock, struct sockaddr *umyaddr,
int sockaddr_len);
int (*connect) (struct socket *sock, struct sockaddr *uservaddr,
int sockaddr_len, int flags);
int (*socketpair) (struct socket *sock1, struct socket *sock2); /* may be NULL: socketpair unsupported */
int (*accept) (struct socket *sock, struct socket *newsock,
int flags);
int (*getname) (struct socket *sock, struct sockaddr *uaddr,
int *usockaddr_len, int peer); /* peer: 0 = local name, 1 = remote name */
int (*read) (struct socket *sock, char *ubuf, int size,
int nonblock);
int (*write) (struct socket *sock, char *ubuf, int size,
int nonblock);
int (*select) (struct socket *sock, int sel_type,
select_table *wait);
int (*ioctl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
int (*listen) (struct socket *sock, int len);
int (*send) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags);
int (*recv) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags);
int (*sendto) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags, struct sockaddr *, int addr_len);
int (*recvfrom) (struct socket *sock, void *buff, int len, int nonblock,
unsigned flags, struct sockaddr *, int *addr_len);
int (*shutdown) (struct socket *sock, int flags);
int (*setsockopt) (struct socket *sock, int level, int optname,
char *optval, int optlen);
int (*getsockopt) (struct socket *sock, int level, int optname,
char *optval, int *optlen);
int (*fcntl) (struct socket *sock, unsigned int cmd,
unsigned long arg);
};
/*
 * One entry of the table of configured network protocols: a protocol name
 * and its bootstrap routine. proto_init() walks the table until it meets
 * an entry whose name is NULL, calling init_func on each entry.
 */
struct net_proto {
char *name; /* Protocol name */
void (*init_func)(struct net_proto *); /* Bootstrap */
};
文件名称: socket.c
应用程序通过 INT $0x80 陷入内核,内核根据EAX寄存器中的系统调用号进一步调用内核网络栈相应的实现函数(对套接字而言即 sys_socketcall)。对于socket、bind等接口,socket.c文件只提供第一层的实现函数(sock_socket、sock_bind等),具体工作再交给各协议族的操作函数集完成。
虽然linux 中几乎所有的接口都是以文件形式来组织的,但网络栈在/dev 目录下却没有这样的对应关系(现在的linux内核已经有这样的文件了)。不过内核还是提供了对于网络数据的普通文件操作方式,如write、read函数可直接用于读写网络数据,在socket.c文件中可以看到针对网络数据的文件操作函数集合的实现。
下面我们一段一段来分析socket.c:
#include <linux/config.h>
#include <linux/signal.h>
#include <linux/errno.h>
#include <linux/sched.h>
#include <linux/mm.h>
#include <linux/kernel.h>
#include <linux/major.h>
#include <linux/stat.h>
#include <linux/socket.h>
#include <linux/fcntl.h>
#include <linux/net.h>
#include <linux/interrupt.h>
#include <linux/netdevice.h>
#include <asm/system.h>
#include <asm/segment.h>
/*
 * Forward declarations of the file-operation handlers; they are needed
 * because the socket_file_ops table references them before they are
 * defined.
 */
static int sock_lseek(struct inode *inode, struct file *file, off_t offset,
int whence);
static int sock_read(struct inode *inode, struct file *file, char *buf,
int size);
static int sock_write(struct inode *inode, struct file *file, char *buf,
int size);
static int sock_readdir(struct inode *inode, struct file *file,
struct dirent *dirent, int count);
static void sock_close(struct inode *inode, struct file *file);
static int sock_select(struct inode *inode, struct file *file, int which, select_table *seltable);
static int sock_ioctl(struct inode *inode, struct file *file,
unsigned int cmd, unsigned long arg);
static int sock_fasync(struct inode *inode, struct file *filp, int on);
//
/*
 * Socket files have a set of 'special' operations as well as the generic file ones. These don't appear
 * in the operation structures but are done directly via the socketcall() multiplexor.
 */
/*
 * Generic file operations for socket descriptors; get_fd() installs this
 * table as file->f_op. The initializer is positional, so the order of the
 * entries has to match the member order of struct file_operations, which
 * is declared outside this file.
 */
static struct file_operations socket_file_ops = {
sock_lseek,
sock_read,
sock_write,
sock_readdir,
sock_select,
sock_ioctl,
NULL, /* mmap */
NULL, /* no special open code... */
sock_close,
NULL, /* no fsync */
sock_fasync
};
以上file_operations结构定义了普通文件操作函数集。系统中每一个打开的文件对应一个file结构,file结构中有一个指向file_operations的指针(f_op,见后面的get_fd函数)。当使用write、read函数对某个文件描述符进行读写操作时,系统首先根据文件描述符索引到其对应的file结构,然后调用file_operations中对应的函数来完成请求。
/*
 * The protocol list. Each protocol is registered in here.
 */
/*
 * Filled in by sock_register(): one operation table per address family
 * (the article cites inet_proto_ops for the INET domain and
 * unix_proto_ops for the UNIX domain as examples).
 */
static struct proto_ops *pops[NPROTO];
/*
 * Statistics counters of the socket lists
 */
/* Number of sockets currently allocated; see sock_alloc()/sock_release(). */
static int sockets_in_use = 0;
/*
 * Support routines. Move socket addresses back and forth across the kernel/user
 * divide and look after the messy bits.
 */
#define MAX_SOCK_ADDR 128 /* 108 for Unix domain - 16 for IP, 16 for IPX, about 80 for AX.25 */
/*
 * Copy a socket address from user space into a kernel buffer.
 *
 * uaddr: user-space source address.
 * ulen:  number of bytes to copy; must lie in 0..MAX_SOCK_ADDR.
 * kaddr: kernel destination buffer.
 *
 * Returns 0 on success (a zero length copies nothing), -EINVAL when the
 * length is out of range, or the negative verify_area() error when the
 * user range is not readable.
 */
static int move_addr_to_kernel(void *uaddr, int ulen, void *kaddr)
{
	int rc;

	if (ulen < 0 || ulen > MAX_SOCK_ADDR)
		return -EINVAL;

	if (ulen != 0) {
		rc = verify_area(VERIFY_READ, uaddr, ulen);
		if (rc < 0)
			return rc;
		memcpy_fromfs(kaddr, uaddr, ulen);
	}
	return 0;
}
/*
 * Copy a socket address from a kernel buffer out to user space.
 *
 * kaddr: kernel source buffer holding klen valid bytes.
 * uaddr: user-space destination buffer.
 * ulen:  user-space int; on entry the capacity of uaddr, on success it is
 *        overwritten with the number of bytes actually copied.
 *
 * The copy length is the user's capacity clamped to klen. Returns 0 on
 * success, -EINVAL when the resulting length is negative or larger than
 * MAX_SOCK_ADDR, or the negative verify_area() error for an unwritable
 * user range.
 */
static int move_addr_to_user(void *kaddr, int klen, void *uaddr, int *ulen)
{
	int rc;
	int n;

	/* The length word itself is read and later rewritten. */
	rc = verify_area(VERIFY_WRITE, ulen, sizeof(*ulen));
	if (rc < 0)
		return rc;

	n = get_fs_long(ulen);
	if (n > klen)
		n = klen;
	if (n < 0 || n > MAX_SOCK_ADDR)
		return -EINVAL;

	if (n != 0) {
		rc = verify_area(VERIFY_WRITE, uaddr, n);
		if (rc < 0)
			return rc;
		memcpy_tofs(uaddr, kaddr, n);
	}
	put_fs_long(n, ulen);
	return 0;
}
/*
 * Obtain the first available file descriptor of the current process and
 * set it up as a socket file.
 *
 * inode: inode backing the socket; may be NULL, in which case no inode
 *        reference is taken.
 *
 * Returns the new descriptor number, or -1 when no file structure or no
 * free descriptor slot is available.
 */
static int get_fd(struct inode *inode)
{
	int fd;
	struct file *file;

	/* Allocate the file structure first. */
	file = get_empty_filp();
	if (!file)
		return(-1);

	/* Find the lowest unused descriptor slot. */
	for (fd = 0; fd < NR_OPEN; ++fd)
		if (!current->files->fd[fd])
			break;
	if (fd == NR_OPEN)
	{
		/*
		 * Descriptor table is full. Resetting the use count presumably
		 * hands the file structure back to the free pool - TODO confirm
		 * against get_empty_filp().
		 */
		file->f_count = 0;
		return(-1);
	}

	/* A fresh descriptor must not inherit a stale close-on-exec bit. */
	FD_CLR(fd, &current->files->close_on_exec);
	current->files->fd[fd] = file;
	file->f_op = &socket_file_ops;
	file->f_mode = 3;
	file->f_flags = O_RDWR;
	file->f_count = 1;
	file->f_inode = inode;
	if (inode)
		inode->i_count++;
	file->f_pos = 0;
	return(fd);
}
/*
 * Go from an inode to its socket slot.
 *
 * The socket structure is embedded in the inode (u.socket_i), so this is
 * plain address arithmetic; the inode is not checked for being a socket.
 */
inline struct socket *socki_lookup(struct inode *inode)
{
	struct socket *slot = &inode->u.socket_i;

	return slot;
}
/*
 * Go from a file descriptor number to its socket slot.
 *
 * fd:    descriptor of the current process.
 * pfile: if non-NULL, receives the file structure behind fd.
 *
 * Returns the socket, or NULL when fd is out of range, not open, has no
 * inode, or its inode is not marked as a socket (i_sock).
 */
static inline struct socket *sockfd_lookup(int fd, struct file **pfile)
{
	struct file *filp;
	struct inode *ino;

	if (fd < 0 || fd >= NR_OPEN)
		return NULL;
	filp = current->files->fd[fd];
	if (filp == NULL)
		return NULL;

	ino = filp->f_inode;
	if (ino == NULL || !ino->i_sock)
		return NULL;

	if (pfile != NULL)
		*pfile = filp;
	return socki_lookup(ino);
}
/*
 * Allocate a socket.
 *
 * A free inode is obtained and marked as a socket owned by the current
 * user; the socket structure embedded in that inode is then reset to the
 * unconnected state and the global in-use counter is bumped.
 *
 * Returns the initialised socket, or NULL when no inode is available.
 * The caller still has to fill in sock->type and sock->ops.
 */
struct socket *sock_alloc(void)
{
	struct inode *ino = get_empty_inode();
	struct socket *s;

	if (ino == NULL)
		return NULL;

	/* Dress the inode up as a socket. */
	ino->i_mode = S_IFSOCK;
	ino->i_sock = 1;
	ino->i_uid = current->uid;
	ino->i_gid = current->gid;

	/* Reset the embedded socket structure. */
	s = &ino->u.socket_i;
	s->state = SS_UNCONNECTED;
	s->flags = 0;
	s->ops = NULL;
	s->data = NULL;
	s->conn = NULL;
	s->iconn = NULL;
	s->next = NULL;
	s->wait = &ino->i_wait;
	s->inode = ino;		/* "backlink": we could use pointer arithmetic instead */
	s->fasync_list = NULL;

	sockets_in_use++;
	return s;
}
/*
 * Tell a peer socket that the other side is going away: mark it as
 * disconnecting, wake anything sleeping on its wait queue and raise the
 * asynchronous notification (sock_wake_async with how == 1).
 */
static inline void sock_release_peer(struct socket *peer)
{
	peer->state = SS_DISCONNECTING;

	wake_up_interruptible(peer->wait);
	sock_wake_async(peer, 1);
}
/*
 * Release a socket.
 *
 * Every client still queued on the incomplete-connection list is told we
 * are going away, the protocol gets a chance to release its state, a
 * connected peer (if any) is notified, and finally the backing inode
 * reference is dropped.
 */
void sock_release(struct socket *sock)
{
	int oldstate;
	struct socket *peersock, *nextsock;

	if ((oldstate = sock->state) != SS_UNCONNECTED)
		sock->state = SS_DISCONNECTING;

	/*
	 * Wake up anyone waiting for connections. The next pointer is read
	 * before the peer is notified.
	 */
	for (peersock = sock->iconn; peersock; peersock = nextsock)
	{
		nextsock = peersock->next;
		sock_release_peer(peersock);
	}

	/*
	 * Wake up anyone we're connected to. First, we release the
	 * protocol, to give it a chance to flush data, etc.
	 */
	peersock = (oldstate == SS_CONNECTED) ? sock->conn : NULL;
	if (sock->ops)
		sock->ops->release(sock, peersock);
	if (peersock)
		sock_release_peer(peersock);
	--sockets_in_use;	/* Bookkeeping.. */
	iput(SOCK_INODE(sock));
}
/*
 * Sockets are not seekable: every lseek() on a socket descriptor fails
 * with -ESPIPE, whatever the arguments.
 */
static int sock_lseek(struct inode *inode, struct file *file, off_t offset, int whence)
{
	return -ESPIPE;
}
/*
* Read data from a socket. ubuf is a user mode pointer. We make sure the user
* area ubuf…ubuf+size-1 is writable before asking the protocol.
*/
//从套接字中读取数据,Ubuf是一个用户空间的指针,当确定协议之前,我们要确定用户空间地址
Ubuf到ubuf+size-1是可以写得。
static int sock_read(struct inode *inode, struct file *file, char *ubuf, int size)
{
struct socket *sock;
int err;
if (!(sock = socki_lookup(inode)))
{
printk(“NET: sock_read: can’t find socket for inode!/n”);
return(-EBADF);
}
if (sock->flags & SO_ACCEPTCON)
return(-EINVAL);
if(size<0)
return -EINVAL;
if(size==0)
return 0;
if ((err=verify_area(VERIFY_WRITE,ubuf,size))<0) ///这一步实现检查。
return err;
return(sock->ops->read(sock, ubuf, size, (file->f_flags & O_NONBLOCK)));
}
/*
 * Write data to a socket through the generic write() path.
 *
 * We verify that the user area ubuf..ubuf+size-1 is readable by the user
 * process before handing it to the protocol.
 *
 * Returns the protocol's write result, 0 for a zero-length request,
 * -EBADF when no socket is found for the inode, -EINVAL for a listening
 * socket or a negative size, or the verify_area() error.
 */
static int sock_write(struct inode *inode, struct file *file, char *ubuf, int size)
{
	struct socket *sock;
	int err;

	if (!(sock = socki_lookup(inode)))
	{
		printk("NET: sock_write: can't find socket for inode!\n");
		return(-EBADF);
	}
	/* A listening socket only accepts connections; it carries no data. */
	if (sock->flags & SO_ACCEPTCON)
		return(-EINVAL);
	if(size<0)
		return -EINVAL;
	if(size==0)
		return 0;
	if ((err=verify_area(VERIFY_READ,ubuf,size))<0)
		return err;
	return(sock->ops->write(sock, ubuf, size,(file->f_flags & O_NONBLOCK)));
}
/*
 * You can't read directories from a socket: the readdir file operation
 * always fails with -EBADF.
 */
static int sock_readdir(struct inode *inode, struct file *file, struct dirent *dirent,
			int count)
{
	return -EBADF;
}
/*
 * ioctl() on a socket descriptor.
 *
 * With an ioctl, arg may well be a user mode pointer, but we don't know
 * what to do with it - that is still up to the protocol, so the request is
 * passed through untouched.
 *
 * Declared static to agree with the forward declaration used by
 * socket_file_ops.
 *
 * Returns the protocol's ioctl result, or -EBADF when no socket is found
 * for the inode.
 */
static int sock_ioctl(struct inode *inode, struct file *file, unsigned int cmd,
		      unsigned long arg)
{
	struct socket *sock;

	if (!(sock = socki_lookup(inode)))
	{
		printk("NET: sock_ioctl: can't find socket for inode!\n");
		return(-EBADF);
	}
	return(sock->ops->ioctl(sock, cmd, arg));
}
/*
 * select() support for socket descriptors.
 *
 * Returns the protocol's select result, or 0 ("not ready") when there is
 * no socket for the inode or the protocol supplies no select operation.
 */
static int sock_select(struct inode *inode, struct file *file, int sel_type, select_table * wait)
{
	struct socket *sock;

	if (!(sock = socki_lookup(inode)))
	{
		printk("NET: sock_select: can't find socket for inode!\n");
		return(0);
	}
	/*
	 * We can't return errors to select, so it's either yes or no.
	 */
	if (sock->ops && sock->ops->select)
		return(sock->ops->select(sock, sel_type, wait));
	return(0);
}
/*
 * Close a socket file: drop this file's asynchronous-notification entry,
 * then release the socket (sock_release(), which in turn notifies peers
 * through sock_release_peer()).
 *
 * Declared static to agree with the forward declaration used by
 * socket_file_ops.
 */
static void sock_close(struct inode *inode, struct file *filp)
{
	struct socket *sock;

	/*
	 * It's possible the inode is NULL if we're closing an unfinished socket.
	 */
	if (!inode)
		return;
	if (!(sock = socki_lookup(inode)))
	{
		printk("NET: sock_close: can't find socket for inode!\n");
		return;
	}
	sock_fasync(inode, filp, 0);
	sock_release(sock);
}
/*
 * Update the socket async list.
 *
 * on != 0: make sure filp has an entry on the socket's fasync list,
 *          adding a new one at the head if it is not there yet.
 * on == 0: remove filp's entry from the list, if present.
 *
 * Returns 0, or -ENOMEM when the new entry cannot be allocated.
 */
static int sock_fasync(struct inode *inode, struct file *filp, int on)
{
struct fasync_struct *fa, *fna=NULL, **prev;
struct socket *sock;
unsigned long flags;
if (on) /* the new entry is allocated before interrupts are disabled */
{
fna=(struct fasync_struct *)kmalloc(sizeof(struct fasync_struct), GFP_KERNEL);
if(fna==NULL)
return -ENOMEM;
}
sock = socki_lookup(inode);
prev=&(sock->fasync_list);
/* The list is searched and modified with interrupts off. */
save_flags(flags);
cli();
/* Look for an existing entry for filp; prev tracks the link pointing at fa. */
for(fa=*prev; fa!=NULL; prev=&fa->fa_next,fa=*prev)
if(fa->fa_file==filp)
break;
if(on)
{
if(fa!=NULL)
{
/* Already registered: the pre-allocated entry is not needed. */
kfree_s(fna,sizeof(struct fasync_struct));
restore_flags(flags);
return 0;
}
/* Link the new entry in at the head of the list. */
fna->fa_file=filp;
fna->magic=FASYNC_MAGIC;
fna->fa_next=sock->fasync_list;
sock->fasync_list=fna;
}
else
{
if(fa!=NULL)
{
/* Unlink the entry through prev and free it. */
*prev=fa->fa_next;
kfree_s(fa,sizeof(struct fasync_struct));
}
}
restore_flags(flags);
return 0;
}
/*
 * Send SIGIO (via kill_fasync) to the files registered on a socket's
 * asynchronous notification list.
 *
 * how == 0: signal unconditionally.
 * how == 1: signal only when SO_WAITDATA is not set in sock->flags.
 * how == 2: signal only when SO_NOSPACE is set, and clear that flag.
 * Any other value signals nothing.
 *
 * Returns -1 when sock is NULL or has no async list, otherwise 0.
 */
int sock_wake_async(struct socket *sock, int how)
{
	if (!sock || !sock->fasync_list)
		return -1;

	if (how == 0) {
		kill_fasync(sock->fasync_list, SIGIO);
	} else if (how == 1) {
		if (!(sock->flags & SO_WAITDATA))
			kill_fasync(sock->fasync_list, SIGIO);
	} else if (how == 2) {
		if (sock->flags & SO_NOSPACE) {
			kill_fasync(sock->fasync_list, SIGIO);
			sock->flags &= ~SO_NOSPACE;
		}
	}
	return 0;
}
/*
 * Wait for a connection.
 *
 * Queues mysock at the tail of servsock's incomplete-connection list,
 * wakes the server and, unless O_NONBLOCK is set in flags, sleeps until
 * woken.
 *
 * Returns 0 once connected (or when woken in state SS_DISCONNECTING),
 * -EINVAL when servsock is not listening, -EINPROGRESS for a non-blocking
 * caller that is not connected yet, -EINTR when interrupted while still
 * queued, and -EACCES when rejected (mysock->conn was cleared).
 */
int sock_awaitconn(struct socket *mysock, struct socket *servsock, int flags)
{
struct socket *last;
/*
 * We must be listening
 */
/* The server must have SO_ACCEPTCON set, i.e. be ready to take connections. */
if (!(servsock->flags & SO_ACCEPTCON))
{
return(-EINVAL);
}
/*
 * Put ourselves on the server's incomplete connection queue.
 * Done with interrupts disabled.
 */
mysock->next = NULL;
cli();
if (!(last = servsock->iconn))
servsock->iconn = mysock;
else
{
while (last->next)
last = last->next;
last->next = mysock;
}
mysock->state = SS_CONNECTING;
mysock->conn = servsock;
sti();
/*
 * Wake up server, then await connection. server will set state to
 * SS_CONNECTED if we're connected.
 */
wake_up_interruptible(servsock->wait);
sock_wake_async(servsock, 0);
if (mysock->state != SS_CONNECTED)
{
if (flags & O_NONBLOCK)
return -EINPROGRESS;
interruptible_sleep_on(mysock->wait);
if (mysock->state != SS_CONNECTED &&
mysock->state != SS_DISCONNECTING)
{
/*
 * if we're not connected we could have been
 * 1) interrupted, so we need to remove ourselves
 * from the server list
 * 2) rejected (mysock->conn == NULL), and have
 * already been removed from the list
 */
if (mysock->conn == servsock)
{
/* Case 1: unlink mysock from the server's queue. */
cli();
if ((last = servsock->iconn) == mysock)
servsock->iconn = mysock->next;
else
{
while (last->next != mysock)
last = last->next;
last->next = mysock->next;
}
sti();
}
return(mysock->conn ? -EINTR : -EACCES);
}
}
return(0);
}
/*
 * Perform the socket system call. We locate the appropriate family, then
 * create a fresh socket and bind it to a new file descriptor.
 *
 * Returns the new descriptor, -EINVAL for an unregistered family, an
 * unknown type, a negative protocol or when no descriptor can be
 * obtained, -ENOSR when no socket can be allocated, or the error returned
 * by the family's create operation.
 */
static int sock_socket(int family, int type, int protocol)
{
	int i, fd;
	struct socket *sock;
	struct proto_ops *ops;

	/* Locate the correct protocol family. */
	for (i = 0; i < NPROTO; ++i)
	{
		if (pops[i] == NULL) continue;
		/* Match on the family the table was registered under. */
		if (pops[i]->family == family)
			break;
	}
	if (i == NPROTO)
	{
		return -EINVAL;
	}
	ops = pops[i];

	/*
	 * Check that this is a type that we know how to manipulate and
	 * the protocol makes sense here. The family can still reject the
	 * protocol later.
	 */
	if ((type != SOCK_STREAM && type != SOCK_DGRAM &&
	     type != SOCK_SEQPACKET && type != SOCK_RAW &&
	     type != SOCK_PACKET) || protocol < 0)
		return(-EINVAL);

	/*
	 * Allocate the socket and allow the family to set things up. if
	 * the protocol is 0, the family is instructed to select an appropriate
	 * default.
	 */
	if (!(sock = sock_alloc()))
	{
		printk("NET: sock_socket: no more sockets\n");
		return(-ENOSR);	/* Was: EAGAIN, but we are out of
				   system resources! */
	}
	sock->type = type;
	sock->ops = ops;
	if ((i = sock->ops->create(sock, protocol)) < 0)
	{
		sock_release(sock);
		return(i);
	}

	/* Attach a file structure and descriptor to the socket's inode. */
	if ((fd = get_fd(SOCK_INODE(sock))) < 0)
	{
		sock_release(sock);
		return(-EINVAL);
	}
	return(fd);
}
/*
 * Create a pair of connected sockets.
 *
 * Two sockets of the requested family/type/protocol are created, the
 * family's socketpair operation joins them, and the two descriptors are
 * written to the user array usockvec. According to the article this is
 * only used by the UNIX domain, for local inter-process communication.
 *
 * Returns 0 on success. Errors: the failure of the first sock_socket(),
 * -EINVAL when the family has no socketpair operation or the second
 * socket cannot be created, the socketpair operation's error, or the
 * verify_area() error for usockvec. Descriptors opened so far are closed
 * again on failure.
 */
static int sock_socketpair(int family, int type, int protocol, unsigned long usockvec[2])
{
	struct socket *first, *second;
	int fd_a, fd_b;
	int rc;

	/*
	 * Obtain the first socket and check if the underlying protocol
	 * supports the socketpair call.
	 */
	fd_a = sock_socket(family, type, protocol);
	if (fd_a < 0)
		return fd_a;
	first = sockfd_lookup(fd_a, NULL);
	if (!first->ops->socketpair) {
		sys_close(fd_a);
		return -EINVAL;
	}

	/*
	 * Now grab another socket and try to connect the two together.
	 */
	fd_b = sock_socket(family, type, protocol);
	if (fd_b < 0) {
		sys_close(fd_a);
		return -EINVAL;
	}
	second = sockfd_lookup(fd_b, NULL);

	rc = first->ops->socketpair(first, second);
	if (rc < 0)
		goto close_both;

	first->conn = second;
	second->conn = first;
	first->state = SS_CONNECTED;
	second->state = SS_CONNECTED;

	rc = verify_area(VERIFY_WRITE, usockvec, 2 * sizeof(int));
	if (rc)
		goto close_both;

	put_fs_long(fd_a, &usockvec[0]);
	put_fs_long(fd_b, &usockvec[1]);
	return 0;

close_both:
	sys_close(fd_a);
	sys_close(fd_b);
	return rc;
}
/*
 * Bind a name to a socket. Nothing much to do here since it's the
 * protocol's responsibility to handle the local address.
 *
 * The socket address is moved to kernel space (which also checks it)
 * before the protocol layer is called.
 *
 * Returns 0 on success, -EBADF for a bad descriptor, -ENOTSOCK when the
 * descriptor is not a socket, the move_addr_to_kernel() error, or the
 * protocol's negative bind result.
 */
static int sock_bind(int fd, struct sockaddr *umyaddr, int addrlen)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	rc = move_addr_to_kernel(umyaddr, addrlen, address);
	if (rc < 0)
		return rc;

	rc = sock->ops->bind(sock, (struct sockaddr *)address, addrlen);
	if (rc < 0)
		return rc;
	return 0;
}
/*
 * Perform a listen. The protocol is allowed to do anything it needs for
 * a listen (its return value is not examined), and the socket is then
 * marked as accepting connections.
 *
 * Returns 0 on success, -EBADF for a bad descriptor, -ENOTSOCK when the
 * descriptor is not a socket, or -EINVAL when the socket is not in the
 * unconnected state.
 */
static int sock_listen(int fd, int backlog)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	/* Only an unconnected socket can start listening. */
	if (sock->state != SS_UNCONNECTED)
		return -EINVAL;

	if (sock->ops != NULL && sock->ops->listen != NULL)
		sock->ops->listen(sock, backlog);

	/* From now on the socket accepts connections. */
	sock->flags |= SO_ACCEPTCON;
	return 0;
}
/*
 * For accept, we attempt to create a new socket, set up the link with the
 * client, wake up the client, then return the new connected fd. We collect
 * the address of the connector in kernel space and move it to user at the
 * very end. This is buggy because we open the socket then return an error.
 *
 * Returns the new descriptor, -EBADF for a bad descriptor, -ENOTSOCK when
 * it is not a socket, -EINVAL when the socket is not an unconnected
 * listening socket or no descriptor can be obtained, -ENOSR when no
 * socket can be allocated, or the protocol's dup/accept error.
 */
static int sock_accept(int fd, struct sockaddr *upeer_sockaddr, int *upeer_addrlen)
{
	struct file *file;
	struct socket *sock, *newsock;
	int i;
	char address[MAX_SOCK_ADDR];
	int len;

	if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))
		return(-EBADF);
	if (!(sock = sockfd_lookup(fd, &file)))
		return(-ENOTSOCK);
	if (sock->state != SS_UNCONNECTED)
	{
		return(-EINVAL);
	}
	if (!(sock->flags & SO_ACCEPTCON))
	{
		return(-EINVAL);
	}
	if (!(newsock = sock_alloc()))
	{
		printk("NET: sock_accept: no more sockets\n");
		return(-ENOSR);	/* Was: EAGAIN, but we are out of system
				   resources! */
	}
	newsock->type = sock->type;
	newsock->ops = sock->ops;

	/* Let the protocol set up the new socket from the listening one. */
	if ((i = sock->ops->dup(newsock, sock)) < 0)
	{
		sock_release(newsock);
		return(i);
	}
	i = newsock->ops->accept(sock, newsock, file->f_flags);
	if ( i < 0)
	{
		sock_release(newsock);
		return(i);
	}

	/* Hand the caller a new descriptor for the accepted connection. */
	if ((fd = get_fd(SOCK_INODE(newsock))) < 0)
	{
		sock_release(newsock);
		return(-EINVAL);
	}
	if (upeer_sockaddr)
	{
		/*
		 * Fetch the peer's address and copy it out to the user.
		 * NOTE(review): both return values are ignored here.
		 */
		newsock->ops->getname(newsock, (struct sockaddr *)address, &len, 1);
		move_addr_to_user(address,len, upeer_sockaddr, upeer_addrlen);
	}
	return(fd);
}
/*
 * Attempt to connect to a socket with the server address. The address
 * is in user space so we verify it is OK and move it to kernel space.
 *
 * The socket state then decides what happens: an unconnected or still
 * connecting socket proceeds to the protocol's connect operation, an
 * already connected socket gets -EISCONN unless it is a datagram socket,
 * and any other state is rejected with -EINVAL.
 *
 * Returns 0 on success, -EBADF / -ENOTSOCK for a bad descriptor, the
 * move_addr_to_kernel() error, or the protocol's negative connect result.
 */
static int sock_connect(int fd, struct sockaddr *uservaddr, int addrlen)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, &file);
	if (sock == NULL)
		return -ENOTSOCK;

	rc = move_addr_to_kernel(uservaddr, addrlen, address);
	if (rc < 0)
		return rc;

	switch (sock->state) {
	case SS_UNCONNECTED:
		/* This is ok... continue with connect */
	case SS_CONNECTING:
		/*
		 * Not yet connected... the protocol will check this.
		 *
		 * FIXME: for all protocols what happens if you start
		 * an async connect fork and both children connect. Clean
		 * this up in the protocols!
		 */
		break;
	case SS_CONNECTED:
		/* Hack for now - move this all into the protocol */
		if (sock->type != SOCK_DGRAM)
			return -EISCONN;
		break;
	default:
		return -EINVAL;
	}

	rc = sock->ops->connect(sock, (struct sockaddr *)address, addrlen, file->f_flags);
	if (rc < 0)
		return rc;
	return 0;
}
/*
 * Get the local address ('name') of a socket object and move the obtained
 * name to user space.
 *
 * The protocol's getname operation is called with peer == 0 (for INET
 * sockets the article names inet_getname in af_inet.c).
 *
 * Returns 0 on success, -EBADF / -ENOTSOCK for a bad descriptor, the
 * protocol's non-zero getname result, or the move_addr_to_user() error.
 */
static int sock_getsockname(int fd, struct sockaddr *usockaddr, int *usockaddr_len)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	int len;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	rc = sock->ops->getname(sock, (struct sockaddr *)address, &len, 0);
	if (rc)
		return rc;

	rc = move_addr_to_user(address, len, usockaddr, usockaddr_len);
	if (rc < 0)
		return rc;
	return 0;
}
/*
 * Get the remote address ('name') of a socket object and move the
 * obtained name to user space.
 *
 * The protocol's getname operation is called with peer == 1.
 *
 * Returns 0 on success, -EBADF / -ENOTSOCK for a bad descriptor, the
 * protocol's non-zero getname result, or the move_addr_to_user() error.
 */
static int sock_getpeername(int fd, struct sockaddr *usockaddr, int *usockaddr_len)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	int len;
	int rc;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	rc = sock->ops->getname(sock, (struct sockaddr *)address, &len, 1);
	if (rc)
		return rc;

	rc = move_addr_to_user(address, len, usockaddr, usockaddr_len);
	if (rc < 0)
		return rc;
	return 0;
}
/*
 * Send a datagram down a socket. The datagram as with write() is in user
 * space. We check it can be read.
 *
 * Returns the protocol's send result, -EBADF / -ENOTSOCK for a bad
 * descriptor, -EINVAL for a negative length, or the verify_area() error.
 */
static int sock_send(int fd, void * buff, int len, unsigned flags)
{
	struct socket *sock;
	struct file *file;
	int err;

	if (fd < 0 || fd >= NR_OPEN || ((file = current->files->fd[fd]) == NULL))
		return(-EBADF);
	if (!(sock = sockfd_lookup(fd, NULL)))
		return(-ENOTSOCK);
	if(len<0)
		return -EINVAL;
	err=verify_area(VERIFY_READ, buff, len);
	if(err)
		return err;
	/* The descriptor's O_NONBLOCK setting is passed on to the protocol. */
	return(sock->ops->send(sock, buff, len, (file->f_flags & O_NONBLOCK), flags));
}
/*
 * Send a datagram to a given address. The data area is checked for
 * readability and the destination address is moved into kernel space
 * before the protocol is invoked.
 *
 * Returns the protocol's sendto result, -EBADF / -ENOTSOCK for a bad
 * descriptor, -EINVAL for a negative length, the verify_area() error, or
 * the move_addr_to_kernel() error.
 */
static int sock_sendto(int fd, void * buff, int len, unsigned flags,
		       struct sockaddr *addr, int addr_len)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	if (len < 0)
		return -EINVAL;
	rc = verify_area(VERIFY_READ, buff, len);
	if (rc)
		return rc;

	rc = move_addr_to_kernel(addr, addr_len, address);
	if (rc < 0)
		return rc;

	return sock->ops->sendto(sock, buff, len, (file->f_flags & O_NONBLOCK),
				 flags, (struct sockaddr *)address, addr_len);
}
send和sendto的区别:
sendto可以在发送时指定远端地址,而send不能。
对于TCP来说,发送数据之前必须先与远端地址建立连接,因此发送时不需要再指定远端地址;
而UDP不需要事先建立连接,可以在每次发送时用sendto指定远端地址。
·····································································································
/*
 * Receive a datagram from a socket. This isn't really right. The BSD
 * manual pages explicitly state that recv is recvfrom with a NULL to
 * argument. The Linux stack gets the right results for the wrong reason
 * and this needs to be tidied in the inet layer and removed from here.
 * We check the buffer is writable and valid.
 *
 * Returns the protocol's recv result, 0 for a zero-length request,
 * -EBADF / -ENOTSOCK for a bad descriptor, -EINVAL for a negative length,
 * or the verify_area() error.
 */
static int sock_recv(int fd, void * buff, int len, unsigned flags)
{
	struct socket *sock;
	struct file *file;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	if (len < 0)
		return -EINVAL;
	if (len == 0)
		return 0;

	rc = verify_area(VERIFY_WRITE, buff, len);
	if (rc)
		return rc;

	return sock->ops->recv(sock, buff, len, (file->f_flags & O_NONBLOCK), flags);
}
/*
 * Receive a frame from the socket and optionally record the address of
 * the sender. The data buffer is verified to be writable, and when addr
 * is non-NULL the sender address is moved from kernel to user space.
 *
 * Returns the protocol's recvfrom result (also when it is negative), 0
 * for a zero-length request, -EBADF / -ENOTSOCK for a bad descriptor,
 * -EINVAL for a negative length, the verify_area() error, or the
 * move_addr_to_user() error.
 */
static int sock_recvfrom(int fd, void * buff, int len, unsigned flags,
			 struct sockaddr *addr, int *addr_len)
{
	char address[MAX_SOCK_ADDR];
	struct socket *sock;
	struct file *file;
	int alen;
	int rc;

	if (fd < 0 || fd >= NR_OPEN)
		return -EBADF;
	file = current->files->fd[fd];
	if (file == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	if (len < 0)
		return -EINVAL;
	if (len == 0)
		return 0;

	rc = verify_area(VERIFY_WRITE, buff, len);
	if (rc)
		return rc;

	/* From here on len holds the protocol's result, as in the caller's view. */
	len = sock->ops->recvfrom(sock, buff, len, (file->f_flags & O_NONBLOCK),
				  flags, (struct sockaddr *)address, &alen);
	if (len < 0)
		return len;

	if (addr != NULL) {
		rc = move_addr_to_user(address, alen, addr, addr_len);
		if (rc < 0)
			return rc;
	}
	return len;
}
Sock_recv和sock_recvfrom区别:
Sock_recvfrom可以同时返回远端地址
·····································································································
/*
 * Set a socket option. Because we don't know the option lengths we have
 * to pass the user mode parameter for the protocols to sort out.
 *
 * Returns the protocol's setsockopt result, -EBADF for a bad descriptor,
 * or -ENOTSOCK when the descriptor is not a socket.
 */
static int sock_setsockopt(int fd, int level, int optname, char *optval, int optlen)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	return sock->ops->setsockopt(sock, level, optname, optval, optlen);
}
/*
 * Get a socket option. Because we don't know the option lengths we have
 * to pass a user mode parameter for the protocols to sort out.
 *
 * Returns the protocol's getsockopt result, 0 when the socket has no
 * operation table or no getsockopt operation, -EBADF for a bad
 * descriptor, or -ENOTSOCK when the descriptor is not a socket.
 */
static int sock_getsockopt(int fd, int level, int optname, char *optval, int *optlen)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	if (sock->ops == NULL || sock->ops->getsockopt == NULL)
		return 0;

	return sock->ops->getsockopt(sock, level, optname, optval, optlen);
}
/*
 * Shutdown a socket: the request is handed straight to the protocol.
 *
 * Returns the protocol's shutdown result, -EBADF for a bad descriptor,
 * or -ENOTSOCK when the descriptor is not a socket.
 */
static int sock_shutdown(int fd, int how)
{
	struct socket *sock;

	if (fd < 0 || fd >= NR_OPEN || current->files->fd[fd] == NULL)
		return -EBADF;

	sock = sockfd_lookup(fd, NULL);
	if (sock == NULL)
		return -ENOTSOCK;

	return sock->ops->shutdown(sock, how);
}
/*
 * Perform a file control on a socket file descriptor.
 *
 * Returns the protocol's fcntl result, or -EINVAL when the socket, its
 * operation table or the fcntl operation is missing.
 */
int sock_fcntl(struct file *filp, unsigned int cmd, unsigned long arg)
{
	struct socket *sock = socki_lookup (filp->f_inode);

	if (sock == NULL || sock->ops == NULL || sock->ops->fcntl == NULL)
		return -EINVAL;

	return sock->ops->fcntl(sock, cmd, arg);
}
/*
 * System call vectors. Since I (RIB) want to rewrite sockets as streams,
 * we have this level of indirection. Not a lot of overhead, since more of
 * the work is done via read/write/select directly.
 *
 * I'm now expanding this up to a higher level to separate the assorted
 * kernel/user space manipulations and global assumptions from the protocol
 * layers proper - AC.
 */
/*
 * Entry point of the socket system call multiplexor.
 *
 * call: selects the socket operation (SYS_SOCKET, SYS_BIND, ...).
 * args: user-space array holding that operation's arguments as longs.
 *
 * The argument words are checked for readability, fetched from user
 * space and passed to the matching sock_*() routine.
 *
 * Returns that routine's result, the verify_area() error when the
 * argument array is not readable, or -EINVAL for an unknown call.
 */
asmlinkage int sys_socketcall(int call, unsigned long *args)
{
	unsigned long a[6];
	int nargs;
	int k;
	int er;

	/* How many argument words does this call take? */
	switch (call)
	{
	case SYS_LISTEN:
	case SYS_SHUTDOWN:
		nargs = 2;
		break;
	case SYS_SOCKET:
	case SYS_BIND:
	case SYS_CONNECT:
	case SYS_ACCEPT:
	case SYS_GETSOCKNAME:
	case SYS_GETPEERNAME:
		nargs = 3;
		break;
	case SYS_SOCKETPAIR:
	case SYS_SEND:
	case SYS_RECV:
		nargs = 4;
		break;
	case SYS_SETSOCKOPT:
	case SYS_GETSOCKOPT:
		nargs = 5;
		break;
	case SYS_SENDTO:
	case SYS_RECVFROM:
		nargs = 6;
		break;
	default:
		return(-EINVAL);
	}

	er = verify_area(VERIFY_READ, args, nargs * sizeof(unsigned long));
	if (er)
		return er;
	for (k = 0; k < nargs; k++)
		a[k] = get_fs_long(args + k);

	switch (call)
	{
	case SYS_SOCKET:
		return(sock_socket(a[0], a[1], a[2]));
	case SYS_BIND:
		return(sock_bind(a[0], (struct sockaddr *)a[1], a[2]));
	case SYS_CONNECT:
		return(sock_connect(a[0], (struct sockaddr *)a[1], a[2]));
	case SYS_LISTEN:
		return(sock_listen(a[0], a[1]));
	case SYS_ACCEPT:
		return(sock_accept(a[0], (struct sockaddr *)a[1], (int *)a[2]));
	case SYS_GETSOCKNAME:
		return(sock_getsockname(a[0], (struct sockaddr *)a[1], (int *)a[2]));
	case SYS_GETPEERNAME:
		return(sock_getpeername(a[0], (struct sockaddr *)a[1], (int *)a[2]));
	case SYS_SOCKETPAIR:
		return(sock_socketpair(a[0], a[1], a[2], (unsigned long *)a[3]));
	case SYS_SEND:
		return(sock_send(a[0], (void *)a[1], a[2], a[3]));
	case SYS_SENDTO:
		return(sock_sendto(a[0], (void *)a[1], a[2], a[3],
				   (struct sockaddr *)a[4], a[5]));
	case SYS_RECV:
		return(sock_recv(a[0], (void *)a[1], a[2], a[3]));
	case SYS_RECVFROM:
		return(sock_recvfrom(a[0], (void *)a[1], a[2], a[3],
				     (struct sockaddr *)a[4], (int *)a[5]));
	case SYS_SHUTDOWN:
		return(sock_shutdown(a[0], a[1]));
	case SYS_SETSOCKOPT:
		return(sock_setsockopt(a[0], a[1], a[2], (char *)a[3], a[4]));
	case SYS_GETSOCKOPT:
		return(sock_getsockopt(a[0], a[1], a[2], (char *)a[3], (int *)a[4]));
	}
	/* Not reached: unknown calls were rejected by the first switch. */
	return(-EINVAL);
}
/*
 * This function is called by a protocol handler that wants to advertise
 * its address family, and have it linked into the SOCKET module.
 *
 * The operation table is stored in the first free pops[] slot and its
 * family field is set to the given family; the table is updated with
 * interrupts disabled.
 *
 * Returns the slot index used, or -ENOMEM when every slot is taken.
 */
int sock_register(int family, struct proto_ops *ops)
{
	int slot;

	cli();
	for (slot = 0; slot < NPROTO; slot++)
		if (pops[slot] == NULL)
			break;

	if (slot == NPROTO) {
		sti();
		return -ENOMEM;
	}

	pops[slot] = ops;
	pops[slot]->family = family;
	sti();
	return slot;
}
/*
 * This function is called by a protocol handler that wants to remove its
 * address family, and have it unlinked from the SOCKET module.
 *
 * The first pops[] slot registered for the family is cleared, with
 * interrupts disabled.
 *
 * Returns the index of the cleared slot, or -ENOENT when the family is
 * not registered.
 */
int sock_unregister(int family)
{
	int slot;

	cli();
	for (slot = 0; slot < NPROTO; slot++) {
		if (pops[slot] != NULL && pops[slot]->family == family) {
			pops[slot] = NULL;
			sti();
			return slot;
		}
	}
	sti();
	return -ENOENT;
}
/*
 * Initialise the configured network protocols: walk the protocols[]
 * table, which ends at the first entry whose name is NULL, and call each
 * entry's bootstrap routine with the entry itself as argument.
 */
void proto_init(void)
{
	extern struct net_proto protocols[];	/* Network protocols */
	struct net_proto *entry;

	/* Kick all configured protocols. */
	for (entry = protocols; entry->name != NULL; entry++)
		(*entry->init_func)(entry);

	/* We're all done... */
}
/*
 * Main initialisation entry of the network stack; according to the
 * article it is called from start_kernel.
 *
 * Clears the address-family table, initialises the protocols and, when
 * CONFIG_NET is set, the device layer and the network bottom half.
 */
void sock_init(void)
{
	int i;

	printk("Swansea University Computer Society NET3.019\n");

	/*
	 * Initialize all address (protocol) families.
	 */
	for (i = 0; i < NPROTO; ++i) pops[i] = NULL;

	/*
	 * Initialize the protocols module.
	 */
	proto_init();

#ifdef CONFIG_NET
	/*
	 * Initialize the DEV module.
	 */
	dev_init();

	/*
	 * And the bottom half handler
	 */
	bh_base[NET_BH].routine= net_bh;
	enable_bh(NET_BH);
#endif
}
int socket_get_info(char *buffer, char **start, off_t offset, int length)
{
int len = sprintf(buffer, ”sockets: used %d/n”, sockets_in_use);
if (offset >= len)
{
*start = buffer;
return 0;
}
*start = buffer + offset;
len -= offset;
if (len > length)
len = length;
return len;
}