原文地址: http://www.penna.cn/blog/?p=218
UDP发送:
| sys_write fs/read_write.c
| sock_writev net/socket.c
| sock_sendmsg net/socket.c
| inet_sendmsg net/ipv4/af_inet.c
| udp_sendmsg net/ipv4/udp.c
| ip_build_xmit net/ipv4/ip_output.c
| output_maybe_reroute net/ipv4/ip_output.c
| ip_output net/ipv4/ip_output.c
| ip_finish_output net/ipv4/ip_output.c
| dev_queue_xmit net/core/dev.c
| ——————————————–
| el3_start_xmit drivers/net/3c509.c
V
write()
e.g. write(sockfd,"Hello",strlen("Hello"));
user
————————————————–
kernel
sys_write() <fs/read_write.c>
asmlinkage ssize_t sys_write(unsigned int fd,const char __user * buf,size_t count)
…
ret = vfs_write(file,buf,count,&pos);
…
vfs_write()
…
if (file->f_op->write)
ret = file->f_op->write(file,buf,count,pos);
…
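//在前面建立socket时sock_map_fd将sock对应file的f_op定义为socket_file_ops。
//注意socket_file_ops没有定义.write,只定义了.aio_write,因此vfs_write()实际走的是else分支:
//do_sync_write() -> file->f_op->aio_write(),即sock_aio_write()。参见: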
static const struct file_operations socket_file_ops ={
…
.aio_write = sock_aio_write,
…
}
sock_aio_write()//与之前的版本不同了。。。
do_sock_write()
__sock_sendmsg()
static inline int __sock_sendmsg(struct kiocb *iocb,struct socket *sock,struct msghdr *msg,size_t size)
…
return sock->ops->sendmsg(iocb,sock,msg,size);
…
//sock->ops在inet_create函数中被初始化,参见:
inet_create() <net/ipv4/af_inet.c>
static struct inet_protosw inetsw_array[] <net/ipv4/af_inet.c>
<net/ipv4/af_inet.c>
const struct proto_ops inet_stream_ops ={
…
.sendmsg = tcp_sendmsg,
…
}
<net/ipv4/af_inet.c>
const struct proto_ops inet_dgram_ops ={
…
.sendmsg = inet_sendmsg,
…
}
————————————————–
UDP
inet_sendmsg(…)
int inet_sendmsg(struct kiocb *iocb,struct socket *sock,struct msghdr *msg,size_t size)
…
return sk->sk_prot->sendmsg(iocb,sk,msg,size);
…
…
udp_sendmsg(…)
ip_route_output_flow()
这里进行路由!参见5、路由和ARP
ip_append_data()
* ip_append_data() and ip_append_page() can make one large IP datagram
* from many pieces of data. Each pieces will be holded on the socket
* until ip_push_pending_frames() is called. Each piece can be a page
* or non-page data.
*
* Not only UDP,other transport protocols –e.g. raw sockets –can use
* this interface potentially.
*
* LATER:length must be adjusted by pad at tail,when it is required.
udp_push_pending_frames()
udp_push_pending_frames()
* Push out all pending data as one UDP datagram. Socket is locked.
————————————————–
TCP
tcp_transmit_skb()
err = icsk->icsk_af_ops->queue_xmit(skb,0);
tcp_transmit_skb 引用表:
tcp_mtu_probe
tcp_write_xmit
tcp_push_one
tcp_retransmit_skb
tcp_send_active_reset
tcp_send_synack
tcp_connect
tcp_send_ack
tcp_xmit_probe_skb
tcp_write_wakeup
ip_queue_xmit() ip_send_reply() ip_build_and_send_pkt()
int ip_queue_xmit(struct sk_buff *skb,int ipfragok)
这里有route过程
ip_route_output_flow(…)
*dccp int ip_build_and_send_pkt(struct sk_buff *skb,struct sock *sk,__be32 saddr,__be32 daddr,struct ip_options *opt)
Add an ip header to a skbuff and send it out.
void ip_send_reply(struct sock *sk,struct sk_buff *skb,struct ip_reply_arg *arg,unsigned int len) <net/ipv4/ip_output.c>
* Generic function to send a packet as reply to another packet.
* Used to send TCP resets so far. ICMP should use this function too.
* Should run single threaded per socket because it uses the sock
* structure to pass arguments.
这里有用到ip_route_output_key()进行路由。
int ip_push_pending_frames(struct sock *sk);
Combined all pending IP fragments on the socket as one IP datagram
and push them out.
ip_local_out();
————————————————–
IP
ip_push_pending_frames()
ip_local_out() <ip_output.c>
int ip_local_out(struct sk_buff *skb)
{
int err;
err = __ip_local_out(skb);
if (likely(err == 1))
err = dst_output(skb);
return err;
}
EXPORT_SYMBOL_GPL(ip_local_out);
__ip_local_out() <ip_output.c>
int __ip_local_out(struct sk_buff *skb)
{
struct iphdr *iph = ip_hdr(skb);
iph->tot_len = htons(skb->len);
ip_send_check(iph);
return nf_hook(PF_INET,NF_INET_LOCAL_OUT,skb,NULL,skb->dst->dev,
dst_output);
}
dst_output() <include/net/dst.h>
static inline int dst_output(struct sk_buff *skb)
{
return skb->dst->output(skb);
}
其中dst->output() = ip_output();在__mkroute_output()和__mkroute_input()中注册。
ip_output() <net/ipv4/ip_output.c>
return NF_HOOK_COND(PF_INET,NF_INET_POST_ROUTING,skb,NULL,dev,
ip_finish_output,
!(IPCB(skb)->flags &IPSKB_REROUTED));
ip_finish_output() <net/ipv4/ip_output.c>
static int ip_finish_output(struct sk_buff *skb)
{
#if defined(CONFIG_NETFILTER) &&defined(CONFIG_XFRM)
if (skb->dst->xfrm != NULL){
IPCB(skb)->flags |= IPSKB_REROUTED;
return dst_output(skb);
}
#endif
if (skb->len >ip_skb_dst_mtu(skb) &&!skb_is_gso(skb))
return ip_fragment(skb,ip_finish_output2);
else
return ip_finish_output2(skb);
}
ip_finish_output2() <net/ipv4/ip_output.c>
…
if (dst->hh)
return neigh_hh_output(dst->hh,skb);
else if (dst->neighbour)
return dst->neighbour->output(skb);
…
// 此函数在neigh_alloc中注册为neigh_blackhole(),但这个是默认的,一般会被替换掉
static struct neighbour *neigh_alloc(struct neigh_table *tbl)
…
n->output = neigh_blackhole;
…
// 此函数在arp_constructor中注册为ops->queue_xmit或ops->output或connected_output
static int arp_constructor(struct neighbour *neigh)
…
.output = neigh_resolve_output,
.connected_output = neigh_connected_output,
.queue_xmit = dev_queue_xmit,
…
// 故一般为neigh_resolve_output
neigh_resolve_output() <net/core/neighbour.c>
…
err = dev_hard_header(skb,dev,ntohs(skb->protocol),neigh->ha,NULL,skb->len);
// 这里有Mac头填充的动作,参见路由和ARP
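// 说明此前neigh->ha已获取。如果neigh->ha为空呢?
// 答:neigh_resolve_output()在填充Mac头之前会先调用neigh_event_send();
// 若邻居项尚未解析(非NUD_VALID状态),skb会被挂到neigh->arp_queue上并触发ARP请求,
// 待收到ARP应答、neigh->ha更新后再把队列中的skb发送出去。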
…
if (err >= 0)
rc = neigh->ops->queue_xmit(skb);// 此函数注册为dev_queue_xmit()
…
dev_queue_xmit() <net/core/dev.c>
int dev_queue_xmit(struct sk_buff *skb)
…
if (!netif_queue_stopped(dev) &&
!netif_subqueue_stopped(dev,skb)){
rc = 0;
if (!dev_hard_start_xmit(skb,dev)){
HARD_TX_UNLOCK(dev);
goto out;
}
}
…
dev_hard_start_xmit() <net/core/dev.c>
int dev_hard_start_xmit(struct sk_buff *skb,struct net_device *dev)
…
return dev->hard_start_xmit(skb,dev);
…
xxx_start_xmit() <drivers/net/xxx.c>
…