Ake Koomsin

Digging Around FreeBSD Socket API TCP Send Path

When we want to send data across a network, it involves either

  • write()/writev()
  • send()/sendto()/sendmsg()

and sockets.

To use write(), writev() and send(), the socket must be connected. On the other hand, sendto() and sendmsg() can be used on both connected and unconnected sockets.

We normally can find the implementation of system calls in the kernel by adding ‘sys_’ prefix to the name of the system call except send(). In FreeBSD libc, send() is just a wrapper of sendto() with some default parameters.

Let’s take a look at sys_write().

In sys/kern/sys_generic.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
/*
 * write(2) system call entry point.
 *
 * Describes the caller's single buffer as a one-element iovec, wraps it
 * in a uio, and passes it to kern_writev(), so write() shares the
 * writev() code path.
 *
 * Returns EINVAL when uap->nbyte exceeds IOSIZE_MAX; otherwise returns
 * whatever kern_writev() returns.
 */
int
sys_write(struct thread *td, struct write_args *uap)
{
    struct iovec aiov;
    struct uio auio;

    if (uap->nbyte > IOSIZE_MAX)
        return (EINVAL);

    /* One I/O vector covering the whole caller-supplied buffer. */
    aiov.iov_base = (void *)(uintptr_t)uap->buf;
    aiov.iov_len = uap->nbyte;

    /* The uio references that single vector; resid is the full length. */
    auio.uio_iov = &aiov;
    auio.uio_iovcnt = 1;
    auio.uio_resid = uap->nbyte;
    auio.uio_segflg = UIO_USERSPACE;

    return (kern_writev(td, uap->fd, &auio));
}

It is just a special form of writev(), as it eventually calls kern_writev(). Taking a look at kern_writev() reveals an interesting code pattern.

In sys/kern/sys_generic.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * Write the data described by auio to the file behind descriptor fd.
 *
 * fget_write() resolves fd to a struct file, requesting CAP_WRITE
 * rights; if it fails, its error is returned unchanged.  On success the
 * write is performed by dofilewrite() with an offset of (off_t)-1 and
 * no flags, and the file reference is released with fdrop() before
 * returning dofilewrite()'s result.
 */
int
kern_writev(struct thread *td, int fd, struct uio *auio)
{
    cap_rights_t rights;
    struct file *fp;
    int error;

    error = fget_write(td, fd, cap_rights_init(&rights, CAP_WRITE), &fp);
    if (error == 0) {
        error = dofilewrite(td, fd, fp, auio, (off_t)-1, 0);
        /* Pair the successful fget_write() with a release. */
        fdrop(fp, td);
    }
    return (error);
}

fget_write() is called to verify that we have permission to perform the write operation and to get a pointer to the file associated with the file descriptor. After this function call, the reference count of the file is increased. That is why fdrop() is called at the end: it decreases the reference count and performs some cleanup if necessary.

The write operation happens when dofilewrite() is called.

In sys/kern/sys_generic.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
/*
 * Excerpt of dofilewrite(); the parts not relevant to the walkthrough
 * are elided and marked with "...".
 *
 * The visible statement hands the write to fo_write(), passing the file,
 * the uio, the calling thread's credentials (td->td_ucred), the flags
 * and the thread itself.  A non-zero result from fo_write() enters the
 * (elided) error-handling branch.
 */
static int
dofilewrite(td, fd, fp, auio, offset, flags)
    struct thread *td;
    int fd;
    struct file *fp;
    struct uio *auio;
    off_t offset;
    int flags;
{
      ...
    /* Dispatch to the file-type-specific write routine. */
    if ((error = fo_write(fp, auio, td->td_ucred, flags, td))) {
        ...
    }
      ...
}

dofilewrite() performs some verification and dispatches the write operation to an appropriate function by calling fo_write(). The real write function depends on the type of the file.

In sys/sys/file.h
1
2
3
4
5
6
7
/*
 * Forward a write request to the fo_write handler stored in the file's
 * operations table (fp->f_ops).  All arguments are passed through
 * unchanged and the handler's return value is returned as-is.
 */
static __inline int
fo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
    return (fp->f_ops->fo_write(fp, uio, active_cred, flags, td));
}

For a socket, the file operations are defined as follows.

In sys/kern/sys_socket.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
/*
 * File operations table used for sockets.  Generic file-layer wrappers
 * such as fo_write() call through these pointers.
 */
struct fileops socketops = {
    /* Data transfer. */
    .fo_read     = soo_read,
    .fo_write    = soo_write,
    .fo_truncate = soo_truncate,

    /* Control and event notification. */
    .fo_ioctl    = soo_ioctl,
    .fo_poll     = soo_poll,
    .fo_kqfilter = soo_kqfilter,

    /* Status and teardown. */
    .fo_stat     = soo_stat,
    .fo_close    = soo_close,

    /*
     * NOTE(review): the invfo_* handlers are presumably generic
     * "operation not valid for this file type" stubs -- confirm.
     */
    .fo_chmod    = invfo_chmod,
    .fo_chown    = invfo_chown,
    .fo_sendfile = invfo_sendfile,

    .fo_flags    = DFLAG_PASSABLE,
};

That means the actual write function is soo_write().

In sys/kern/sys_socket.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
int
soo_write(struct file *fp, struct uio *uio, struct ucred *active_cred,
    int flags, struct thread *td)
{
    struct socket *so = fp->f_data;
    int error;

#ifdef MAC
    error = mac_socket_check_send(active_cred, so);
    if (error)
        return (error);
#endif
    error = sosend(so, 0, uio, 0, 0, 0, uio->uio_td);
    if (error == EPIPE && (so->so_options & SO_NOSIGPIPE) == 0) {
        PROC_LOCK(uio->uio_td->td_proc);
        tdsignal(uio->uio_td, SIGPIPE);
        PROC_UNLOCK(uio->uio_td->td_proc);
    }
    return (error);
}

Again, soo_write() performs some necessary verification and calls sosend().

Before we go further, let’s take a look at sys_sendto() to see how it differs from the normal write() system call.

In sys/kern/uipc_syscalls.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
/*
 * sendto(2) system call entry point.
 *
 * Packs the caller's buffer into a one-element iovec, builds a msghdr
 * around it with the destination address taken from uap->to/uap->tolen
 * and no control data, and passes the message to sendit().
 *
 * Returns whatever sendit() returns.
 */
int
sys_sendto(struct thread *td, struct sendto_args *uap)
{
    /*
     * uap carries:
     *     int s; caddr_t buf; size_t len; int flags;
     *     caddr_t to; int tolen;
     */
    struct iovec aiov;
    struct msghdr msg;

    /* Single I/O vector describing the caller's buffer. */
    aiov.iov_base = uap->buf;
    aiov.iov_len = uap->len;

    /* Message header: destination address plus that one vector. */
    msg.msg_name = uap->to;
    msg.msg_namelen = uap->tolen;
    msg.msg_iov = &aiov;
    msg.msg_iovlen = 1;
    msg.msg_control = 0;
#ifdef COMPAT_OLDSOCK
    msg.msg_flags = 0;
#endif

    return (sendit(td, uap->s, &msg, uap->flags));
}

sys_sendmsg() is similar to sys_sendto(). They just handle the arguments differently. They both call sendit() at the end. sendit() performs some checks and calls kern_sendit(), which eventually calls sosend().

In sys/kern/uipc_syscalls.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Excerpt of sendit(); elided parts are marked with "...".
 *
 * The visible statement forwards the message to kern_sendit() together
 * with the control data and UIO_USERSPACE as the segment flag.
 */
static int
sendit(td, s, mp, flags)
    struct thread *td;
    int s;
    struct msghdr *mp;
    int flags;
{
    ...
    error = kern_sendit(td, s, mp, flags, control, UIO_USERSPACE);
    ...
}

/*
 * Excerpt of kern_sendit(); elided parts are marked with "...".
 *
 * The visible statement calls sosend() with the destination address
 * from mp->msg_name, the uio built in the elided code (auio), no
 * pre-built mbuf chain (0), the control mbuf, the send flags and the
 * calling thread.
 */
int
kern_sendit(td, s, mp, flags, control, segflg)
    struct thread *td;
    int s;
    struct msghdr *mp;
    int flags;
    struct mbuf *control;
    enum uio_seg segflg;
{
    ...
    error = sosend(so, mp->msg_name, &auio, 0, control, flags, td);
    ...
}

sosend() is basically a wrapper around the function pointed to by pru_sosend.

In sys/kern/uipc_socket.c
1
2
3
4
5
6
7
8
9
10
11
12
int
sosend(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
    int error;

    CURVNET_SET(so->so_vnet);
    error = so->so_proto->pr_usrreqs->pru_sosend(so, addr, uio, top,
        control, flags, td);
    CURVNET_RESTORE();
    return (error);
}

At this point, the actual function depends on the protocol type. In the case of TCP, pru_sosend points to sosend_generic() (this is the default value; UDP has its own sosend_dgram()).

In sys/kern/uipc_socket.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
/*
 * Excerpt of sosend_generic(); elided parts are marked with "...".
 *
 * The visible tail shows the hand-off to the protocol: the pru_send
 * handler from so->so_proto->pr_usrreqs is invoked, with PRUS_OOB
 * selected as the flag when MSG_OOB is set in flags (the remaining
 * flag choices and arguments are elided).
 */
int
sosend_generic(struct socket *so, struct sockaddr *addr, struct uio *uio,
    struct mbuf *top, struct mbuf *control, int flags, struct thread *td)
{
    long space;
    ssize_t resid;
    int clen = 0, error, dontroute;
    int atomic = sosendallatonce(so) || top;

    ...
            /*
             * XXX all the SBS_CANTSENDMORE checks previously
             * done could be out of date.  We could have received
             * a reset packet in an interrupt or maybe we slept
             * while doing page faults in uiomove() etc.  We
             * could probably recheck again inside the locking
             * protection here, but there are probably other
             * places that this also happens.  We must rethink
             * this.
             */
            VNET_SO_ASSERT(so);
            error = (*so->so_proto->pr_usrreqs->pru_send)(so,
                (flags & MSG_OOB) ? PRUS_OOB :
     ...
}

At some point in sosend_generic(), the pru_send function pointer is called. For TCP, this points to the function tcp_usr_send().

In sys/netinet/tcp_usrreq.c
1
2
3
4
5
6
7
8
9
10
11
12
/*
 * Excerpt of tcp_usr_send(); elided parts are marked with "...".
 *
 * The visible statements set tp->snd_up to snd_una plus the number of
 * bytes in the socket send buffer (so->so_snd.sb_cc), then call
 * tcp_output() with TF_FORCEDATA set in tp->t_flags, clearing the flag
 * again afterwards.
 *
 * NOTE(review): snd_up and TF_FORCEDATA suggest this is the urgent /
 * out-of-band branch; the enclosing condition is elided here, so
 * confirm against the full source.
 */
static int
tcp_usr_send(struct socket *so, int flags, struct mbuf *m,
    struct sockaddr *nam, struct mbuf *control, struct thread *td)
{
    ...
        tp->snd_up = tp->snd_una + so->so_snd.sb_cc;
        /* TF_FORCEDATA is set only for the duration of tcp_output(). */
        tp->t_flags |= TF_FORCEDATA;
        error = tcp_output(tp);
        tp->t_flags &= ~TF_FORCEDATA;
    }
    ...
}

The data is passed to tcp_output(), which figures out what needs to be sent and sends it to the lower layer through ip_output().

In sys/netinet/tcp_output.c
1
2
3
4
5
6
7
8
9
10
11
/*
 * Excerpt of tcp_output(); elided parts are marked with "...".
 *
 * The visible statements fire the TCP send probe and pass the mbuf to
 * ip_output() along with the inpcb's IP options and the route.  The
 * IP_ROUTETOIF flag is passed when SO_DONTROUTE is set on the socket,
 * otherwise no flags are passed.
 */
int
tcp_output(struct tcpcb *tp)
{
    ...
        TCP_PROBE5(send, NULL, tp, ip, tp, th);

        error = ip_output(m, tp->t_inpcb->inp_options, &ro,
            ((so->so_options & SO_DONTROUTE) ? IP_ROUTETOIF : 0), 0,
            tp->t_inpcb);
    ...
}

Eventually, the data in the mbuf will be passed to the device driver through the ifp->if_output function pointer.

In sys/netinet/ip_output.c
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
/*
 * Excerpt of ip_output(); elided parts are marked with "...".
 *
 * The visible statements clear the protocol-specific mbuf flags, fire
 * the IP send probe, and pass the mbuf to the outgoing interface's
 * if_output handler with the gateway address (gw) and the route.
 * Control then jumps to the "done" label, which is not shown here.
 */
int
ip_output(struct mbuf *m, struct mbuf *opt, struct route *ro, int flags,
    struct ip_moptions *imo, struct inpcb *inp)
{
    ...
        /*
         * Reset layer specific mbuf flags
         * to avoid confusing lower layers.
         */
        m_clrprotoflags(m);
        IP_PROBE(send, NULL, NULL, ip, ifp, ip, NULL);
        error = (*ifp->if_output)(ifp, m,
            (const struct sockaddr *)gw, ro);
        goto done;
    ...
}

In conclusion, sending data across a network is not a trivial task. The data from the user application passes through a series of layers, and each layer has its own responsibility.