[PATCH] allow send/recv(MSG_DONTWAIT) on non-sockets (was Re: O_NONBLOCK is broken)



On Tuesday 14 August 2007 13:33, Alan Cox wrote:
b) Make recv(fd, buf, size, flags) and send(fd, buf, size, flags);
work with non-socket fds too, for flags==0 or flags==MSG_DONTWAIT.
(it's ok to fail with "socket op on non-socket fd" for other values
of flags)

I think that makes a lot of sense, and to be honest other MSG_ flags make
useful sense and have meaningful semantics that might be helpful
elsewhere if ever coded that way.

Yes, that's my feeling too.

If you want to do this the first job is going to be to sort out the way
non-block is propogated to device driver read/write handlers. At the
moment they all check filp->f_flags

...which happens in ~250 files. I'd rather not touch that much
of code, if possible.

Attached patch detects send/recv(fd, buf, size, MSG_DONTWAIT) on
non-sockets and turns them into non-blocking write/read.
Since filp->f_flags appear to be read and modified without any locking,
I cannot modify it without potentially affecting other processes
accessing the same file through shared struct file.

Therefore I simply make a temporary copy of struct file, set
O_NONBLOCK in it and pass it to vfs_read/write.
Is this heresy? ;) I see only one spinlock in struct file:

#ifdef CONFIG_EPOLL
spinlock_t f_ep_lock;
#endif /* #ifdef CONFIG_EPOLL */

Do I need to take it?

Also attached is ndelaytest.c which can be used to test that
send(MSG_DONTWAIT) indeed is failing with EAGAIN if write would block
and that other processes never see O_NONBLOCK set.

Comments?
--
vda
--- linux-2.6.22-rc6.src/fs/read_write.c Fri Jun 15 19:30:05 2007
+++ linux-2.6.22-rc6_ndelay/fs/read_write.c Sun Aug 19 10:43:24 2007
@@ -15,6 +15,7 @@
#include <linux/module.h>
#include <linux/syscalls.h>
#include <linux/pagemap.h>
+#include <linux/socket.h>
#include "read_write.h"

#include <asm/uaccess.h>
@@ -351,6 +352,36 @@
static inline void file_pos_write(struct file *file, loff_t pos)
{
file->f_pos = pos;
+}
+
+/* Helper for send/recv on non-sockets */
+ssize_t rw_with_flags(struct file *file, int fput_needed, void __user *buf, size_t count, unsigned flags)
+{
+ int err;
+ loff_t pos;
+ struct file *file_copy;
+
+ file_copy = file;
+ if (flags & MSG_DONTWAIT) {
+ /* We make copy even if O_NONBLOCK is already set. */
+ /* We don't want it to change under our feet. */
+ file_copy = kmalloc(sizeof(*file_copy), GFP_KERNEL);
+ memcpy(file_copy, file, sizeof(*file_copy));
+ file_copy->f_flags |= O_NONBLOCK;
+ }
+
+ pos = file_pos_read(file);
+ if (flags & MSG_OOB) /* MSG_OOB is reused to mean 'write' */
+ err = vfs_write(file_copy, buf, count, &pos);
+ else
+ err = vfs_read(file_copy, buf, count, &pos);
+ file_pos_write(file, pos);
+
+ if (flags & MSG_DONTWAIT) {
+ kfree(file_copy);
+ }
+ fput_light(file, fput_needed);
+ return err;
}

asmlinkage ssize_t sys_read(unsigned int fd, char __user * buf, size_t count)
--- linux-2.6.22-rc6.src/include/linux/fs.h Wed Jun 27 21:24:18 2007
+++ linux-2.6.22-rc6_ndelay/include/linux/fs.h Sun Aug 19 10:32:20 2007
@@ -1154,6 +1154,9 @@
extern ssize_t vfs_writev(struct file *, const struct iovec __user *,
unsigned long, loff_t *);

+extern ssize_t rw_with_flags(struct file *, int, void __user *, size_t,
+ unsigned);
+
/*
* NOTE: write_inode, delete_inode, clear_inode, put_inode can be called
* without the big kernel lock held in all filesystems.
--- linux-2.6.22-rc6.src/net/socket.c Fri Jun 15 19:30:08 2007
+++ linux-2.6.22-rc6_ndelay/net/socket.c Sun Aug 19 11:34:07 2007
@@ -1585,8 +1585,17 @@
goto out;

sock = sock_from_file(sock_file, &err);
- if (!sock)
- goto out_put;
+ if (!sock) {
+ if (addr)
+ goto out_put;
+ if (flags & ~MSG_DONTWAIT)
+ goto out_put;
+ /* it's not a socket, but we support a special case:
+ * send(fd, buf, count, MSG_DONTWAIT)
+ * (MSG_OOB is reused to mean 'write') */
+ return rw_with_flags(sock_file, fput_needed, buff, len, flags | MSG_OOB);
+ }
+
iov.iov_base = buff;
iov.iov_len = len;
msg.msg_name = NULL;
@@ -1646,8 +1655,15 @@
goto out;

sock = sock_from_file(sock_file, &err);
- if (!sock)
- goto out_put;
+ if (!sock) {
+ if (addr)
+ goto out_put;
+ if (flags & ~MSG_DONTWAIT)
+ goto out_put;
+ /* it's not a socket, but we support a special case:
+ * recv(fd, ubuf, size, MSG_DONTWAIT) */
+ return rw_with_flags(sock_file, fput_needed, ubuf, size, flags);
+ }

msg.msg_control = NULL;
msg.msg_controllen = 0;
#include <sys/types.h>
#include <sys/socket.h>
#include <errno.h>
#include <stdio.h>
#include <unistd.h>
#include <fcntl.h>
#include <time.h>
#include <signal.h>

#define SECONDS 10

#define STR "."
//#define STR "123456789 123456789 123456789 123456789 "

/* To see send() resulting in EAGAIN:
* strace -ff -o log ndelaytest | while sleep 11; do break; done
* log.$PID:
* send(1, "123456789 123456789 123456789 12"..., 40, MSG_DONTWAIT)
* = -1 EAGAIN (Resource temporarily unavailable)
*/

int main()
{
pid_t pid;
time_t t;
int fl;

puts("starting");
t = time(0);

pid = fork();
if (pid == 0) {
/* child */
while ((time(0) - t) < SECONDS-1) {
#if 0
/* Uncomment this part and simply run the executable
* to see race detection code in action */
#define OP "write"
fcntl(1, F_SETFL, fcntl(1, F_GETFL) | O_NONBLOCK);
fl = write(1, STR, sizeof(STR) - 1);
fcntl(1, F_SETFL, fcntl(1, F_GETFL) & ~O_NONBLOCK);
#else
/* This part tests whether send(MSG_DONTWAIT)
* is racy or not */
#define OP "send"
fl = send(1, STR, sizeof(STR) - 1, MSG_DONTWAIT);
#endif
if (fl < 0) {
perror(OP);
kill(getppid(), SIGKILL);
return 1;
}
}
return 0;
}

while ((time(0) - t) < SECONDS) {
fl = fcntl(1, F_GETFL);
if (fl & O_NONBLOCK) {
fprintf(stderr, "NONBLOCK:1\n");
kill(pid, SIGKILL);
fcntl(1, F_SETFL, fl & ~O_NONBLOCK);
return 1;
}
}
fprintf(stderr, "NONBLOCK:0\n");
return 0;
}


Relevant Pages

  • [PATCH 06/21] mm: vm_stat_account unshackled
    ... +static inline void vm_stat_account(struct mm_struct *mm, ... unsigned long flags, struct file *file, long pages) ... +void vm_stat_account(struct mm_struct *mm, unsigned long flags, ... send the line "unsubscribe linux-kernel" in ...
    (Linux-Kernel)
  • Re: [PATCH 1/1] signal: make group kill signal fatal
    ... we always add SIGKILL to any thread. ... static int release(struct inode *ino, struct file *file) ... poll() never returns to the user-space. ...
    (Linux-Kernel)