From e1603b6effe177210701d3d7132d1b68e7bd2c93 Mon Sep 17 00:00:00 2001
From: Kirill Tkhai
Date: Fri, 9 Feb 2018 18:04:54 +0300
Subject: inotify: Extend ioctl to allow requesting the id of a new watch
 descriptor

A watch descriptor is the id of a watch created by inotify_add_watch().
It is allocated in inotify_add_to_idr(), and the numbers start from 1.
Every new inotify watch obtains the next available number (usually,
old + 1), as served by idr_alloc_cyclic().

The CRIU (Checkpoint/Restore In Userspace) project supports inotify
files and restores watch descriptors with the same numbers they had
before the dump. Since there was no kernel support, we had to use a
loop to add a watch with a specific descriptor id:

	while (1) {
		int wd;

		wd = inotify_add_watch(inotify_fd, path, mask);
		if (wd < 0) {
			break;
		} else if (wd == desired_wd_id) {
			ret = 0;
			break;
		}

		inotify_rm_watch(inotify_fd, wd);
	}

(You may find the actual code at the link below:
https://github.com/checkpoint-restore/criu/blob/v3.7/criu/fsnotify.c#L577)

This loop is suboptimal and very expensive, but since there was no
better kernel support, it was the only way to restore the descriptor.
Happily, we mostly met descriptors with small ids, and this approach
worked somehow. But recently, containers with big inotify watch
descriptors have begun to appear, and this approach stopped working
altogether. When a descriptor id is something like 0x34d71d6, the
restoring process spins in the busy loop for a long time, the restore
hangs, and the resulting delay in migration from node to node is easy
to observe.

This patch aims to solve this problem. It introduces a new ioctl,
INOTIFY_IOC_SETNEXTWD, which allows userspace to request the id of the
next created watch descriptor. It simply calls the idr_set_cursor()
primitive to populate idr::idr_next, so that the next
idr_alloc_cyclic() allocation returns this id, if it is not occupied.
This is the approach already used to restore some other resources from
userspace. For example, /proc/sys/kernel/ns_last_pid works the same
way for task pids.

The new code is under the CONFIG_CHECKPOINT_RESTORE #ifdef, so small
systems may exclude it.

v2: Use INT_MAX instead of a custom definition of the maximum id, as
the IDR subsystem guarantees ids are between 0 and INT_MAX.

CC: Jan Kara
CC: Matthew Wilcox
CC: Andrew Morton
CC: Amir Goldstein
Signed-off-by: Kirill Tkhai
Reviewed-by: Cyrill Gorcunov
Reviewed-by: Matthew Wilcox
Reviewed-by: Andrew Morton
Signed-off-by: Jan Kara
---
 include/uapi/linux/inotify.h | 8 ++++++++
 1 file changed, 8 insertions(+)

(limited to 'include')

diff --git a/include/uapi/linux/inotify.h b/include/uapi/linux/inotify.h
index 5474461683db..4800bf2a531d 100644
--- a/include/uapi/linux/inotify.h
+++ b/include/uapi/linux/inotify.h
@@ -71,5 +71,13 @@ struct inotify_event {
 #define IN_CLOEXEC O_CLOEXEC
 #define IN_NONBLOCK O_NONBLOCK
 
+/*
+ * ioctl numbers: inotify uses 'I' prefix for all ioctls,
+ * except historical FIONREAD, which is based on 'T'.
+ *
+ * INOTIFY_IOC_SETNEXTWD: set desired number of next created
+ * watch descriptor.
+ */
+#define INOTIFY_IOC_SETNEXTWD	_IOW('I', 0, __s32)
 
 #endif /* _UAPI_LINUX_INOTIFY_H */
--
cgit v1.2.3
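
For illustration, here is a minimal userspace sketch of the intended
use, replacing the busy loop above: request a specific descriptor id
via the new ioctl, then add the watch. It assumes a kernel carrying
this patch with CONFIG_CHECKPOINT_RESTORE enabled; the watched path
and the desired id are made-up values, and the fallback #define
mirrors the uapi header for builds against older headers.

	#include <stdint.h>
	#include <stdio.h>
	#include <sys/ioctl.h>
	#include <sys/inotify.h>
	#include <linux/ioctl.h>

	#ifndef INOTIFY_IOC_SETNEXTWD
	#define INOTIFY_IOC_SETNEXTWD	_IOW('I', 0, int32_t)
	#endif

	int main(void)
	{
		int32_t desired_wd = 0x34d71d6;	/* id saved at dump time */
		int inotify_fd, wd;

		inotify_fd = inotify_init1(IN_CLOEXEC);
		if (inotify_fd < 0) {
			perror("inotify_init1");
			return 1;
		}

		/* Ask the kernel to hand out desired_wd next, if free. */
		if (ioctl(inotify_fd, INOTIFY_IOC_SETNEXTWD, desired_wd) < 0) {
			perror("INOTIFY_IOC_SETNEXTWD");	/* e.g. kernel without this patch */
			return 1;
		}

		wd = inotify_add_watch(inotify_fd, "/tmp", IN_CREATE | IN_DELETE);
		if (wd != desired_wd)
			fprintf(stderr, "got wd %d, wanted %d\n", wd, desired_wd);
		return 0;
	}

Here the id is passed as the ioctl argument value itself rather than
through a pointer; that matches how CRIU calls this interface, but
verify against the full patch, since the hunk shown here covers only
the uapi header.
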
From 7b1f641776e0c8b824fb10135168e4b683a9e2ba Mon Sep 17 00:00:00 2001
From: Jan Kara
Date: Wed, 21 Feb 2018 15:07:52 +0100
Subject: fsnotify: Let userspace know about lost events due to ENOMEM

Currently, if a notification event is lost because event allocation
failed with ENOMEM, we just silently continue (except for fanotify
permission events, where we deny the access). This is undesirable, as
userspace has no way of knowing whether the notifications it got are
complete or not. Treat events lost due to ENOMEM the same way as
events lost due to queue overflow, so that userspace knows something
bad happened and that it likely needs to rescan the filesystem.

Reviewed-by: Amir Goldstein
Signed-off-by: Jan Kara
---
 fs/notify/fanotify/fanotify.c        | 9 ++++++++-
 fs/notify/inotify/inotify_fsnotify.c | 8 +++++++-
 fs/notify/notification.c             | 3 ++-
 include/linux/fsnotify_backend.h     | 6 ++++++
 4 files changed, 23 insertions(+), 3 deletions(-)

(limited to 'include')

diff --git a/fs/notify/fanotify/fanotify.c b/fs/notify/fanotify/fanotify.c
index 928f2a5eedb7..d51e1bb781cf 100644
--- a/fs/notify/fanotify/fanotify.c
+++ b/fs/notify/fanotify/fanotify.c
@@ -221,8 +221,15 @@ static int fanotify_handle_event(struct fsnotify_group *group,
 
 	event = fanotify_alloc_event(group, inode, mask, data);
 	ret = -ENOMEM;
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * We don't queue overflow events for permission events as
+		 * there the access is denied and so no event is in fact lost.
+		 */
+		if (!fanotify_is_perm_event(mask))
+			fsnotify_queue_overflow(group);
 		goto finish;
+	}
 
 	fsn_event = &event->fse;
 	ret = fsnotify_add_event(group, fsn_event, fanotify_merge);
diff --git a/fs/notify/inotify/inotify_fsnotify.c b/fs/notify/inotify/inotify_fsnotify.c
index 8b73332735ba..40dedb37a1f3 100644
--- a/fs/notify/inotify/inotify_fsnotify.c
+++ b/fs/notify/inotify/inotify_fsnotify.c
@@ -99,8 +99,14 @@ int inotify_handle_event(struct fsnotify_group *group,
 			      fsn_mark);
 
 	event = kmalloc(alloc_len, GFP_KERNEL);
-	if (unlikely(!event))
+	if (unlikely(!event)) {
+		/*
+		 * Treat lost event due to ENOMEM the same way as queue
+		 * overflow to let userspace know event was lost.
+		 */
+		fsnotify_queue_overflow(group);
 		return -ENOMEM;
+	}
 
 	fsn_event = &event->fse;
 	fsnotify_init_event(fsn_event, inode, mask);
diff --git a/fs/notify/notification.c b/fs/notify/notification.c
index 66f85c651c52..3c3e36745f59 100644
--- a/fs/notify/notification.c
+++ b/fs/notify/notification.c
@@ -111,7 +111,8 @@ int fsnotify_add_event(struct fsnotify_group *group,
 		return 2;
 	}
 
-	if (group->q_len >= group->max_events) {
+	if (event == group->overflow_event ||
+	    group->q_len >= group->max_events) {
 		ret = 2;
 		/* Queue overflow event only if it isn't already queued */
 		if (!list_empty(&group->overflow_event->list)) {
diff --git a/include/linux/fsnotify_backend.h b/include/linux/fsnotify_backend.h
index 067d52e95f02..9f1edb92c97e 100644
--- a/include/linux/fsnotify_backend.h
+++ b/include/linux/fsnotify_backend.h
@@ -331,6 +331,12 @@ extern int fsnotify_add_event(struct fsnotify_group *group,
 			      struct fsnotify_event *event,
 			      int (*merge)(struct list_head *,
 					   struct fsnotify_event *));
+/* Queue overflow event to a notification group */
+static inline void fsnotify_queue_overflow(struct fsnotify_group *group)
+{
+	fsnotify_add_event(group, group->overflow_event, NULL);
+}
+
 /* true if the group notification queue is empty */
 extern bool fsnotify_notify_queue_is_empty(struct fsnotify_group *group);
 /* return, but do not dequeue the first event on the notification queue */
--
cgit v1.2.3
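
On the consumer side nothing new is needed: lost events, whether from
queue overflow or, with this patch, from ENOMEM, surface as an
IN_Q_OVERFLOW event. A minimal sketch of a reader that reacts to it,
assuming a standard inotify read loop; the watched path and the rescan
action are placeholders:

	#include <stdio.h>
	#include <unistd.h>
	#include <sys/inotify.h>

	int main(void)
	{
		char buf[4096]
			__attribute__((aligned(__alignof__(struct inotify_event))));
		int fd = inotify_init1(IN_CLOEXEC);

		if (fd < 0 || inotify_add_watch(fd, "/tmp", IN_ALL_EVENTS) < 0) {
			perror("inotify setup");
			return 1;
		}

		for (;;) {
			ssize_t len = read(fd, buf, sizeof(buf));
			char *p;

			if (len <= 0)
				break;
			for (p = buf; p < buf + len; ) {
				struct inotify_event *ev = (struct inotify_event *)p;

				if (ev->mask & IN_Q_OVERFLOW) {
					/*
					 * Events were dropped (queue full or,
					 * after this patch, ENOMEM): our view
					 * is stale, so rescan the filesystem.
					 */
					fprintf(stderr, "events lost, rescan needed\n");
				}
				p += sizeof(*ev) + ev->len;
			}
		}
		return 0;
	}
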
From b91ed9d8082c394dda63f94f935219cd0a565938 Mon Sep 17 00:00:00 2001
From: Ritesh Harjani
Date: Fri, 16 Mar 2018 19:13:02 +0530
Subject: quota: Kill an unused extern entry from quota.h

Kill an unused extern entry from quota.h which is a leftover of the
patch below:

[f32764bd2: quota: Convert quota statistics to generic percpu_counter]

Signed-off-by: Ritesh Harjani
Signed-off-by: Jan Kara
---
 include/linux/quota.h | 1 -
 1 file changed, 1 deletion(-)

(limited to 'include')

diff --git a/include/linux/quota.h b/include/linux/quota.h
index 5ac9de4fcd6f..ca9772c8e48b 100644
--- a/include/linux/quota.h
+++ b/include/linux/quota.h
@@ -267,7 +267,6 @@ struct dqstats {
 	struct percpu_counter counter[_DQST_DQSTAT_LAST];
 };
 
-extern struct dqstats *dqstats_pcpu;
 extern struct dqstats dqstats;
 
 static inline void dqstats_inc(unsigned int type)
--
cgit v1.2.3